### Using various methods to determine scores for different regressors
1. Find and deal with missing values
2. Deal with categorical data if available
3. Establish pipelines and transformers
4. Establish functions to get mean absolute error
5. The techniques to be used are : 
	1. Linear Regression
	2. Decision Tree Regression
	3. Random Forest Regression
	4. Using Cross-Validation
	5. Gradient Boosting
	6. Learn SVM
	7. Use clustering by k-means and then use gradient boosting

### In this project, we'll be using the models to determine or predict the track popularity given the other features in the dataset

In [50]:
#Input data from csv file into a dataframe
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

# Read the raw Spotify track data (path is relative to the notebook) and preview the first rows
df = pd.read_csv(filepath_or_buffer="spotify_songs.csv")
df.head(5)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


#### The first thing we do is inspect the dataframe in order to identify the features that can be included in the process

In [51]:
# Inspect each column's dtype to see which features are numeric and which are object-typed
df.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
dtype: object

In [52]:
# Quick sanity checks on a handful of columns before choosing features
print(type(df["track_album_release_date"][0]))  # Python type of a single release-date value
print(df["energy"].max())
print(df["energy"].min())
print(df["mode"].value_counts())
print("\n")
print(df["track_artist"].value_counts())
print("\n")
print(df["playlist_name"].value_counts().head(10))
print(df["track_album_release_date"].max())

<class 'str'>
1.0
0.000175
1    18574
0    14259
Name: mode, dtype: int64


Martin Garrix       161
Queen               136
The Chainsmokers    123
David Guetta        110
Don Omar            102
                   ... 
Underworld            1
The Witches           1
Tess Parks            1
Mick Harvey           1
Mat Zo                1
Name: track_artist, Length: 10692, dtype: int64


Indie Poptimism                                                                                  308
2020 Hits & 2019  Hits â€“ Top Global Tracks ðŸ”¥ðŸ”¥ðŸ”¥                                                   247
Permanent Wave                                                                                   244
Hard Rock Workout                                                                                219
Ultimate Indie Presents... Best Indie Tracks of the 2010s                                        198
Fitness Workout Electro | House | Dance | Progressive House                                   

In [53]:
# Drop the identifier-like columns (track_id, track_name, track_album_id, playlist_id),
# track_album_name (it would mostly just reinforce the artist and mess up the predictions)
# and track_album_release_date: none of them is used as a predictor of track_popularity.
dropped_features = ['track_id', 'track_name','track_album_name', 'track_album_id', 'playlist_id', 'track_album_release_date']

# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=dropped_features, errors='ignore')

In [54]:
# Preview the frame again to confirm which columns remain after the drop
df.head()
# df.dtypes

Unnamed: 0,track_artist,track_popularity,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Ed Sheeran,66,Pop Remix,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,Maroon 5,67,Pop Remix,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,Zara Larsson,70,Pop Remix,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,The Chainsmokers,60,Pop Remix,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,Lewis Capaldi,69,Pop Remix,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [55]:
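# We can either drop every row that contains a missing value (the dataset is big enough to afford it).
# Note: how='any' drops a row if at least one value is missing; how='all' would only drop rows that are entirely empty.
# df.dropna(how='any', axis=0)
# df.head()

# Or use an imputer to fill in the missing values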

### Classification of features
#### Categorical features : 
##### One hot encoding:
track_artist, playlist_name, playlist_genre, playlist_subgenre
#### Numerical features:
danceability                
energy                    
key                           
loudness                
mode                          
speechiness             
acousticness              
instrumentalness        
liveness              
valence            
tempo     
duration_ms                   

In [56]:
# Segregate the columns into categorical (object dtype) and numerical features.
X_cat_names = df.select_dtypes(include='object').columns.to_list()

# 'number' matches every numeric dtype regardless of the platform's default int width,
# and the target is excluded by name rather than by its position in the frame.
X_num_names = [
    col for col in df.select_dtypes(include='number').columns
    if col != 'track_popularity'
]
X_num_names

['danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_ms']

In [57]:
# Build the target vector, failing loudly if the target column is missing
from sklearn.model_selection import train_test_split
if 'track_popularity' in df.columns:
    y = df['track_popularity']
else:
    raise ValueError("Column 'track_popularity' does not exist in the dataframe.")

# Feature matrix: categorical columns first, then numerical ones
workingcols = X_cat_names+X_num_names
X = df[workingcols]

# A fixed random_state makes the 80/20 split, and every score computed from it,
# identical on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=10)

In [58]:
# Preprocessing: impute missing values, then one-hot encode the categorical columns
from sklearn import compose, pipeline, impute, preprocessing

# Imputers: numerical gaps get a constant fill, categorical gaps get the most frequent value
numerical_imputer = impute.SimpleImputer(strategy='constant')
categorical_imputer = impute.SimpleImputer(strategy='most_frequent')
# handle_unknown='ignore' so categories unseen during fit do not raise at transform time
one_hot_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

# Categorical branch: impute first, then encode
categorical_transformer_object = pipeline.Pipeline(
    steps=[
        ('catimp', categorical_imputer),
        ('onehot', one_hot_encoder),
    ]
)

# Full preprocessing unit: route each column group to its own transformer
preprocessor = compose.ColumnTransformer(
    transformers=[
        ('num', numerical_imputer, X_num_names),
        ('cat', categorical_transformer_object, X_cat_names),
    ]
)

## Evaluating Different Models and Techniques


In [63]:
#Before evaluating different models, we first need to make a function that gives us the mean absolute error
from sklearn import metrics
def get_mae(pipeline, X_train, y_train, y_valid, X_test):
    """Fit ``pipeline`` on the training data and score it on held-out data.

    Parameters
    ----------
    pipeline : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : features and targets used for fitting.
    y_valid : true targets corresponding to ``X_test``.
    X_test : features the fitted pipeline predicts on.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the predictions for ``X_test``.
    """
    pipeline.fit(X_train, y_train)
    return metrics.mean_absolute_error(y_valid, pipeline.predict(X_test))

def get_rms(y_vals, y_preds):
    """Return the root-mean-squared error between true and predicted values.

    Parameters
    ----------
    y_vals : true target values.
    y_preds : predicted target values, aligned with ``y_vals``.

    Returns
    -------
    The square root of the mean squared error.
    """
    mse = metrics.mean_squared_error(y_vals, y_preds)
    # mean_squared_error alone is the *squared* error; take the root so the
    # result matches the function's name.
    return mse ** 0.5

### Decision Tree Regressor

In [65]:
from sklearn.tree import DecisionTreeRegressor

# Baseline model: a single decision tree with a fixed random_state
desctree_model = DecisionTreeRegressor(random_state=10)

# Chain the shared preprocessing with the model so both are fitted together
desc_pipeline = pipeline.Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('model', desctree_model),
    ]
)

print(
    get_mae(
        pipeline=desc_pipeline,
        X_train=X_train,
        y_train=y_train,
        y_valid=y_test,
        X_test=X_test,
    )
)

19.660118775696667


## Random Forest Regressor

In [67]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed as the decision tree above) so its MAE is reproducible
randomtree_model = RandomForestRegressor(random_state=10)
randtree_pipeline = pipeline.Pipeline(steps=[('preprocess', preprocessor), ('model', randomtree_model)])

print(get_mae(randtree_pipeline, X_train, y_train, y_test, X_test))


15.363028738570195


## 