In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm as tqdm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import time 
import math
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

In [124]:
data = pd.read_csv('./datasets/data.csv')

In [125]:
df_new = pd.DataFrame(data)


In [128]:
def prepros(df_new):
    df_new = df_new.select_dtypes(np.number)
    features= [col for col in df_new.columns if col not in ['popularity']]
    X = df_new[features]
    y = df_new['popularity']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.1)
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    
    return X_train, X_test, y_train, y_test, features

In [129]:
X_train, X_test, y_train, y_test, features = prepros(df_new)

## Section 1: Features Selection

In [130]:
high_score = 0
record=[]
nof=0 
score_list =[]
for i in range(1,14):
    model = LinearRegression()
    rfe = RFE(model,n_features_to_select = i)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    record.append(list(zip(features,rfe.support_,rfe.ranking_)))
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = i
        
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 11
Score with 11 features: 0.758111


In [133]:
df_record = pd.DataFrame([[tup[2] for tup in rec] for rec in record], columns = features, index=list(range(1,14))).T
df_record

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
valence,9,8,7,6,5,4,3,2,1,1,1,1,1
year,1,1,1,1,1,1,1,1,1,1,1,1,1
acousticness,3,2,1,1,1,1,1,1,1,1,1,1,1
danceability,5,4,3,2,1,1,1,1,1,1,1,1,1
duration_ms,13,12,11,10,9,8,7,6,5,4,3,2,1
energy,7,6,5,4,3,2,1,1,1,1,1,1,1
explicit,8,7,6,5,4,3,2,1,1,1,1,1,1
instrumentalness,2,1,1,1,1,1,1,1,1,1,1,1,1
key,14,13,12,11,10,9,8,7,6,5,4,3,2
liveness,6,5,4,3,2,1,1,1,1,1,1,1,1


In [143]:
df_select = df_new.drop(columns =['key','duration_ms'])

In [144]:
X_train, X_test, y_train, y_test, features = prepros(df_select)

In [119]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)
log.score(X_test, y_test)

0.45675612328606585

In [146]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.7581001273029934

In [117]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

-0.5027085917772312

In [102]:
rf_model = RandomForestClassifier(max_depth = 20, max_features = 5, min_samples_split = 8, n_estimators = 1200)
rf_model.fit(X_train, y_train)
rf_model.score(X_test, y_test)
# y_rf_pred = rf_model.predict(X_test)
# display(metrics.accuracy_score(y_test, y_rf_pred))

1.0

In [148]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.8075107461309942

In [166]:
ada_model = AdaBoostRegressor()
ada_model.fit(X_train, y_train)
ada_model.score(X_test, y_test)

0.6908683029795466

## Selection 2: Applying Regression Models

In [167]:
steps =  [ #list of pipeline steps for each model  
    [('lr', LinearRegression())],
    [('rfr',RandomForestRegressor(max_depth=20, n_estimators= 300, random_state=0))],
    [('dtr',DecisionTreeRegressor(random_state=0))],
    [('abr',AdaBoostRegressor(random_state=1))]
    ]



pipe_titles = ['lr', 'rfr', 'dtr', 'abr']

grid_results = pd.DataFrame(columns=['step','train_accuracy', 'test_accuracy', 'r2_score', 
                                     'mean_squared_error','mean_absolute_error'])

In [168]:
for i in (range(len(steps))):          
    pipe = Pipeline(steps=steps[i])         # configure pipeline for each model
    grid = pipe
    model_results = {}
    
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    
    print('Step: ',pipe_titles[i])
    model_results['step'] = pipe_titles[i]


    #Score the model on training set
    print(f"training score: {grid.score(X_train, y_train)}", '\n')
    model_results['train_accuracy'] = grid.score(X_train, y_train)
    
    #Score the model on test set
    print(f"test score: {grid.score(X_test, y_test)}", '\n')
    model_results['test_accuracy'] = grid.score(X_test, y_test)
    
    #R2 score the model on test set
    print(f"R2 score: {r2_score(y_test, y_pred)}", '\n')
    model_results['r2_score'] = r2_score(y_test, y_pred)
    
    #mean_squared_error the model on test set
    print(f"mean_squared_error: {mean_squared_error(y_test, y_pred)}", '\n')
    model_results['mean_squared_error'] = mean_squared_error(y_test, y_pred)
    
    #mean_absolute_error the model on test set
    print(f"mean_absolute_error: {mean_absolute_error(y_test, y_pred)}", '\n')
    model_results['mean_absolute_error'] = mean_absolute_error(y_test, y_pred)


#     # Display the confusion matrix results showing true/false positive/negative
#     tn, fp, fn, tp = confusion_matrix(y_test, grid.predict(X_test)).ravel() 

#     print("True Negatives: %s" % tn)
#     model_results['tn'] = tn

#     print("False Positives: %s" % fp)  
#     model_results['fp'] = fp

#     print("False Negatives: %s" % fn)
#     model_results['fn'] = fn

#     print("True Positives: %s" % tp, '\n')
#     model_results['tp'] = tp

    grid_results = grid_results.append(model_results, ignore_index=True)

Step:  lr
training score: 0.7530549771920028 

test score: 0.7581001273029934 

R2 score: 0.7581001273029934 

mean_squared_error: 115.63773446140532 

mean_absolute_error: 7.982060489455835 

Step:  rfr
training score: 0.9299693660798957 

test score: 0.8118101275743208 

R2 score: 0.8118101275743208 

mean_squared_error: 89.96222384599758 

mean_absolute_error: 6.6979087172985405 

Step:  dtr
training score: 0.9971449151588991 

test score: 0.6071101018161782 

R2 score: 0.6071101018161782 

mean_squared_error: 187.81695588429127 

mean_absolute_error: 9.245266137628143 

Step:  abr
training score: 0.6836098777066395 

test score: 0.6835410056881939 

R2 score: 0.6835410056881939 

mean_squared_error: 151.27995208988327 

mean_absolute_error: 10.009931575640014 



In [164]:
grid_results

Unnamed: 0,step,train_accuracy,test_accuracy,r2_score,mean_squared_error,mean_absolute_error
0,lr,0.753055,0.7581,0.7581,115.637734,7.98206
1,rfr,0.971538,0.809383,0.809383,91.122428,6.764254
2,dtr,0.997145,0.60711,0.60711,187.816956,9.245266
3,abc,0.035342,0.036447,0.658968,163.026954,9.386734
