In [61]:
# Import our dependencies
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler 
import pandas as pd
import tensorflow as tf
import pandas as pd 
import numpy as np
import math
from config import db_password
import psycopg2
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression , Lasso , Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score ,confusion_matrix ,classification_report , mean_squared_error
import os
from tensorflow.keras.callbacks import ModelCheckpoint



In [62]:
# Build the PostgreSQL connection URL for the local "songs_data" database.
# The password is percent-encoded so that reserved URL characters in it
# (e.g. "@", "/", ":") cannot corrupt the connection string; a password made
# only of unreserved characters is left unchanged by the encoding.
from urllib.parse import quote

db_string = f"postgresql://postgres:{quote(str(db_password), safe='')}@127.0.0.1:5432/songs_data"

# Create the SQLAlchemy engine used for all reads in this notebook.
engine = create_engine(db_string)

# Load the processed songs table and drop its "index" column
# (presumably the DataFrame index persisted when the table was written — TODO confirm).
songs_df = pd.read_sql_table("songs_processed", con=engine).drop(columns="index")

songs_df.head()


Unnamed: 0,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,Duration (minutes)
0,0,2014,0,0.789,0.915,0,-3.263,1,0.248,0.0135,9e-06,0.66,127.955,3.06
1,0,2013,68,0.797,0.608,6,-6.096,0,0.0584,0.00112,7.7e-05,0.402,127.999,4.66
2,1,2002,85,0.548,0.847,1,-3.237,1,0.186,0.0622,0.0,0.1,171.447,4.96
3,0,2006,58,0.711,0.761,8,-3.04,1,0.225,0.067,0.0,0.718,95.824,4.06
4,0,2017,73,0.613,0.764,2,-6.509,1,0.136,0.0527,0.0,0.417,160.015,3.47


## Feature Engineering and Selection :

The first task is to see how a simple linear regression model performs on the features that we selected. Based on whether the initial model appears to be overfitting or underfitting, we can make some modeling decisions.

 

### Modeling

In [63]:
# Split the frame into the target ('popularity') and the predictor columns.
y_feature = songs_df.loc[:, 'popularity']
X_feature = songs_df.drop(columns=['popularity'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_feature,
    y_feature,
    test_size=0.2,
    random_state=78,
)

The best path forward is to experiment with LASSO or Ridge regression to improve the model by reducing the complexity in the features.

To see whether LASSO or Ridge regression reduces overfitting, we need to compare the train score and the validation score for each.

#### LASSO

In [88]:
def cross_validate(X, y, estimator, cv=5, random_state=78):
    """
    Run shuffled k-fold cross validation for an estimator and print a report.

    Code was taken from https://scikit-learn.org/stable/modules/cross_validation.html
    and adjusted.

    For every fold the features are standardized with a ``StandardScaler`` that is
    fitted on that fold's training rows only, ``estimator`` is refit on the scaled
    training rows, and the train score, validation score and validation RMSE are
    recorded. The per-fold averages are printed, followed by the coefficients of
    ``estimator`` as fitted on the LAST fold (they are not averaged across folds).

    :param pandas.DataFrame X: Features. Rows are selected positionally with
        ``.iloc``; the column names label the printed coefficients.
    :param pandas.Series y: Targets, positionally aligned with ``X``.
    :param estimator: scikit-learn model exposing ``fit``, ``score``, ``predict``
        and ``coef_`` (e.g. ``Lasso`` or ``Ridge``). It does not need to be fitted
        beforehand; it is fitted in place once per fold.
    :param int cv: Number of K-Folds for cross validation.
    :param random_state: Seed for the fold shuffle. The default (78, the same seed
        the notebook uses for its train/test split) makes repeated calls use the
        same folds, so runs are reproducible and results for different estimators
        or alphas are directly comparable. Pass ``None`` for an unseeded shuffle.
    :return: None. The report is written to stdout.
    """
    # Seeding the shuffle is what makes the printed averages repeatable.
    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
    r2_train, r2_val, rmse = [], [], []
    for train_ind, val_ind in kf.split(X, y):
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_val, y_val = X.iloc[val_ind], y.iloc[val_ind]

        # Fit the scaler on the training rows only, then apply the same
        # transformation to the validation rows.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        estimator.fit(X_train_scaled, y_train)
        r2_train.append(estimator.score(X_train_scaled, y_train))
        r2_val.append(estimator.score(X_val_scaled, y_val))
        rmse.append(math.sqrt(mean_squared_error(y_val, estimator.predict(X_val_scaled))))

    print(f'R2 Train Avg: {np.mean(r2_train)}')
    print(f'R2 Val Avg: {np.mean(r2_val)}')
    print(f'RMSE Avg: {np.mean(rmse)}')
    print('----Coefficients----')
    # coef_ reflects the most recent fit, i.e. the final fold.
    for coef, col in zip(estimator.coef_, X.columns):
        print(f'{col}: {coef}')

In [89]:
# Sweep the Lasso regularization strength over four orders of magnitude and
# print the cross-validation report for each value on the full feature set.
for alpha in (0.01, 0.1, 1, 10):
    print(f'For alpha = {alpha}')
    cross_validate(X_train, y_train, Lasso(alpha=alpha))
    # Two separator lines between consecutive reports.
    print('---------\n---------')

For alpha = 0.01
R2 Train Avg: 0.012221231238003338
R2 Val Avg: -0.01312635798228876
RMSE Avg: 21.363056742044357
----Coefficients----
explicit: 0.9541535893316396
year: -0.48162706543689454
danceability: 0.38926948728439387
energy: -1.392431866897764
key: 0.4573070229369845
loudness: 1.9001962868729705
mode: -0.5297250289389459
speechiness: -0.1966383073410803
acousticness: 0.20288731823719924
instrumentalness: -0.5818418079699004
valence: -1.2681434219411805
tempo: 0.1274886229677933
Duration (minutes): 0.7557472428555843
---------
---------
For alpha = 0.1
R2 Train Avg: 0.011395494556577268
R2 Val Avg: -0.0048700588311272105
RMSE Avg: 21.290573981743506
----Coefficients----
explicit: 0.7982566924315921
year: 0.0
danceability: 0.17833875329696922
energy: -0.044526672143241214
key: 0.5804958024953266
loudness: 1.0308973342213432
mode: -0.39480789652424925
speechiness: 0.42424646697487517
acousticness: 0.33848188634819887
instrumentalness: -0.6407804024545225
valence: -0.91782381574617



R2 Train Avg: 0.001813800098488283
R2 Val Avg: -0.008877465120292305
RMSE Avg: 21.24241780135404
----Coefficients----
explicit: 0.0
year: 0.0
danceability: -0.0
energy: -0.0
key: 0.0
loudness: 0.0
mode: -0.0
speechiness: 0.0
acousticness: 0.0
instrumentalness: -0.0
valence: -0.15429257370094296
tempo: 0.0
Duration (minutes): 0.07837621644830522
---------
---------
For alpha = 10
R2 Train Avg: 0.0
R2 Val Avg: -0.006694887141201145
RMSE Avg: 21.220495936521377
----Coefficients----
explicit: 0.0
year: 0.0
danceability: -0.0
energy: -0.0
key: 0.0
loudness: 0.0
mode: -0.0
speechiness: 0.0
acousticness: 0.0
instrumentalness: -0.0
valence: -0.0
tempo: 0.0
Duration (minutes): 0.0
---------
---------




In [90]:
# Sweep the Ridge regularization strength over four orders of magnitude and
# print the cross-validation report for each value on the full feature set.
for alpha in (0.01, 0.1, 1, 10):
    print(f'For alpha = {alpha}')
    cross_validate(X_train, y_train, Ridge(alpha=alpha))
    # Two separator lines between consecutive reports.
    print('---------\n---------')

For alpha = 0.01
R2 Train Avg: 0.011705277101274714
R2 Val Avg: -0.01452507921294215
RMSE Avg: 21.304083812242816
----Coefficients----
explicit: 1.0776057658846767
year: 0.5522533211301062
danceability: 0.09456584671273813
energy: 0.04206270739805924
key: 0.8285701572068294
loudness: 1.017534409992465
mode: -0.13214715544076613
speechiness: -0.22305332432121158
acousticness: 0.3103184916419263
instrumentalness: -0.7728922272516616
valence: -0.2129487663087804
tempo: 0.21401438004697954
Duration (minutes): 1.016266915007899
---------
---------
For alpha = 0.1
R2 Train Avg: 0.012083004287338616
R2 Val Avg: -0.013172476358926067
RMSE Avg: 21.335361351235093
----Coefficients----
explicit: 0.7525631296918037
year: -0.09759588141496817
danceability: 0.6411312592627902
energy: -0.03138378051483699
key: 0.4976112892925988
loudness: 1.3268883030664111
mode: -0.45488859875838367
speechiness: 0.032505808241906194
acousticness: 0.3517401183973264
instrumentalness: -0.3626003565988787
valence: -1.1



R2 Train Avg: 0.013359638080429592
R2 Val Avg: -0.020793171284111577
RMSE Avg: 21.450091554546873
----Coefficients----
explicit: 0.5277520290512405
year: -0.5434665840919178
danceability: -0.02549305695027149
energy: -1.8036859584494256
key: 0.571101184174279
loudness: 2.329946145947073
mode: -0.3133479162265774
speechiness: -0.32823173187441873
acousticness: 0.030007082768460574
instrumentalness: 0.04879475439893289
valence: -1.059960897639633
tempo: -0.14298532172244463
Duration (minutes): 0.18826304767292112
---------
---------
For alpha = 10
R2 Train Avg: 0.011903983734997903
R2 Val Avg: -0.011548558233642137
RMSE Avg: 21.311874329999444
----Coefficients----
explicit: 0.9245326089301544
year: -0.1161927742195664
danceability: 0.02027458465318869
energy: -0.8620300423689259
key: 0.8415188330709068
loudness: 1.2554934324024456
mode: -0.5446613047532293
speechiness: -0.14171964632202852
acousticness: 0.3422165692664019
instrumentalness: -0.4167545333086257
valence: -0.4211494694728008



### Based on the Ridge and Lasso results, remove features and experiment with different models

In [91]:
# Display the feature columns currently present in the training set.
X_train.columns

Index(['explicit', 'year', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo',
       'Duration (minutes)'],
      dtype='object')

In [124]:
# Build the reduced training feature set by dropping the listed columns.
# DataFrame.drop returns a new frame, so X_train itself is left untouched and
# no explicit copy is needed beforehand.
X_features_removed = X_train.drop(
    columns=['tempo', 'key', 'mode', 'danceability', 'instrumentalness', 'year', 'speechiness'],
)


In [125]:
# Repeat the Lasso regularization sweep, this time on the reduced feature set,
# printing the cross-validation report for each alpha.
for alpha in (0.01, 0.1, 1, 10):
    print(f'For alpha = {alpha}')
    cross_validate(X_features_removed, y_train, Lasso(alpha=alpha))
    # Two separator lines between consecutive reports.
    print('---------\n---------')

For alpha = 0.01
R2 Train Avg: 0.00901040535598594
R2 Val Avg: -0.0016619140329377836
RMSE Avg: 21.226605950340492
----Coefficients----
explicit: 0.8029352082416287
energy: -0.3634859591219765
loudness: 1.7375974194229504
acousticness: 0.4633359577295334
valence: -0.6375959187685943
Duration (minutes): 0.9568925584262226
---------
---------
For alpha = 0.1




R2 Train Avg: 0.00978112257024275
R2 Val Avg: -0.008685180780121881
RMSE Avg: 21.310607212812485
----Coefficients----
explicit: 0.8314607186524949
energy: -0.7065791832009405
loudness: 1.3357181402935245
acousticness: 0.2552879100768874
valence: -0.8935055668996955
Duration (minutes): 0.21558364977801542
---------
---------
For alpha = 1




R2 Train Avg: 0.0012137971397835567
R2 Val Avg: -0.005643843599721565
RMSE Avg: 21.258510529468264
----Coefficients----
explicit: 0.43418239340968007
energy: 0.0
loudness: 0.0
acousticness: 0.0
valence: -0.0
Duration (minutes): 0.0
---------
---------
For alpha = 10
R2 Train Avg: 0.0
R2 Val Avg: -0.004944604360007698
RMSE Avg: 21.220133851762046
----Coefficients----
explicit: 0.0
energy: 0.0
loudness: 0.0
acousticness: -0.0
valence: -0.0
Duration (minutes): 0.0
---------
---------


