## Project Check-In 2: Linear Regression

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklego.linear_model import LADRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [2]:
spotify_cleaned = pd.read_excel('clean_data.xlsx')
spotify_cleaned.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [3]:
# Print correlations to popularity based on numeric predictor variables
correlations = spotify_cleaned.corr(numeric_only=True)
correlations[correlations.abs() > 0.2].replace(1, np.nan).dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
explicit,,,,,0.325346,,,,,
danceability,,,,0.254835,,,,,0.484892,
energy,,,,0.759726,,-0.733844,,,0.249962,0.247234
loudness,,0.254835,0.759726,,,-0.582554,-0.430668,,0.279826,0.213648
speechiness,0.325346,,,,,,,0.220905,,
acousticness,,,-0.733844,-0.582554,,,,,,-0.217672
instrumentalness,,,,-0.430668,,,,,-0.325794,
liveness,,,,,0.220905,,,,,
valence,,0.484892,0.249962,0.279826,,,-0.325794,,,
tempo,,,0.247234,0.213648,,-0.217672,,,,


We chose loudness as our response variable because the correlation matrix shows that it has the highest correlation with the highest number of other variables. 

In [4]:
# Split data into testing, validation, and training sets
# 60% training, 20% validation, 20% testing

training_data = spotify_cleaned.sample(frac=0.6, random_state=47)
validation_data = spotify_cleaned.drop(training_data.index).sample(frac=0.5, random_state=47)
testing_data = spotify_cleaned.drop(training_data.index).drop(validation_data.index)

In [5]:
vars = ["instrumentalness", "speechiness", "energy", "valence", "danceability", "acousticness"]
response = "loudness"
lad_fit = LADRegression()
lad_fit.fit(training_data[vars], y=training_data[response])
lad_fit.coef_, lad_fit.intercept_

(array([-4.31233799, -2.7860288 , 12.80024597, -1.15755635,  2.87059887,
        -0.24312433]),
 np.float64(-16.272224418581))

In [6]:
# For single predictor variable only
# fig = px.scatter(training_data, x=vars[0], y=response)
# fig.add_trace(go.Scatter(x=training_data[vars[0]], y=lad_fit.intercept_ + lad_fit.coef_[0] * training_data[vars[0]], mode='lines', name='LAD Fit'))

In [7]:
# LS Model
ls_fit = LinearRegression()
ls_fit.fit(X=training_data[vars], y=training_data[response])
ls_fit.coef_, ls_fit.intercept_

(array([-4.87876109, -3.82274955, 14.02495087, -1.23172707,  4.23668704,
        -0.29770919]),
 np.float64(-17.892421374198385))

In [8]:
# Scatter plot for single predictor variable only
# fig = px.scatter(training_data, x=vars[0], y=response)
# fig.add_trace(go.Scatter(x=training_data[vars[0]], y=ls_fit.intercept_ + ls_fit.coef_[0] * training_data[vars[0]], mode='lines', name='LS Fit'))

In [9]:
# # Select predictor variables
# selector = SequentialFeatureSelector(ls_fit, n_features_to_select=2, director="forward", scoring="neg_mean_squared_error", cv=5)
# selector.fit(training_data[vars], training_data[response])
# selector.get_feature_names_out()

In [10]:
pred_train_df = pd.DataFrame({'true': training_data[response], 'ls_pred': ls_fit.predict(training_data[vars]), 'lad_pred': lad_fit.predict(training_data[vars])})

pred_val_df = pd.DataFrame({'true': validation_data[response], 'ls_pred': ls_fit.predict(validation_data[vars]), 'lad_pred': lad_fit.predict(validation_data[vars])})

In [11]:
# calculate the rMSE, MAE, MAD, correlation, and R2 of the true price with the LS and LAD predictions
print('Training LS rMSE:', np.sqrt(mean_squared_error(pred_train_df['true'], pred_train_df['ls_pred'])))
print('Training LS MAE:', mean_absolute_error(pred_train_df['true'], pred_train_df['ls_pred']))
print('Training LS MAD:', np.median(np.abs(pred_train_df['true'] - pred_train_df['ls_pred'])))
print('Training LS correlation:', np.corrcoef(pred_train_df['true'], pred_train_df['ls_pred'])[0, 1])
print('Training LS R2:', r2_score(pred_train_df['true'], pred_train_df['ls_pred']))

print('Training LAD rMSE:', np.sqrt(mean_squared_error(pred_train_df['true'], pred_train_df['lad_pred'])))
print('Training LAD MAE:', mean_absolute_error(pred_train_df['true'], pred_train_df['lad_pred']))
print('Training LAD MAD:', np.median(np.abs(pred_train_df['true'] - pred_train_df['lad_pred'])))
print('Training LAD correlation:', np.corrcoef(pred_train_df['true'], pred_train_df['lad_pred'])[0, 1])
print('Training LAD R2:', r2_score(pred_train_df['true'], pred_train_df['lad_pred']))

Training LS rMSE: 2.823590696673957
Training LS MAE: 2.023759078505459
Training LS MAD: 1.5285941784908168
Training LS correlation: 0.831285043268912
Training LS R2: 0.6910348231625951
Training LAD rMSE: 2.8814030501008787
Training LAD MAE: 1.987007779424985
Training LAD MAD: 1.4550285961532232
Training LAD correlation: 0.8303052280698789
Training LAD R2: 0.678253321806673


In [12]:
# calculate the rMSE, MAE, MAD, correlation, and R2 of the true price with the LS and LAD predictions
print('Training LS rMSE:', np.sqrt(mean_squared_error(pred_val_df['true'], pred_val_df['ls_pred'])))
print('Training LS MAE:', mean_absolute_error(pred_val_df['true'], pred_val_df['ls_pred']))
print('Training LS MAD:', np.median(np.abs(pred_val_df['true'] - pred_val_df['ls_pred'])))
print('Training LS correlation:', np.corrcoef(pred_val_df['true'], pred_val_df['ls_pred'])[0, 1])
print('Training LS R2:', r2_score(pred_val_df['true'], pred_val_df['ls_pred']))

print('Training LAD rMSE:', np.sqrt(mean_squared_error(pred_val_df['true'], pred_val_df['lad_pred'])))
print('Training LAD MAE:', mean_absolute_error(pred_val_df['true'], pred_val_df['lad_pred']))
print('Training LAD MAD:', np.median(np.abs(pred_val_df['true'] - pred_val_df['lad_pred'])))
print('Training LAD correlation:', np.corrcoef(pred_val_df['true'], pred_val_df['lad_pred'])[0, 1])
print('Training LAD R2:', r2_score(pred_val_df['true'], pred_val_df['lad_pred']))

Training LS rMSE: 2.9059976921267308
Training LS MAE: 2.071505919089823
Training LS MAD: 1.5587623471715415
Training LS correlation: 0.8290252937130539
Training LS R2: 0.6871310902579157
Training LAD rMSE: 2.977191147512716
Training LAD MAE: 2.033404221080465
Training LAD MAD: 1.4660372130404506
Training LAD correlation: 0.8278265234112687
Training LAD R2: 0.6716134835918869


In [13]:
from sklearn.linear_model import  Ridge, Lasso
from sklearn.model_selection import cross_val_score, cross_validate


In [16]:
# change this

X = training_data.drop(columns=['loudness'])[vars]
# scale the predictors
X_std = (X - X.mean()) / X.std()
y = training_data['loudness']

In [19]:
alphas = np.logspace(-1, 6, 100)
ridge_cv_scores = []
# create a for loop to compute the cross-validation score for each alpha value
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge_cv = cross_validate(estimator=ridge,
                              X=X_std,
                              y=y,
                              cv=10,
                              scoring='neg_root_mean_squared_error')
    ridge_cv_scores.append({'alpha': alpha,
                            'log_alpha': np.log(alpha),
                            'test_mse': -np.mean(ridge_cv['test_score'])})

# convert the cross-validation scores into a data frame
ridge_cv_scores_df = pd.DataFrame(ridge_cv_scores)

# plot the cross-validation scores as a function of alpha
px.line(ridge_cv_scores_df,
        x='log_alpha',
        y='test_mse',
        title='Ridge')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [20]:
ridge_cv_scores_df 

Unnamed: 0,alpha,log_alpha,test_mse
0,0.100000,-2.302585,2.823900
1,0.117681,-2.139776,2.823900
2,0.138489,-1.976967,2.823900
3,0.162975,-1.814158,2.823900
4,0.191791,-1.651349,2.823900
...,...,...,...
95,521400.828800,13.164274,4.603648
96,613590.727341,13.327083,4.663633
97,722080.901839,13.489892,4.717384
98,849753.435909,13.652702,4.765208


In [23]:

# identify the value of alpha that minimizes the cross-validation score for ridge
ridge_alpha_min = ridge_cv_scores_df.sort_values(by='test_mse').head(1).alpha.values[0]
# compute the min MSE and the SE of the MSE
mse_se_ridge = ridge_cv_scores_df['test_mse'].std() / np.sqrt(10)
mse_min_ridge = ridge_cv_scores_df['test_mse'].min()


# identify the value of alpha that minimizes the cross-validation score for ridge within 1SE
ridge_alpha_1se = ridge_cv_scores_df[(ridge_cv_scores_df['test_mse'] <= mse_min_ridge + mse_se_ridge) &
                                     (ridge_cv_scores_df['test_mse'] >= mse_min_ridge - mse_se_ridge)].sort_values(by='alpha', ascending=False).head(1).alpha.values[0]


In [24]:
print('Ridge (min): ', ridge_alpha_min)
print('Ridge (1SE): ', ridge_alpha_1se)
print('Lasso (min): ', lasso_alpha_min)
print('Lasso (1SE): ', lasso_alpha_1se)

Ridge (min):  0.1
Ridge (1SE):  14508.287784959402
Lasso (min):  0.1
Lasso (1SE):  0.8111308307896871
