# Model Development

## Setup

In [1]:
import math
import os
import sys

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Record the working directory and the interpreter running this kernel,
# one per line.
print(os.getcwd(), sys.executable, sep="\n")

/Users/brianrice/dev/2021-msia423-rice-brian-project/notebooks/develop
/Users/brianrice/dev/2021-msia423-rice-brian-project/venv/bin/python


In [3]:
# Location of the cleaned dataset in S3.
S3_BUCKET = "s3://2021-msia423-rice-brian"
S3_CLEANED_PATH = "cleaned/P4KxSpotify.csv"
# Join with "/" explicitly: os.path.join uses the host OS separator, which
# would put a backslash into the S3 URI when the notebook runs on Windows.
IN_PATH = "/".join((S3_BUCKET.rstrip("/"), S3_CLEANED_PATH.lstrip("/")))

# Seed passed to the data splits and to every seeded model in this notebook.
RANDOM_SEED = 0

In [4]:
# Read the cleaned dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=IN_PATH)
df.head(n=5)

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Studio 1,Studio 1,Andy Battaglia,8.5,2009.0,2009-02-18,Studio,Electronic,0.511917,0.499667,5.25,-5.626583,0.031983,0.724917,0.024493,0.165367,0.555083,101.395167
1,John Fahey,The Great Santa Barbara Oil Slick,Mark Richardson,8.2,2005.0,2005-02-13,Water,Folk/Country,0.369765,0.325412,4.470588,-19.153824,0.148624,0.647053,0.559133,0.527782,0.179465,107.622647
2,Reigning Sound,Too Much Guitar,Stephen M. Deusner,8.3,2004.0,2004-08-19,In the Red,Electronic,0.253943,0.912857,4.428571,-1.0895,0.0555,0.000253,0.751214,0.199071,0.552786,133.8955
3,The Red Thread,After the Last,Chris Dahlen,7.3,2003.0,2003-07-17,Badman,Rock,0.4254,0.433474,5.7,-12.871,0.02826,0.310325,0.224137,0.12515,0.4514,104.3542
4,Mac Miller,Swimming,Evan Rytlewski,7.5,2018.0,2018-08-03,Warner Bros.,Rap,0.624846,0.438154,4.153846,-9.456077,0.170246,0.652462,0.012819,0.121131,0.281138,122.121308


## Data preparation

In [5]:
# Separate the target (`score`) from the remaining columns.
y = df["score"]
X = df.drop(columns="score")

# 60/20/20 split for train/validation/test datasets, done in two stages:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 25% of the remaining 80% (20% of the total) as validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=RANDOM_SEED
)

### Define input transformations and normalizations

Using artist name, album name, and reviewer name as predictors is both intractable and undesirable from a business standpoint. Artists who use our application have no control over who reviews their album, and modeling the similarity of an artist or album name to existing entries becomes very complex very quickly. For these reasons, these fields are not included as predictors.

In [6]:
# Numeric predictors, each standardized by StandardScaler.
# NOTE(review): the dataset also has an `energy` column that is not listed
# here -- confirm that leaving it out is intentional.
numeric_features = [
    "releaseyear",
    "danceability",
    "key",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]
numeric_transformer = StandardScaler()

# Categorical predictors, one-hot encoded. handle_unknown="ignore" keeps
# prediction from failing on a genre that was not seen during fitting.
categorical_features = ["genre"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [7]:
# (name, transformer, columns) triples; the output columns follow this order,
# numeric first and then the one-hot genre columns.
transformer_specs = [
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=transformer_specs)

## Linear regression

In [8]:
def evaluate_performance(pipeline, X_train, y_train, X_val, y_val):
    """Print training RMSE, validation RMSE, and training R-squared.

    Args:
        pipeline: Fitted estimator (or pipeline) exposing `predict` and `score`.
        X_train: Features the pipeline was fitted on.
        y_train: Targets corresponding to `X_train`.
        X_val: Held-out validation features.
        y_val: Targets corresponding to `X_val`.

    Returns:
        None. The three metrics are written to stdout.
    """
    # mean_squared_error is documented as (y_true, y_pred). MSE is symmetric,
    # so the value is the same either way, but following the documented order
    # stays correct if the metric is ever swapped for an asymmetric one.
    mse_train = mean_squared_error(y_train, pipeline.predict(X_train))
    rmse_train = math.sqrt(mse_train)
    print("Training RMSE:\t\t%.4f" % rmse_train)

    mse_val = mean_squared_error(y_val, pipeline.predict(X_val))
    rmse_val = math.sqrt(mse_val)
    print("Validation RMSE:\t%.4f" % rmse_val)

    # `score` is evaluated on the training data, so this figure is in-sample;
    # it says nothing about fit on the validation set.
    print("R-squared value:\t%.4f" % pipeline.score(X_train, y_train))

In [9]:
# Baseline model: linear regression on the preprocessed features.
linear_steps = [
    ("preprocessor", preprocessor),
    ("predictor", LinearRegression()),
]
pipe = Pipeline(steps=linear_steps)

In [10]:
# Fit the preprocessing and regression steps on the training split only.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['releaseyear',
                                                   'danceability', 'key',
                                                   'loudness', 'speechiness',
                                                   'acousticness',
                                                   'instrumentalness',
                                                   'liveness', 'valence',
                                                   'tempo']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['genre'])])),
                ('predictor', LinearRegression())])

In [11]:
# Report fit quality of the linear model on the train and validation splits.
evaluate_performance(
    pipeline=pipe,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Training RMSE:		1.2140
Validation RMSE:	1.2580
R-squared value:	0.0379


## Lasso regression

In [12]:
# LassoCV chooses the regularization strength by 10-fold cross-validation on
# the training split. This is somewhat redundant alongside a separate
# train/val/test split, but it keeps the tuning self-contained.
lasso_cv = LassoCV(cv=10, random_state=RANDOM_SEED)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", lasso_cv),
    ]
)
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['releaseyear',
                                                   'danceability', 'key',
                                                   'loudness', 'speechiness',
                                                   'acousticness',
                                                   'instrumentalness',
                                                   'liveness', 'valence',
                                                   'tempo']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['genre'])])),
                ('predictor', LassoCV(cv=10, random_state=0))])

In [13]:
# Report fit quality of the lasso model on the train and validation splits.
evaluate_performance(
    pipeline=pipe,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Training RMSE:		1.2141
Validation RMSE:	1.2581
R-squared value:	0.0378


In [14]:
# Model coefficients, labelled in the preprocessor's output order:
# the numeric features first, then the one-hot encoded genre levels.
best_lasso = pipe["predictor"]
# Look the fitted encoder up by its step name rather than by position.
genre_encoder = pipe["preprocessor"].named_transformers_["categorical"]
# NOTE(review): newer scikit-learn releases replace `get_feature_names` with
# `get_feature_names_out` -- check the pinned version before upgrading.
coefficient_labels = numeric_features + list(genre_encoder.get_feature_names())
pd.Series(data=best_lasso.coef_, index=coefficient_labels)

releaseyear        -0.122867
danceability       -0.061235
key                 0.011892
loudness           -0.073636
speechiness         0.043457
acousticness        0.001730
instrumentalness    0.082565
liveness            0.006299
valence             0.026045
tempo              -0.004686
x0_Electronic      -0.190892
x0_Experimental     0.170154
x0_Folk/Country     0.114870
x0_Global           0.245719
x0_Jazz             0.326637
x0_Metal           -0.000000
x0_Missing         -0.263923
x0_Pop/R&B         -0.009791
x0_Rap              0.000000
x0_Rock            -0.115855
x0_none             0.000000
dtype: float64

## Nearest neighbors

In [15]:
# Sweep even values of K from 4 through 20, refitting and reporting each one.
# `pipe` is left holding the last model (K = 20) once the loop finishes.
for neighbors in range(4, 21, 2):
    print("KNN with K = %d ----------------" % neighbors)

    knn = KNeighborsRegressor(n_neighbors=neighbors)
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("predictor", knn)])
    pipe.fit(X_train, y_train)

    evaluate_performance(pipe, X_train, y_train, X_val, y_val)

KNN with K = 4 ----------------
Training RMSE:		1.0124
Validation RMSE:	1.3583
R-squared value:	0.3309
KNN with K = 6 ----------------
Training RMSE:		1.0701
Validation RMSE:	1.3181
R-squared value:	0.2525
KNN with K = 8 ----------------
Training RMSE:		1.0987
Validation RMSE:	1.2928
R-squared value:	0.2120
KNN with K = 10 ----------------
Training RMSE:		1.1145
Validation RMSE:	1.2760
R-squared value:	0.1891
KNN with K = 12 ----------------
Training RMSE:		1.1275
Validation RMSE:	1.2670
R-squared value:	0.1701
KNN with K = 14 ----------------
Training RMSE:		1.1394
Validation RMSE:	1.2549
R-squared value:	0.1525
KNN with K = 16 ----------------
Training RMSE:		1.1450
Validation RMSE:	1.2557
R-squared value:	0.1441
KNN with K = 18 ----------------
Training RMSE:		1.1498
Validation RMSE:	1.2535
R-squared value:	0.1370
KNN with K = 20 ----------------
Training RMSE:		1.1545
Validation RMSE:	1.2516
R-squared value:	0.1299


In [16]:
# Compare in-sample predictions with the true scores for the first ten
# training rows. `pipe` is whatever the KNN sweep left behind, i.e. K = 20.
prediction_preview = pd.DataFrame(
    {
        "prediction": pipe.predict(X_train),
        # Use raw values so rows pair up by position, not by index label.
        "label": y_train.to_numpy(),
    }
)
prediction_preview.head(10)

Unnamed: 0,prediction,label
0,6.58,6.3
1,7.13,7.2
2,7.055,7.4
3,7.15,7.3
4,7.05,8.3
5,6.88,6.8
6,6.485,6.6
7,6.23,7.7
8,6.48,7.1
9,7.39,8.8


## Random forest

In [17]:
# Random forest of 200 trees with at least 2 samples per leaf and
# cost-complexity pruning (ccp_alpha) applied to each tree.
forest = RandomForestRegressor(
    n_estimators=200,
    min_samples_leaf=2,
    ccp_alpha=0.005,
    random_state=RANDOM_SEED,
    verbose=0,
    n_jobs=4,
)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", forest),
    ]
)

pipe.fit(X_train, y_train)
evaluate_performance(pipe, X_train, y_train, X_val, y_val)

Training RMSE:		1.1814
Validation RMSE:	1.2307
R-squared value:	0.0889


In [18]:
# Feature importance measures, labelled in the preprocessor's output order:
# the numeric features first, then the one-hot encoded genre levels.
# Look the fitted encoder up by its step name rather than by position.
genre_encoder = pipe["preprocessor"].named_transformers_["categorical"]
importance_labels = numeric_features + list(genre_encoder.get_feature_names())
pd.Series(data=pipe["predictor"].feature_importances_, index=importance_labels)

releaseyear         0.637025
danceability        0.053997
key                 0.015017
loudness            0.100343
speechiness         0.014362
acousticness        0.029007
instrumentalness    0.068490
liveness            0.011938
valence             0.014277
tempo               0.013369
x0_Electronic       0.000690
x0_Experimental     0.035943
x0_Folk/Country     0.000160
x0_Global           0.000000
x0_Jazz             0.001779
x0_Metal            0.000935
x0_Missing          0.000584
x0_Pop/R&B          0.000218
x0_Rap              0.000402
x0_Rock             0.001466
x0_none             0.000000
dtype: float64

## Gradient-boosted tree

In [22]:
# Gradient-boosted trees with a reduced learning rate and light
# cost-complexity pruning; all other hyperparameters are library defaults.
booster = GradientBoostingRegressor(
    learning_rate=0.075,
    ccp_alpha=0.001,
    random_state=RANDOM_SEED,
)
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("predictor", booster),
    ]
)

pipe.fit(X_train, y_train)
evaluate_performance(pipe, X_train, y_train, X_val, y_val)

Training RMSE:		1.1775
Validation RMSE:	1.2285
R-squared value:	0.0948


In [23]:
# Feature importance measures, labelled in the preprocessor's output order:
# the numeric features first, then the one-hot encoded genre levels.
# Look the fitted encoder up by its step name rather than by position.
genre_encoder = pipe["preprocessor"].named_transformers_["categorical"]
importance_labels = numeric_features + list(genre_encoder.get_feature_names())
pd.Series(data=pipe["predictor"].feature_importances_, index=importance_labels)

releaseyear         0.578512
danceability        0.071036
key                 0.021205
loudness            0.099546
speechiness         0.024585
acousticness        0.013766
instrumentalness    0.062735
liveness            0.010869
valence             0.012608
tempo               0.008746
x0_Electronic       0.001881
x0_Experimental     0.055162
x0_Folk/Country     0.008347
x0_Global           0.002102
x0_Jazz             0.014337
x0_Metal            0.000000
x0_Missing          0.007167
x0_Pop/R&B          0.000000
x0_Rap              0.007397
x0_Rock             0.000000
x0_none             0.000000
dtype: float64