In [1]:
# modules
from common import evaluate
# working with data
import pandas as pd
# modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
# Load the songs table; index_col=False tells pandas not to use the first CSV column as the index
songs = pd.read_csv("songs.csv", index_col=False)
# Remove track_id, track_name and genres; every other column stays in the frame.
# NOTE(review): time_signature and mode are NOT dropped here, although this step was
# previously described as omitting them — confirm which is intended.
df_no_genres = songs.drop(['track_id', 'track_name', 'genres'], axis='columns')

In [3]:
# Separate the target column (user_like) from the remaining feature columns
y = df_no_genres['user_like']
X = df_no_genres.drop('user_like', axis='columns')
# Shuffle rows with a fixed seed; X and y are permuted together so they stay aligned
X, y = shuffle(X, y, random_state=1234)
# Hold out 20% of the rows as the test split, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

Using default parameters as our baseline model...

In [4]:
# Baseline estimator: LogisticRegression with default settings; only the random seed is fixed
base_model = LogisticRegression(random_state = 123)

In [5]:
# Fit the baseline model (default hyperparameters, no tuning yet) on the training split
base_model.fit(X_train, y_train)

# Predict on the test split with the baseline model
# NOTE(review): y_pred is not used by any later cell visible here — confirm before removing
y_pred = base_model.predict(X_test)

# Evaluate the baseline model on the held-out test split
evaluate(base_model, X_test, y_test)

Average Error: 27.835051546391753
Accuracy = 72.16494845360825


Perhaps we can improve the model by tuning a few hyperparameters.

In [6]:
# Display the baseline estimator's current parameter values (candidates for tuning)
base_model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 123,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

According to sklearn's documentation on the logistic regression model, different solvers support different penalties:
‘lbfgs’ - [‘l2’, None]
‘liblinear’ - [‘l1’, ‘l2’]
‘newton-cholesky’ - [‘l2’, None]

I'll test liblinear with l1 penalty separately for comparison.

In [7]:
# Search space for the l2-penalised fits: three values of C crossed with three solvers,
# with the seed fixed and max_iter raised to 1000.
# NOTE(review): the grid does not include C=1.0, the value the baseline model uses, so the
# search cannot select the baseline configuration — consider widening the C range.
grid_values = dict(
    penalty=['l2'],
    C=[0.001, 0.01, 0.1],
    random_state=[123],
    solver=['lbfgs', 'newton-cholesky', 'liblinear'],
    max_iter=[1000],
)

In [8]:
# Grid search over grid_values with 10-fold cross-validation (cv=10);
# n_jobs=-1 requests all available processors
model_gridsearch = GridSearchCV(
    estimator=base_model,
    param_grid=grid_values,
    cv=10,
    n_jobs=-1,
)

In [9]:
# Run the cross-validated grid search using the training split only
model_gridsearch.fit(X_train, y_train)

In [10]:
# Display the parameter combination selected by the grid search
model_gridsearch.best_params_

{'C': 0.01,
 'max_iter': 1000,
 'penalty': 'l2',
 'random_state': 123,
 'solver': 'newton-cholesky'}

In [11]:
# Score the fitted grid-search object on the held-out test split (same helper as the baseline)
evaluate(model_gridsearch, X_test, y_test)

Average Error: 29.896907216494846
Accuracy = 70.10309278350516


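No improvement in accuracy on the test set (70.1% vs. 72.2% for the baseline). Note that this grid did not include the baseline's C = 1.0. Next, to test the liblinear solver with the l1 penalty...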

In [12]:
# Search space for the l1-penalised fits: liblinear only, over the same three values of C,
# with the seed fixed and max_iter raised to 1000.
# NOTE(review): as in the l2 grid, C=1.0 (the baseline's value) is not among the candidates.
grid_values = dict(
    penalty=['l1'],
    C=[0.001, 0.01, 0.1],
    random_state=[123],
    solver=['liblinear'],
    max_iter=[1000],
)

In [13]:
# Build a second 10-fold grid search, this time over the l1/liblinear grid_values.
# This rebinds model_gridsearch, so the earlier l2 search object is no longer reachable by name.
model_gridsearch = GridSearchCV(
    estimator=base_model,
    param_grid=grid_values,
    cv=10,
    n_jobs=-1,
)

In [14]:
# Run the l1/liblinear grid search using the training split only
model_gridsearch.fit(X_train, y_train)

In [15]:
# Display the parameter combination selected by the l1/liblinear grid search
model_gridsearch.best_params_

{'C': 0.01,
 'max_iter': 1000,
 'penalty': 'l1',
 'random_state': 123,
 'solver': 'liblinear'}

In [16]:
# Score the fitted l1/liblinear grid-search object on the held-out test split
evaluate(model_gridsearch, X_test, y_test)

Average Error: 30.927835051546392
Accuracy = 69.0721649484536


It seems that the default parameters (i.e. the lbfgs solver with an l2 penalty and C = 1.0) yield the best results on the test set.

In [17]:
# Refit the baseline (default-parameter) model on the full shuffled dataset, train and test rows together
base_model.fit(X, y)
# Write the fitted model to disk. The context manager guarantees the file handle is
# flushed and closed even if pickling raises, instead of leaving it open.
with open('lr.sav', 'wb') as model_file:
    pickle.dump(base_model, model_file)