# Spotify Regression Problem 2024

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PolynomialFeatures, StandardScaler

def output_csv(df_test, predicted_labels, name):
    df = pd.DataFrame()
    df['Id'] = df_test['Id']
    df['top genre'] = predicted_labels
    df.to_csv(name, index=False)

def decode_labels(label_encoder, predicted_labels):
    return label_encoder.inverse_transform(predicted_labels)

def encode_labels(txt_labels):
    label_encoder = sk.preprocessing.LabelEncoder()
    label_encoder.fit(txt_labels)
    encoded = label_encoder.transform(txt_labels)
    return label_encoder, encoded

def eval_acc(true_label, predicted_label):
    print("acc: ", sk.metrics.accuracy_score(true_label, predicted_label))
    print("f1: ", sk.metrics.f1_score(true_label, predicted_label, average='macro'))

In [3]:
df_train = pd.read_csv("RTrain.csv", encoding='utf-8')
df_test = pd.read_csv("RTest.csv", encoding='utf-8')

df_train

Unnamed: 0,Id,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,My Happiness,Connie Francis,adult standards,1996,107,31,45,-8,13,28,150,75,3,44
1,2,Unchained Melody,The Teddy Bears,,2011,114,44,53,-8,13,47,139,49,3,37
2,3,How Deep Is Your Love,Bee Gees,adult standards,1979,105,36,63,-9,13,67,245,11,3,77
3,4,Woman in Love,Barbra Streisand,adult standards,1980,170,28,47,-16,13,33,232,25,3,67
4,5,Goodbye Yellow Brick Road - Remastered 2014,Elton John,glam rock,1973,121,47,56,-8,15,40,193,45,3,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,449,But Not For Me,Ella Fitzgerald,adult standards,1959,80,22,18,-17,10,16,214,92,4,45
449,450,Surf City,Jan & Dean,brill building pop,2010,148,81,53,-13,23,96,147,50,3,50
450,451,Dilemma,Nelly,dance pop,2002,168,55,73,-8,20,61,289,23,14,77
451,452,It's Gonna Be Me,*NSYNC,boy band,2000,165,87,64,-5,6,88,191,5,8,62


In [4]:
df_test

Unnamed: 0,Id,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch
0,454,Pump It,The Black Eyed Peas,dance pop,2005,154,93,65,-3,75,74,213,1,18
1,455,"Circle of Life - From ""The Lion King""/Soundtra...",Elton John,glam rock,1994,161,39,30,-15,11,14,292,26,3
2,456,We Are The Champions - Remastered 2011,Queen,glam rock,1977,64,46,27,-7,12,18,179,38,3
3,457,Insomnia - Radio Edit,Faithless,big beat,2010,127,92,71,-9,37,53,216,6,4
4,458,This Eve of Parting,John Hartford,appalachian folk,2018,115,46,56,-12,21,34,153,18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,563,Candy Shop,50 Cent,east coast hip hop,2005,125,57,61,-8,38,76,209,3,47
110,564,Dragostea Din Tei - Italian Version,O-Zone,bubblegum dance,2010,130,89,67,-6,10,80,215,4,3
111,565,Big Poppa - 2005 Remaster,The Notorious B.I.G.,east coast hip hop,1994,84,58,78,-7,14,76,253,43,27
112,566,YMCA - Original Version 1978,Village People,disco,1978,127,97,72,-5,12,73,287,6,14


### Filtering the Data

In [5]:
sum(df_train.isnull().values.ravel())

15

In [6]:
# Removing the null values

df_train_nona = df_train.dropna()
df_test_nona = df_test.dropna()

## Data Visualisation

In [7]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# correlation_matrix = df_train_nona.corr()
# plt.figure(figsize=(8, 6))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
# plt.title('Correlation Matrix')
# plt.show()

## Encoding the data

In [8]:
# Decided to encode the title and artist values but later in the code I have removed the 
# artist and title columns entirely. This is because using One Hot Encoding we dont have any artists or song titles
# that match between the training and test set.


titleEncoder = pd.get_dummies(df_train_nona['title'], prefix='title')
artistEncoder = pd.get_dummies(df_train_nona['artist'], prefix='artist')
df_train_encoded = pd.concat([df_train_nona.drop(['title', 'artist'], axis=1), titleEncoder, artistEncoder], axis=1)


testTitleEncoder = pd.get_dummies(df_test_nona['title'], prefix='title')
testArtistEncoder = pd.get_dummies(df_test_nona['artist'], prefix='artist')
df_test_encoded = pd.concat([df_test_nona.drop(['title', 'artist'], axis=1), testTitleEncoder, testArtistEncoder], axis=1)

In [9]:
# sort = df_train_nona.groupby(by = ["top genre"]).size()
# sort = sort.sort_values(ascending = False)
# sort.head(10)

In [10]:
label_encoder = LabelEncoder()
label_encoder.fit(df_train_nona['top genre'])
encoded_genre = label_encoder.transform(df_train_nona['top genre'])
df_train_nona['top genre en'] = encoded_genre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_nona['top genre en'] = encoded_genre


In [11]:
selected_columns = ['year', 'bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch', 'pop']
df_selected = df_train_nona[selected_columns]
selected_columns2 = ['year', 'bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']
df_test_selected = df_test[selected_columns2]
df_test_selected 

Unnamed: 0,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch
0,2005,154,93,65,-3,75,74,213,1,18
1,1994,161,39,30,-15,11,14,292,26,3
2,1977,64,46,27,-7,12,18,179,38,3
3,2010,127,92,71,-9,37,53,216,6,4
4,2018,115,46,56,-12,21,34,153,18,3
...,...,...,...,...,...,...,...,...,...,...
109,2005,125,57,61,-8,38,76,209,3,47
110,2010,130,89,67,-6,10,80,215,4,3
111,1994,84,58,78,-7,14,76,253,43,27
112,1978,127,97,72,-5,12,73,287,6,14


## Regression Models

### Linear Regression

In [12]:
X_train = df_selected.drop(columns=['pop'])
y_train = df_selected['pop']

X_test = df_test_selected

# Instantiate the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = y_pred

df_res
df_res.to_csv('result_linearReg.csv', index=False)
# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)

# # Optionally, you can also print the coefficients and intercept of the model
# print("Coefficients:", model.coef_)
# print("Intercept:", model.intercept_)

# # Calculate R-squared
# r_squared = r2_score(y_test, y_pred)
# print("R-squared:", r_squared)

In [13]:
y_train

0      44
2      77
3      67
4      63
5      74
       ..
448    45
449    50
450    77
451    62
452    59
Name: pop, Length: 438, dtype: int64

### Logistic Regression

In [29]:
# Initialize the logistic regression model
logreg = LogisticRegression()

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Predict on the test data
y_pred = logreg.predict(X_test)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = y_pred

df_res
df_res.to_csv('result_logReg.csv', index=False)

# # Calculate accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Polynomial Regression

In [30]:
degree = 2  # Change this to any desired degree

# Transform the features to polynomial features
poly_features = Pipeline([
        ("poly_features", PolynomialFeatures(degree=degree)),
        ("scaler", StandardScaler()),
        ("lin_reg", LinearRegression())
    ])

# X_train_poly = poly_features.fit_transform(X_train)
# X_test_poly = poly_features.transform(X_test)

# Instantiate the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = predictions

df_res
df_res.to_csv('result_poly.csv', index=False)

# # Evaluate the model
# mse_poly = mean_squared_error(y_test, predictions)
# print("Mean Squared Error:", mse)
# from sklearn.metrics import r2_score

# rmse_poly = math.sqrt(mse_poly)
# print("Root Mean Squared Error (Lasso):", rmse_poly) 


# r_squared = r2_score(y_test, predictions)
# print("R-squared:", r_squared)

### Lasso Regression

In [31]:
# Instantiate the Lasso regression model
lasso_model = Lasso(alpha =0.1, random_state=1)  
# Train the model
lasso_model.fit(X_train, y_train)

# Predict on the test set
lasso_predictions = lasso_model.predict(X_test)

# # Evaluate the model
# mse_lasso = mean_squared_error(y_test, lasso_predictions)
# print("Mean Squared Error (Lasso):", mse_lasso)
# rmse_lasso = math.sqrt(mse_lasso)
# print("Root Mean Squared Error (Lasso):", rmse_lasso) 

# # Optionally, you can also print the coefficients and intercept of the model
# print("Coefficients (Lasso):", lasso_model.coef_)
# print("Intercept (Lasso):", lasso_model.intercept_)

# r_squared = r2_score(y_test, lasso_predictions)
# print("R-squared (Lasso):", r_squared)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = lasso_predictions

df_res
df_res.to_csv('result_lasso.csv', index=False)

## PCA Regression

In [32]:
# Apply PCA
pca = PCA(n_components=10)  # Specify the number of components you want to keep
X_pca = pca.fit_transform(X_train)

# Fit linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the testing set
y_pred = model.predict(X_test)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = y_pred

df_res
df_res.to_csv('result_pca.csv', index=False)

## Random Forest Regression

In [33]:
# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf_regressor.predict(X_test)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = y_pred

df_res
df_res.to_csv('result_random_forest.csv', index=False)

In [34]:
# Define the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation and obtain predictions
y_pred_train = cross_val_predict(rf_model, X_train, y_train, cv=kf)

# Train the model on the entire training set
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = rf_model.predict(X_test)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = y_pred_test

df_res
df_res.to_csv('result_random_forest.csv', index=False)

## Ridge Regression

In [35]:
# Apply feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Generate polynomial features
poly = PolynomialFeatures(degree=2)  # Try different degrees
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Train Ridge regression model
ridge_reg = Ridge(alpha=0.1)  # Regularization strength, tune this parameter
ridge_reg.fit(X_train_poly, y_train)

# Make predictions
y_pred = ridge_reg.predict(X_test_poly)

df_res = pd.DataFrame()
df_res['Id'] = df_test['Id']
df_res['pop'] = y_pred

df_res
df_res.to_csv('result_ridge.csv', index=False)