# MACHINE LEARNING - Group 09

In [None]:
# INTRO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import silhouette_score, homogeneity_score
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load the raw dataset into `df`; the path is relative to the working directory.
df = pd.read_csv('popularity_score_dataset.csv')

## Exploratory Data Analysis

In [None]:
# REMOVE DUPLICATE SONGS, IDENTIFIED BY A COMBINED "ARTIST - TRACK" KEY

# Build the key column, then keep one row per key (drop_duplicates keeps the
# first occurrence by default).
df = df.assign(artist_song=df['artists'] + ' - ' + df['track_name'])
df = df.drop_duplicates(subset='artist_song')

# Move the key column to the first position.
artist_song_col = df.pop('artist_song')
df.insert(0, 'artist_song', artist_song_col)

In [None]:
# CHECK WHETHER ENCODING ARTISTS IS FEASIBLE: COUNT THE DISTINCT ARTIST VALUES

# Missing values are left out of the count.
num_unique = len(df['artists'].dropna().unique())

In [None]:
# REMOVE COLUMNS THAT ARE NOT USED AS FEATURES, THEN ROWS WITH MISSING VALUES

df = df.drop(
    columns=[
        'Unnamed: 0',
        'album_name',
        'artists',
        'track_name',
        'track_id',
    ]
)

# Any row that still has at least one null value is discarded.
df = df.dropna()

In [None]:
# ENCODE THE EXPLICIT FEATURE AS INTEGER LABELS

le = LabelEncoder()
le.fit(df['explicit'])
df['explicit'] = le.transform(df['explicit'])

In [None]:
# KEEP ONLY THE SONGS WHOSE POPULARITY IS NOT EXACTLY 0

zero_popularity_mask = df['popularity'] == 0
df = df.loc[~zero_popularity_mask].copy()

In [None]:
# ONE HOT ENCODING GENRES

encoder = OneHotEncoder()

# Fit and transform the categorical variable
one_hot_encoded = encoder.fit_transform(df[['track_genre']])

# Convert the sparse matrix to a DataFrame with meaningful column names.
# The frame must carry df's own index: rows were removed from df in earlier
# cells, so its labels are no longer 0..n-1. With a default RangeIndex the
# concat below (which aligns on index labels) would attach encodings to the
# wrong songs and pad the unmatched rows with NaN.
one_hot_df = pd.DataFrame.sparse.from_spmatrix(
    one_hot_encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(['track_genre']),
)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([df, one_hot_df], axis=1)

In [None]:
# ONE HOT ENCODING KEY

encoder = OneHotEncoder()

# Fit and transform the categorical variable
one_hot_encoded = encoder.fit_transform(df[['key']])

# Convert the sparse matrix to a DataFrame with meaningful column names.
# The encoded rows are in the same order as df, so they are labelled with
# df's index. A default RangeIndex would not match df's labels (rows were
# removed from df earlier), and the label-aligned concat below would then
# pair encodings with the wrong songs and introduce NaN rows.
one_hot_df = pd.DataFrame.sparse.from_spmatrix(
    one_hot_encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(['key']),
)

# Concatenate the one-hot encoded DataFrame with the already encoded DataFrame
df_encoded = pd.concat([df_encoded, one_hot_df], axis=1)

In [None]:
# Remove every row that contains a null value (dropna() defaults to axis=0,
# so rows are dropped, not columns).
df_encoded = df_encoded.dropna()

## EDA (Extra)

In [None]:
# CREATING A CLEANED CSV

# Saves `df` (not `df_encoded`), so the one-hot columns are not in the file.
# NOTE(review): to_csv writes the index by default, so reading this file back
# yields an extra unnamed first column unless index_col=0 is passed — confirm
# this is intended.
df.to_csv('popularity_cleaned.csv')

In [None]:
# COLUMNS WHOSE DISTRIBUTIONS ARE EXAMINED WITH PLOTS
# (the order matters: later cells select columns by position)

features = [
    'popularity',
    'duration_ms',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'explicit',
    'mode',
]

In [None]:
# HISTOGRAMS: ONE 30-BIN FIGURE PER FEATURE

for column_name in features:
    column_values = df[column_name]
    plt.hist(column_values, bins=30, color='lightgreen')
    plt.title(column_name)
    plt.show()

In [None]:
# BOX PLOTS: ONE FIGURE PER FEATURE

for column_name in features:
    plt.boxplot(
        df[column_name],
        boxprops={'color': 'purple'},
        whiskerprops={'color': 'green'},
        medianprops={'color': 'green'},
    )
    plt.title(column_name)
    plt.show()

In [None]:
# INVESTIGATING THE RELATIONSHIPS BETWEEN THE FEATURES USING A CORRELATION MATRIX

# Correlation is only defined for numeric data. df still contains string
# columns (artist_song, track_genre), and pandas >= 2.0 raises on them instead
# of silently skipping them, so select the numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(corr, annot=True, cmap="PiYG", ax=ax)
plt.show()

## Clustering

In [None]:
# Drop the genre column
df_no_genre = df.drop('track_genre', axis=1)

# Extract the numerical features to be used for clustering
df_num = df_no_genre[features]

# Scale the numerical features
scaler = StandardScaler()
df_num_scaled = scaler.fit_transform(df_num)

# Create a new DataFrame with the scaled numerical features and the
# artist/track title column. The scaler returns a bare array, so the original
# index is restored here; otherwise the artist_song assignment below (which
# aligns on index labels) would attach names to the wrong rows or leave NaN,
# because df's index has gaps from the rows removed earlier.
df_scaled = pd.DataFrame(df_num_scaled, columns=features, index=df_num.index)
df_scaled['artist_song'] = df_no_genre['artist_song']

# Perform K-means clustering with one cluster per distinct genre
kmeans = KMeans(n_clusters=len(df['track_genre'].unique()), random_state=0).fit(df_scaled[features])

# Add the cluster labels to the DataFrame
df_scaled['cluster'] = kmeans.labels_

# Print the cluster centroids, converted back to the original feature units
centroids = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=features)
print(centroids)

In [None]:
# Scaled feature matrix used both for prediction and for scoring
X = df_scaled[features]

# Cluster assignment for each row, and the genre of each row as reference label
cluster_labels = kmeans.predict(X)
true_labels = df['track_genre'].values

# Silhouette score (computed from the data and the cluster assignment only)
silhouette_avg = silhouette_score(X, cluster_labels)
print("Silhouette Score:", silhouette_avg)

# Homogeneity score (compares the cluster assignment with the genres)
homogeneity = homogeneity_score(true_labels, cluster_labels)
print("Homogeneity Score:", homogeneity)

In [None]:
# Use every column of X except the first one as clustering input
clust = X.iloc[:, 1:]

# Fit K-means on the selected columns
# NOTE(review): 113 is hard-coded; presumably the number of genres — confirm.
kmeans = KMeans(n_clusters=113, random_state=42)
kmeans.fit(clust)

# Project the data onto its first two principal components for plotting
pca = PCA(n_components=2)
clust_pca = pca.fit_transform(clust)
first_component = clust_pca[:, 0]
second_component = clust_pca[:, 1]

# Draw one point per song, coloured by its cluster label
plt.figure(figsize=(15, 15))
plt.scatter(first_component, second_component, c=kmeans.labels_)

# Axis labels and title
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-Means Clustering Results')

plt.show()

## Regression Models

In [None]:
# BUILD THE REGRESSION DATA: FEATURES X, TARGET y (popularity), 75/25 SPLIT
y = df_encoded['popularity']
X = df_encoded.drop(columns=['popularity', 'artist_song', 'track_genre'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# STANDARDISE THE FEATURES: statistics come from the training split only and
# are then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# LINEAR REGRESSION (trained on the unscaled features)
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_r2 = r2_score(y_test, lr_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))

print('Linear Regression R2 score:', lr_r2)
print('Linear Regression RMSE:', lr_rmse)

In [None]:
# DECISION TREE (trained on the unscaled features)
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_r2 = r2_score(y_test, dt_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))

print('Decision Tree R2 score:', dt_r2)
print('Decision Tree RMSE:', dt_rmse)

In [None]:
# RANDOM FOREST (100 trees, trained on the unscaled features)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print('Random Forest R2 score:', rf_r2)
print('Random Forest RMSE:', rf_rmse)

In [None]:
# Support Vector Regression (SVR), trained on the standardised features
svr = SVR()
svr.fit(X_train_scaled, y_train)
svr_pred = svr.predict(X_test_scaled)
svr_r2 = r2_score(y_test, svr_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
svr_rmse = np.sqrt(mean_squared_error(y_test, svr_pred))

print('SVR R-squared:', svr_r2)
print('SVR RMSE:', svr_rmse)

In [None]:
# Gradient Boosting Regression, trained on the standardised features
gbr = GradientBoostingRegressor()
gbr.fit(X_train_scaled, y_train)
gbr_pred = gbr.predict(X_test_scaled)
gbr_r2 = r2_score(y_test, gbr_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
gbr_rmse = np.sqrt(mean_squared_error(y_test, gbr_pred))

print('Gradient Boosting R-squared:', gbr_r2)
print('Gradient Boosting RMSE:', gbr_rmse)

In [None]:
# XGBOOST
import xgboost as xgb
from xgboost import XGBRegressor

# Build an XGBoost regressor with default settings and train it on the
# standardised training features
xgb_model = XGBRegressor()
xgb_model = xgb_model.fit(X_train_scaled, y_train)

# Predict the held-out test split
XGB_pred = xgb_model.predict(X_test_scaled)

# Evaluation metrics: R-squared, and RMSE as the square root of the MSE
XGB_r2 = r2_score(y_test, XGB_pred)
XGB_mse = mean_squared_error(y_test, XGB_pred)
XGB_rmse = np.sqrt(XGB_mse)

print('R-squared:', XGB_r2)
print('RMSE:', XGB_rmse)

In [None]:
# XGBOOST WITH GRID SEARCH

from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; every combination is evaluated
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000],
}

# 5-fold cross-validated search, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Best parameter combination and its cross-validated score
best_score = grid_search.best_score_
best_params = grid_search.best_params_

# Train a fresh regressor with the best parameters on the full training split
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train_scaled, y_train)

# Predict the held-out test split
XGB_GRID_pred = final_model.predict(X_test_scaled)

# Evaluation metrics: R-squared, and RMSE as the square root of the MSE
XGB_GRID_r2 = r2_score(y_test, XGB_GRID_pred)
XGB_GRID_mse = mean_squared_error(y_test, XGB_GRID_pred)
XGB_GRID_rmse = np.sqrt(XGB_GRID_mse)

print('R-squared:', XGB_GRID_r2)
print('RMSE:', XGB_GRID_rmse)

## Classification

In [None]:
# Bin popularity into four quantile-based classes (integer codes) and store
# them as the classification target
df_encoded['popularity_class'] = pd.qcut(df_encoded['popularity'], q=4, labels=False)

# Features are everything except the targets and the text columns
y = df_encoded['popularity_class']
X = df_encoded.drop(
    columns=['popularity', 'popularity_class', 'artist_song', 'track_genre']
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Standardised copies of the features (fitted on the training split only);
# the random forest below is trained on the unscaled features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
rf_pred = rf_classifier.predict(X_test)

# Evaluation metrics (precision and F1 are weighted across the classes)
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred, average='weighted')
f1 = f1_score(y_test, rf_pred, average='weighted')

print('Accuracy:', accuracy)
print('Precision:', precision)
print('F1-score:', f1)

## Confusion Matrix

In [None]:
# Confusion matrix of the classifier's test predictions, drawn as an
# annotated heatmap with integer cell labels

conf_matrix = confusion_matrix(y_test, rf_pred)
sns.heatmap(
    conf_matrix,
    cmap='Blues',
    annot=True,
    fmt='d',
)