<a href="https://www.kaggle.com/code/arenmramirez/spotify-reccomendation-project?scriptVersionId=142936470" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import geopandas as gpd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

"""import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
"""
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Review

In [None]:
# Load the track-level table that the rest of the notebook models and plots.
rec = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/tracks.csv")
# Load the per-year table (used later for the audio-characteristics line chart).
track_years = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data_by_year_o.csv")
# Load the per-genre table (used later for the top-genres bar chart).
genre_data = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data_by_genres_o.csv")
# Summarise columns, dtypes and non-null counts before any cleaning.
rec.info()

# Drop fully duplicated rows, then print the summary again to see whether the row count changed.
rec = rec.drop_duplicates()
rec.info()

In [None]:
# Preview the first rows of the de-duplicated track table.
rec.head()

In [None]:
# Count missing values per column to see which columns need filling.
rec.isnull().sum()

In [None]:
# Replace every missing value, in any column, with the string "Unknown".
# NOTE(review): this would also put strings into a numeric column if one had NaNs,
# which would break the models below — confirm only text columns contain nulls.
rec= rec.fillna("Unknown")

# Machine Learning Time

In [None]:
from sklearn.model_selection import train_test_split
# Target: the popularity column of the track table.
y = rec['popularity']
# Audio-feature columns used as inputs by every model in this notebook.
feature_cols = [
    'duration_ms',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
]
X = rec[feature_cols]
# Keep 10% of the rows for training and a separate 10% for validation;
# the remaining rows are not used. The fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.1,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Check the size and dtypes of the training sample.
X_train.info()

In [None]:
# Preview the first rows of the training features.
X_train.head()

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
"""
# Random Forest Regressor Model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
preds = model.predict(X_valid)
RFG_score = mean_absolute_error(y_valid, preds)

print("MAE: ")
print(RFG_score)
#MAE: 11.866759193770534
#Takes almost 80sec to run
"""

MAE w/58k entries and n_estimators=100
11.866759193770534

MAE w/117k entries and n_estimators=100
11.740377603551279 ... Took almost 3 min to execute

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Single regression tree capped at 300 leaves, fitted on the training sample.
DTR_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=300)
DTR_model.fit(X_train, y_train)

# Score the tree on the validation split.
pred = DTR_model.predict(X_valid)
dtr_mae = mean_absolute_error(y_valid, pred)
print(dtr_mae)

50k entries 12.398113725786954



117k entries 12.31314021431054

In [None]:
# Helper used to search for a good max_leaf_nodes value.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree with the given leaf cap and return its validation MAE.

    Parameters
    ----------
    max_leaf_nodes : upper bound on the number of leaves, passed to DecisionTreeRegressor.
    train_X, train_y : features and target the tree is fitted on.
    val_X, val_y : features and target the fitted tree is scored on.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions for ``val_X``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
# Compare validation MAE for increasingly large trees.
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_valid)
    # The MAE is a float: formatting it with %d would truncate it to a whole number
    # and make settings that differ by less than 1 look identical.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.4f" % (max_leaf_nodes, my_mae))

No difference with 117k entries; same result as with 50k.

**Cross validation takes too long on this data set**

# XGBoost

In [None]:
# Create base XGBoost model
from xgboost import XGBRegressor
# Baseline: XGBoost with library defaults, scored on the validation split.
XGB_model_1 = XGBRegressor()
XGB_model_1.fit(X_train, y_train)
predictions = XGB_model_1.predict(X_valid)
# MAE is symmetric, so passing (predictions, targets) gives the same value as (targets, predictions).
xgb_base_mae = mean_absolute_error(predictions, y_valid)
print(f"Mean Absolute Error: {xgb_base_mae}")


50k entries Mean Absolute Error: 11.982949820778812


117k entries Mean Absolute Error: 11.843299912217434

In [None]:
# XGBoost Model
# Tuned variant: up to 1000 trees with a small learning rate, stopping once the
# validation score has not improved for 5 consecutive rounds.
# early_stopping_rounds is set on the estimator rather than passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and rejected by newer releases.
XGB_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                           early_stopping_rounds=5)
# NOTE(review): the validation split is used both for early stopping and for the MAE
# reported afterwards, so that MAE is somewhat optimistic.
XGB_model_2.fit(X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                verbose=False)

In [None]:
# Predict popularity for the validation rows with XGB_model_2.
XGB_pred = XGB_model_2.predict(X_valid)

# Validation MAE. The arguments are passed as (predictions, targets); MAE is symmetric,
# so the value is the same as with the documented (y_true, y_pred) order.
XGB_mae = mean_absolute_error(XGB_pred, y_valid)

# Report the MAE.
print("Mean Absolute Error:" , XGB_mae)

50k entries Mean Absolute Error: 11.88494225922454


117k entries Mean Absolute Error: 11.826382472073112

# Cross validation with training set

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Two-stage pipeline: impute missing values with SimpleImputer's defaults,
# then fit a 100-tree random forest with a fixed seed.
pipeline_steps = [
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(random_state=0, n_estimators=100)),
]
my_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
from sklearn.model_selection import cross_val_score
"""
# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X_train, y_train,
                              cv=5,
                              scoring='neg_mean_absolute_error')

# average mae scores for pipeline
print("Average MAE score (across experiments):")
print(scores.mean())
#print("MAE scores:\n", scores)
#Average MAE score (across experiments): 11.952080103882547
"""

Takes 300 sec to run

# Mutual Information

In [None]:
# Commented out to save memory for the time being
"""
discrete_features = X.dtypes == int
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X_valid, y_valid, discrete_features):
    mi_scores = mutual_info_regression(X_valid, y_valid, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X_valid.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
"""

MI scores take up 12 GB of RAM... consider removing later

In [None]:
"""
mi_scores = make_mi_scores(X_train, y_train, discrete_features)
mi_scores[::3]  # show a few features with their MI scores
mi_scores2 = make_mi_scores(X_valid, y_valid, discrete_features)
mi_scores2[::3]  # show a few features with their MI scores
"""

In [None]:
"""
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")


plt.figure(dpi=100, figsize=(8, 5))
#plot_mi_scores(mi_scores)
plot_mi_scores(mi_scores2)
#Blue is mi_score and orange is mi_scores2
"""

In [None]:
#sns.relplot(x="popularity", y="acousticness", data=rec);

In [None]:
#sns.lmplot(x="energy", y="popularity", hue="fuel_type", data=rec);

# Permutation Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of XGB_model_2, computed on the validation split with a fixed seed.
perm = PermutationImportance(XGB_model_2, random_state=1).fit(X_valid, y_valid)

In [None]:
# Names of the int64 columns of the full track table.
# NOTE(review): this list is not used below — show_weights receives the X_valid column
# names instead — so it looks like a leftover; confirm before removing.
feature_names = [i for i in rec.columns if rec[i].dtype in [np.int64]]
# Display the permutation-importance weights, labelled with the model's feature columns.
eli5.show_weights(perm, feature_names = X_valid.columns.tolist())

# SHAP values

In [None]:
from catboost import Pool, CatBoostRegressor, cv
# Positions of the object-dtype columns in X; these are handed to CatBoost as categorical features.
# The builtin `object` is used because the `np.object` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
categorical_features_indices = np.where(X.dtypes == object)[0]

# CatBoost regressor: RMSE loss, 350 boosting iterations, fixed seed.
model = CatBoostRegressor(random_seed = 350,loss_function = 'RMSE',iterations=350)
#fitting the train data
model.fit(X_train, y_train,cat_features = categorical_features_indices,verbose=False)


In [None]:
# "Accuracy" of the CatBoost model on the validation split, defined as 100 minus the
# mean absolute percentage error.
# Predictions are taken from the CatBoost `model` here: at this point in the notebook
# the shared `predictions` variable still holds the baseline XGBoost output.
catboost_predictions = model.predict(X_valid)
errors = abs(catboost_predictions - y_valid)
# A percentage error is undefined for a target of 0, so those rows are left out
# instead of turning the mean into inf/NaN.
nonzero_target = y_valid != 0
mape = 100 * (errors[nonzero_target] / y_valid[nonzero_target])
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)


print('Accuracy_CatBoost:', round(accuracy, 2))

In [None]:
from sklearn import metrics
# Score the CatBoost model on the validation split with three error metrics.
predictions=model.predict(X_valid)
mae_value = metrics.mean_absolute_error(y_valid, predictions)
mse_value = metrics.mean_squared_error(y_valid, predictions)
print('Mean Absolute Error     MAE:', mae_value)
print('Mean Squared Error      MSE:', mse_value)
# RMSE is the square root of the MSE computed above.
print('Root Mean Squared Error RMSE:', np.sqrt(mse_value))

In [None]:
import shap

# Per-row SHAP values of the CatBoost model on the validation split.
valid_pool = Pool(X_valid, label=y_valid, cat_features=categorical_features_indices)
shap_values = model.get_feature_importance(valid_pool, type="ShapValues")
# Drop the last column so the remaining columns line up with the columns of X_valid.
shap_values = shap_values[:, :-1]
# Bar chart summarising the SHAP values per feature.
shap.summary_plot(shap_values, X_valid, plot_type="bar")

In [None]:
# Recompute the SHAP matrix for the validation split.
valid_pool = Pool(X_valid, label=y_valid, cat_features=categorical_features_indices)
shap_values = model.get_feature_importance(valid_pool, type="ShapValues")
# The last column is read as the expected (base) value; the other columns are the
# per-feature contributions.
expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]
shap.initjs()

# Explain a single validation row. The SHAP values and the feature values must come
# from the SAME row, otherwise the plot pairs contributions with unrelated inputs.
row_to_explain = 5
shap.force_plot(expected_value, shap_values[row_to_explain, :], X_valid.iloc[row_to_explain, :])

# Graph Analysis

In [None]:
import calendar
# Parse the release date once, then derive the year and the month name from it.
release_dates = pd.to_datetime(rec['release_date'])
rec['release_date'] = release_dates
rec['release_year'] = release_dates.dt.year
# Map the month number to its name via the calendar module.
rec['release_month'] = release_dates.dt.month.apply(
    lambda month_number: calendar.month_name[month_number]
)

In [None]:
# Check the last rows, including the newly added release_year / release_month columns.
rec.tail()

In [None]:
import plotly.express as px 
import matplotlib.pyplot as plt
import seaborn as sns
# Most popular artists: highest popularity reached under each distinct value of the
# `artists` column, best first.
# (The earlier full-frame sort_values was dropped: its result was overwritten by this
# assignment before being used, and a group-wise max does not depend on row order.)
pop_artists = (
    rec.groupby('artists').popularity.agg('max')
    .reset_index()
    .sort_values('popularity', ascending=False)
)
pop_artists.columns=['artists','max_popularity']
# Most popular songs: highest popularity reached under each distinct track name, best first.
pop_songs = (
    rec.groupby('name').popularity.agg('max')
    .reset_index()
    .sort_values('popularity', ascending=False)
)
pop_songs.columns=['name','max_popularity']
#pop_artists=pop_artists.set_index('artists')['explicit'].sum().sort_values(ascending=False)
#plt.figure(figsize=(10,7))
#sns.lineplot(data=pop_artists,linewidth=3).set(title="Popular Artists over time")

In [None]:
# Track count and highest popularity for each value of the `explicit` column,
# listed with the larger `explicit` value first.
e_v_n = (
    rec.groupby('explicit')['popularity']
    .agg(count='count', max_popularity='max')
    .reset_index()
    .sort_values('explicit', ascending=False)
)
e_v_n.head()

In [None]:
# Sum of the `explicit` column per artists value, largest first; plot the top ten as bars.
explicit_by_artist = rec.groupby('artists')['explicit'].sum().sort_values(ascending=False)
explicit_by_artist.to_frame().head(10).plot(
    kind='bar',
    figsize=(18, 8),
    color='skyblue',
    title='Number of Explicit Songs',
)

In [None]:
# Highest popularity per genre, best first.
top_genres = genre_data.groupby('genres').popularity.agg('max').reset_index().sort_values('popularity', ascending=False)
top_genres.columns=['genres','max_popularity']
top_15_genres = top_genres.head(15)

# Bar chart of the 15 genres with the highest maximum popularity.
plt.figure(figsize=(15,7))
genre_graph=sns.barplot(x=top_15_genres['genres'], y=top_15_genres['max_popularity'], edgecolor='black')
# Place a tick every 10 points from 0 to 100 and let matplotlib derive the labels from
# the tick positions. Hand-written label lists are error-prone: one that omits a value
# (here '50' was missing) mislabels every tick after the gap, and a list whose length
# differs from the number of ticks can be rejected outright.
genre_graph.set_yticks(list(range(0, 101, 10)))
# Rotate the genre names so they do not overlap.
genre_graph.set_xticklabels(genre_graph.get_xticklabels(), rotation=40, ha="right")
genre_graph.set(title="Top 15 Genres")

In [None]:
# Table form of the 15 genres shown in the chart.
top_genres.head(15)

In [None]:
# Top 10 artists by the highest popularity of any of their tracks.
pop_artists.head(10)

In [None]:
# Top 10 track names by highest popularity.
pop_songs.head(10)

In [None]:
import plotly.express as px
# Audio features to draw from the per-year table, one line per feature.
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence','speechiness']
# Interactive line chart of each feature against the year.
fig=px.line(track_years, x='year', y=sound_features, title="Audio Characteristics over time")
fig.show()

In [None]:
# Mean of the `explicit` column per release year, drawn as a line
# (this is the share of explicit tracks if the column is a 0/1 flag — TODO confirm).
rec.groupby('release_year')['explicit'].mean().plot(figsize = (18,6) ,
                kind = 'line' , color = 'red' , title = 'Rise of Explicit Content on Spotify')


In [None]:
# Nine audio features, one per panel of a 3x3 grid.
features_to_plot = ['liveness' , 'danceability' , 'energy' , 'speechiness',
                    'acousticness' , 'instrumentalness' , 'valence','loudness' , 'tempo']
n_rows, n_cols = 3, 3
fig,ax = plt.subplots(n_rows, n_cols, figsize =(18,12))
for i in range(n_rows):
    for j in range(n_cols):
        # Row-major position in the flat feature list. Indexing with i + j would only
        # reach positions 0-4, repeating panels and never drawing the last four features.
        feature = features_to_plot[i * n_cols + j]
        # Random RGB colour per panel.
        color = np.random.rand(3,)
        # Five artists values with the highest mean of this feature.
        rec.groupby('artists')[feature].mean().sort_values(ascending = False).to_frame()[: 5].plot(
            kind = 'bar' , color = color
              , title = f"top 5 artists for {feature}"
              , ax = ax[i][j])
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=1.8
                    )
plt.tight_layout()
plt.show()