In [None]:
import pandas as pd
from numpy import cov
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from datetime import datetime

In [None]:
# Read the cleaned tracks/artists/genres CSV and show it as the cell output.
data = pd.read_csv(
    "../Data/clean_data_tracks_by_artists_genres.csv",
)
data

In [None]:
# Keep only the columns used by the per-year, per-genre aggregation below.
data_filtered = data[
    [
        "genre",
        "artist_name",
        "song_id",
        "song_popularity",
        "album_year_release",
    ]
]

In [None]:
# Remove release years that do not form a consecutive run, plus 2024, which is
# excluded because new songs are still being released for it.
drop_years = [1952, 1954, 1956, 1957, 1958, 2024]
data_filtered = data_filtered.loc[
    ~data_filtered["album_year_release"].isin(drop_years)
].reset_index(drop=True)
data_filtered

In [None]:
# Aggregate per (release year, genre): number of distinct artists, number of
# distinct songs, and mean song popularity.
group = data_filtered.groupby(["album_year_release", "genre"]).agg(
    amount_artists=pd.NamedAgg(column="artist_name", aggfunc="nunique"),
    amount_songs=pd.NamedAgg(column="song_id", aggfunc="nunique"),
    avg_song_popularity=pd.NamedAgg(column="song_popularity", aggfunc="mean"),
)

In [None]:
# Move the group keys back into regular columns and round the numeric columns
# to zero decimal places.
data_model = group.reset_index().round(decimals=0)

In [None]:
# Optional export of the modelling table (left disabled):
# data_model.to_csv("../Data/data_model.csv", index=False)
data_model

## Graphs and checking correlations
in order to choose the model

In [None]:
# Year vs. genre; the third argument of plt.scatter is the marker size, so each
# point is sized by the number of artists.
plt.scatter(
    data_model["album_year_release"],
    data_model["genre"],
    s=data_model["amount_artists"],
)


In [None]:
# Year vs. genre; the third argument of plt.scatter is the marker size, so each
# point is sized by the number of songs.
plt.scatter(
    data_model["album_year_release"],
    data_model["genre"],
    s=data_model["amount_songs"],
)


In [None]:
# Average song popularity (x axis) against genre (y axis).
plt.scatter(data_model["avg_song_popularity"], data_model["genre"])


In [None]:
# A positive relation is visible; print the covariance matrix of average
# popularity and number of artists here. The correlation coefficient itself is
# computed in the following cells.
print(cov(data_model.avg_song_popularity, data_model.amount_artists))


In [None]:
# Pearson correlation between average popularity and number of artists.
# Recorded observation: coefficient of about 0.24, i.e. a weak positive linear
# relationship; the p-value is below 0.05, so the coefficient is taken as
# statistically significant.
correlation_coef, p_value = pearsonr(
    data_model["avg_song_popularity"],
    data_model["amount_artists"],
)
correlation_coef, p_value

In [None]:
# Spearman rank correlation between average popularity and number of artists.
# Recorded observation: a positive monotonic relationship that is not strong,
# but significant (p-value below 0.05).
spearmanr(
    data_model["avg_song_popularity"],
    data_model["amount_artists"],
)


In [None]:
# Pearson correlation between average popularity and number of songs.
# Recorded observation: coefficient of about 0.23, i.e. a weak positive linear
# relationship, slightly weaker than the one with amount_artists; the p-value
# is below 0.05, so the coefficient is taken as statistically significant.
correlation_coef, p_value = pearsonr(
    data_model["avg_song_popularity"],
    data_model["amount_songs"],
)
correlation_coef, p_value

In [None]:
# Spearman rank correlation between average popularity and number of songs.
# Recorded observation: a positive monotonic relationship, weaker than the one
# with amount_artists and not strong, but significant (p-value below 0.05).
spearmanr(
    data_model["avg_song_popularity"],
    data_model["amount_songs"],
)

In [None]:
# Spearman correlation of each count column with the average popularity.
data_model[["amount_artists", "amount_songs"]].corrwith(
    data_model["avg_song_popularity"],
    method="spearman",
)

## Random Forest Regressor

### Feature data: splitting into training and testing sets

In [None]:
# Work on an independent copy. A plain assignment only creates a second name
# for the same DataFrame, so the in-place column overwrite performed on
# data_feautures in the encoding cell would silently rewrite data_model too.
data_feautures = data_model.copy()

In [None]:
# Replace each release year by the date ordinal obtained from parsing it with
# the "%Y" format (overwrites the column on data_feautures).
data_feautures["album_year_release"] = data_feautures["album_year_release"].map(
    lambda year: datetime.strptime(str(year), "%Y").toordinal()
)
# One indicator column per genre, appended to the feature table; the original
# genre column is then dropped.
oh_encoding = pd.get_dummies(data_feautures["genre"])
data_encoding = pd.concat([data_feautures, oh_encoding], axis=1).drop(columns="genre")
data_encoding

In [None]:
# Predictors are every encoded column except the target.
data_features = data_encoding.drop(columns="avg_song_popularity")
target = data_feautures["avg_song_popularity"]
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    target,
    test_size=0.20,
    random_state=0,
)


In [None]:
# Fix the seed so the fitted forest (and every score and prediction derived
# from it) is identical after Restart & Run All; 0 matches the seed already
# used for train_test_split.
random_forest = RandomForestRegressor(random_state=0)
random_forest.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report the regressor's score on that set.
prediction_ = random_forest.predict(X_test)
print(
    "R2 score",
    random_forest.score(X_test, y_test),
)

In [None]:
# Error metrics on the held-out test set; scikit-learn's signature is
# (y_true, y_pred).
print("MAE", mean_absolute_error(y_test, prediction_))
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error was deprecated and later removed from scikit-learn.
print("RMSE", mean_squared_error(y_test, prediction_) ** 0.5)
# This score is computed on the TRAINING rows; the label says so to avoid
# confusing it with the test-set R2 printed in the previous cell.
print("R2 score (train)", random_forest.score(X_train, y_train))



### Feature data without splitting

In [None]:
# Full predictor matrix (no train/test split): every encoded column except the target.
X = data_encoding.drop(columns="avg_song_popularity")

In [None]:
# Second regressor, fitted on all rows. The seed is fixed so the model and its
# predictions are reproducible across kernel restarts.
ranf = RandomForestRegressor(random_state=0)
ranf.fit(X, data_feautures.avg_song_popularity)

In [None]:
# Predictions for the same rows the model was fitted on.
prediction = ranf.predict(
    X
)

In [None]:
# In-sample metrics: `prediction` was produced for the same rows the model was
# fitted on. scikit-learn's signature is (y_true, y_pred).
print("MAE", mean_absolute_error(data_feautures.avg_song_popularity, prediction))
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error was deprecated and later removed from scikit-learn.
print("RMSE", mean_squared_error(data_feautures.avg_song_popularity, prediction) ** 0.5)
print("R2 score", ranf.score(X, data_feautures.avg_song_popularity))

In [None]:
# Line plot with the original average popularity on the y axis, one line per genre.
sns.lineplot(
    data=data_model,
    x="album_year_release",
    y="avg_song_popularity",
    hue="genre",
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

In [None]:
# Line plot with the model's predicted popularity on the y axis, one line per genre.
sns.lineplot(
    data=data_feautures,
    x="album_year_release",
    y=prediction,
    hue="genre",
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

### Some testing with future dates (not real data)

In [None]:
# Load the hand-made trial rows and convert their release year with the same
# "%Y"-parse-then-ordinal transformation applied to the training data.
test = pd.read_csv("../Data/trial.csv")
test["album_year_release"] = test["album_year_release"].map(
    lambda year: datetime.strptime(str(year), "%Y").toordinal()
)
test

In [None]:
# Prediction with the model fitted on all rows (no split).
# Trial rows: 2024=alternative metal, 2025=corrido, 2026=latin pop,
# 2027=urban latino, 2028=rap
prediction_test_ranf = ranf.predict(
    test
)
prediction_test_ranf

In [None]:
# Prediction with the model fitted on the training split.
# Trial rows: 2024=alternative metal, 2025=corrido, 2026=latin pop,
# 2027=urban latino, 2028=rap
prediction_test_random_forest = random_forest.predict(
    test
)
prediction_test_random_forest

### Changing the test inputs

In [None]:
# Override every trial row: release year 2050 (as an ordinal) and a single song.
year_ = datetime.strptime("2050", "%Y")

test = test.assign(
    album_year_release=year_.toordinal(),
    amount_songs=1,
)
test

In [None]:
# Prediction for the modified trial rows with the model fitted on all rows.
prediction_test_2050_ranf = ranf.predict(
    test
)
prediction_test_2050_ranf

In [None]:
# Prediction for the modified trial rows with the model fitted on the training split.
prediction_test_2050_random_forest = random_forest.predict(
    test
)
prediction_test_2050_random_forest

## Random Forest Classifier

In [None]:
# Assign each distinct genre an integer code in order of first appearance.
# The code range is sized from the data instead of a hard-coded 24, so zip()
# can no longer silently drop genres when there are more than 24 of them.
genres_list = data_feautures.genre.unique().tolist()
value_genre = list(range(len(genres_list)))
dict_genres_value = dict(zip(genres_list, value_genre))
dict_genres_value

In [None]:
# Replace each genre by its integer code. Series.map with a dict yields NaN for
# values missing from the dict, which is the fallback the original lambda
# intended; that lambda referenced `np`, which this notebook never imports, so
# any unmapped genre raised NameError instead.
data_model["genre"] = data_model["genre"].map(dict_genres_value)

In [None]:
# Predictors are all columns except the genre code, which is the class label.
data_use_model = data_model.drop(columns="genre")
target_genre = data_model["genre"]

# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train_categ, X_test_categ, y_train_categ, y_test_categ = train_test_split(
    data_use_model,
    target_genre,
    test_size=0.20,
    random_state=0,
)


In [None]:
# Fix the seed so the fitted classifier and its scores are reproducible after
# Restart & Run All; 0 matches the seed already used for train_test_split.
random_forest_clas = RandomForestClassifier(random_state=0)
random_forest_clas.fit(X_train_categ, y_train_categ)

In [None]:
prediction_categ = random_forest_clas.predict(X_test_categ)
# For a classifier, .score() returns the mean accuracy, not R2, so the label
# now says what is actually printed.
print("Accuracy (test)", random_forest_clas.score(X_test_categ, y_test_categ))

In [None]:
# NOTE(review): the genre codes are arbitrary integers, so MAE/RMSE between
# predicted and true codes have no natural interpretation; they are kept only
# to preserve the original output. scikit-learn's signature is (y_true, y_pred).
print("MAE", mean_absolute_error(y_test_categ, prediction_categ))
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error was deprecated and later removed from scikit-learn.
print("RMSE", mean_squared_error(y_test_categ, prediction_categ) ** 0.5)
# For a classifier, .score() returns the mean accuracy (here on the training
# rows), not R2.
print("Accuracy (train)", random_forest_clas.score(X_train_categ, y_train_categ))