# Notebook Summary
In this notebook I test various decision tree based models

In [1]:
import autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import Classes

In [2]:
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score, accuracy_score
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Pickle in Data

In [3]:
# Pickle in factorized data

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Hit_Predictor\Pickle\supervised_factorized.pkl"

df_factorized = pickle.load(open(path,'rb'))
df_factorized.head(2)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year,spotify_artists,success
0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,0.459,120.038,2020.0,0,1.0
1,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,0.357,133.073,2018.0,1,1.0


In [4]:
# Pickle in dummied data

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Hit_Predictor\Pickle\supervised_dummy.pkl"

df_dummy = pickle.load(open(path,'rb'))
df_dummy.head(2)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,spotify_artists_Yes,spotify_artists_Yusuf / Cat Stevens,spotify_artists_Yves Montand,spotify_artists_ZZ Top,spotify_artists_Zofia Dromlewiczowa,spotify_artists_blink-182,spotify_artists_other,spotify_artists_Трумен Капоте,spotify_artists_Эрих Мария Ремарк,spotify_artists_Эрнест Хемингуэй
0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,0.459,...,0,0,0,0,0,0,1,0,0,0
1,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,0.357,...,0,0,0,0,0,0,1,0,0,0


# 1.A) Random Forest - Factorized artists

In [5]:
# Seperate features from label

X = df_factorized.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year','spotify_artists']]

y = df_factorized['success']

I elect to only use a train / test split due to a limited dataset ~7,500 rows

In [40]:
#Split data into 3: 80% train,20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [41]:
# Oversample training data
ada = ADASYN(random_state=42)
X_adasyn_tr, y_adasyn_tr = ada.fit_resample(X_train,y_train)

In [8]:
# run random forest
clf = RandomForestClassifier()
clf.fit(X_adasyn_tr, y_adasyn_tr)
y_predict = clf.predict(X_test)

In [9]:
Classes.train_scores(clf,X_adasyn_tr, y_adasyn_tr)

Train Scores
Accuracy score: 100.00%


In [10]:
Classes.test_scores(y_test,y_predict)

Test Scores
Score:  93.16%
F1 score:  76.82%
Precision:  66.16%,  Test Recall:  91.58%


# 1.B) Random Forest - Dummy artists

In [22]:
# Seperate features from label

X = df_dummy.drop(['success'],axis=1)

y = df_dummy['success']

In [23]:
#Split data into 3: 80% train,20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [24]:
# Oversample training data
ada = ADASYN(random_state=42)
X_adasyn_tr, y_adasyn_tr = ada.fit_resample(X_train,y_train)

In [25]:
# run random forest
clf = RandomForestClassifier()
clf.fit(X_adasyn_tr, y_adasyn_tr)
y_predict = clf.predict(X_test)

In [26]:
Classes.train_scores(clf,X_adasyn_tr, y_adasyn_tr)

Train Scores
Accuracy score: 100.00%


In [27]:
Classes.test_scores(y_test,y_predict)

Test Scores
Score:  83.26%
F1 score:  32.90%
Precision:  32.64%,  Test Recall:  33.16%


It seems that we have dimensionality problems here and so the score is much lower

# 2) Balanced Random Forest Classifier 

In [28]:
# Seperate features from label

X = df_factorized.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year','spotify_artists']]

y = df_factorized['success']

In [29]:
#Split data into 3: 80% train,20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [30]:
model = BalancedRandomForestClassifier(n_estimators=10)

In [31]:
# run model
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [32]:
Classes.train_scores(model,X_train, y_train)

Train Scores
Accuracy score:  94.10%


In [33]:
Classes.test_scores(y_test,y_predict)

Test Scores
Score:  94.59%
F1 score:  81.68%
Precision:  70.34%,  Test Recall:  97.37%


# 3) Balanced Bagging Classifier

In [34]:
# Seperate features from label

X = df_factorized.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year','spotify_artists']]

y = df_factorized['success']

In [35]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [36]:
# define model
model = BalancedBaggingClassifier()

In [37]:
# run model
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [38]:
Classes.train_scores(model,X_train, y_train)

Train Scores
Accuracy score:  94.66%


In [39]:
Classes.test_scores(y_test,y_predict)

Test Scores
Score:  94.33%
F1 score:  80.96%
Precision:  69.29%,  Test Recall:  97.37%
