# Notebook Summary
In this notebook I test various decision tree based models

In [1]:
import autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import Classes

In [2]:
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Pickle in Data

In [3]:
# Pickle in cleaned dataframe

# Designate path

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Song_Predictor\Pickle\df_agg.pkl"

df = pickle.load(open(path,'rb'))
df.head(2)

Unnamed: 0,level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,TikTok Link,Release Date,Position Change,spotify_uri,audio_analysis,feature_analysis,success,year,top_albums,top_artists
0,0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,...,https://www.tiktok.com/music/All-TikTok-Mashup...,2020-08-17,23.0,5TpvLkESnw1g9wDz52efeO,"{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.88, 'energy': 0.501, 'key':...",1,2020.0,Other,Other
1,162,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,...,https://www.tiktok.com/music/WAP-Megan-Thee-St...,2018-03-22,15.0,4Oun2ylbjFKMPTiaSbbCih,"{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.935, 'energy': 0.454, 'key'...",1,2018.0,Other,Cardi B


# 1) Random Forest

In [4]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature','Album Name','Artist(s)', 'year']]

y = df['success']

In [5]:
# Factorize
Classes.gnumeric_func(X, ['Album Name','Artist(s)'])

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,Album Name,Artist(s),year
0,0.880,0.5010,2.0,-6.774,1.0,0.0620,0.049400,0.069500,0.4360,0.4590,120.038,4.0,0,0,2020.0
1,0.935,0.4540,1.0,-7.509,1.0,0.3750,0.019400,0.000000,0.0824,0.3570,133.073,4.0,1,1,2018.0
2,0.842,0.5970,2.0,-6.336,1.0,0.0627,0.002520,0.000000,0.1240,0.2980,145.992,4.0,2,2,2018.0
3,0.884,0.5460,2.0,-6.279,0.0,0.1170,0.269000,0.000008,0.0640,0.4900,113.236,3.0,3,3,2020.0
4,0.926,0.7620,1.0,-1.887,1.0,0.2050,0.000718,0.000000,0.1170,0.2770,127.931,4.0,4,4,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7670,0.626,0.4150,2.0,-12.073,1.0,0.6030,0.559000,0.000000,0.0931,0.5940,99.609,4.0,7338,1259,0.0
7671,0.672,0.0304,2.0,-26.380,1.0,0.0381,0.288000,0.865000,0.0676,0.0682,90.037,4.0,7339,2637,0.0
7672,0.879,0.6420,0.0,-6.775,1.0,0.0570,0.096800,0.000002,0.1300,0.7420,129.938,4.0,7340,5394,0.0
7673,0.649,0.9310,1.0,-3.150,0.0,0.1810,0.009300,0.000000,0.7510,0.7440,153.645,4.0,7341,20,0.0


In [6]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,stratify=y_train)

In [7]:
# Oversample training data
ada = ADASYN(random_state=42)
X_adasyn_tr, y_adasyn_tr = ada.fit_resample(X_train,y_train)

In [8]:
# run random forest
clf = RandomForestClassifier()
clf.fit(X_adasyn_tr, y_adasyn_tr)
y_predict = clf.predict(X_val)

#scores
print("Scores for the clf")
print("Training score: {:6.2f}%".format(100*clf.score(X_adasyn_tr, y_adasyn_tr)))
print("Val set score: {:6.2f}%".format(100*clf.score(X_val, y_val)))

#precision/recall
print("\nPrecision / Recall")
print("Val F1 score: {:6.2f}%".format(f1_score(clf.predict(X_val), y_val)))
print("Precision: {:6.2f}%,   Recall: {:6.2f}%".format(100*precision_score(y_val, y_predict), 
                                                     100*recall_score(y_val, y_predict)))

Scores for the clf
Training score: 100.00%
Val set score:  99.54%

Precision / Recall
Val F1 score:   0.98%
Precision:  96.92%,   Recall:  99.47%


The initial random forest yields an F1 roughly equivalent to the best logistic regression

# 2) Balanced Random Forest Classifier

In [9]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature','Album Name','Artist(s)', 'year']]

y = df['success']

In [10]:
# Factorize
Classes.gnumeric_func(X, ['Album Name','Artist(s)'])

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,Album Name,Artist(s),year
0,0.880,0.5010,2.0,-6.774,1.0,0.0620,0.049400,0.069500,0.4360,0.4590,120.038,4.0,0,0,2020.0
1,0.935,0.4540,1.0,-7.509,1.0,0.3750,0.019400,0.000000,0.0824,0.3570,133.073,4.0,1,1,2018.0
2,0.842,0.5970,2.0,-6.336,1.0,0.0627,0.002520,0.000000,0.1240,0.2980,145.992,4.0,2,2,2018.0
3,0.884,0.5460,2.0,-6.279,0.0,0.1170,0.269000,0.000008,0.0640,0.4900,113.236,3.0,3,3,2020.0
4,0.926,0.7620,1.0,-1.887,1.0,0.2050,0.000718,0.000000,0.1170,0.2770,127.931,4.0,4,4,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7670,0.626,0.4150,2.0,-12.073,1.0,0.6030,0.559000,0.000000,0.0931,0.5940,99.609,4.0,7338,1259,0.0
7671,0.672,0.0304,2.0,-26.380,1.0,0.0381,0.288000,0.865000,0.0676,0.0682,90.037,4.0,7339,2637,0.0
7672,0.879,0.6420,0.0,-6.775,1.0,0.0570,0.096800,0.000002,0.1300,0.7420,129.938,4.0,7340,5394,0.0
7673,0.649,0.9310,1.0,-3.150,0.0,0.1810,0.009300,0.000000,0.7510,0.7440,153.645,4.0,7341,20,0.0


In [11]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,stratify=y_train)

In [12]:
model = BalancedRandomForestClassifier(n_estimators=10)

In [13]:
# run model
model.fit(X_train, y_train)
y_predict = model.predict(X_val)

#scores
print("Scores for the random forest")
print("Training score: {:6.2f}%".format(100*model.score(X_train, y_train)))
print("Test set score: {:6.2f}%".format(100*model.score(X_val, y_val)))

#precision/recall
print("\nPrecision / Recall")
print("Val F1 score: {:6.2f}%".format(100*f1_score(y_predict, y_val)))
print("Val F2 score: {:6.2f}%".format(100*fbeta_score(y_predict, y_val,beta=0.5)))
print("Precision: {:6.2f}%".format(100*precision_score(y_val, y_predict)))
print("Recall: {:6.2f}%".format(100*recall_score(y_val, y_predict)))

Scores for the random forest
Training score:  99.44%
Test set score:  99.09%

Precision / Recall
Val F1 score:  96.45%
Val F2 score:  98.55%
Precision:  93.14%
Recall: 100.00%


An F1 score of 79% is the best so far - a few percentage points higher than the LR.  If there is no clear leader, I will need to go back and cross validate all scores to confirm.

# 3) Balanced Bagging Classifier

In [14]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature','Album Name','Artist(s)', 'year']]

y = df['success']

In [15]:
# Factorize
Classes.gnumeric_func(X, ['Album Name','Artist(s)'])

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,Album Name,Artist(s),year
0,0.880,0.5010,2.0,-6.774,1.0,0.0620,0.049400,0.069500,0.4360,0.4590,120.038,4.0,0,0,2020.0
1,0.935,0.4540,1.0,-7.509,1.0,0.3750,0.019400,0.000000,0.0824,0.3570,133.073,4.0,1,1,2018.0
2,0.842,0.5970,2.0,-6.336,1.0,0.0627,0.002520,0.000000,0.1240,0.2980,145.992,4.0,2,2,2018.0
3,0.884,0.5460,2.0,-6.279,0.0,0.1170,0.269000,0.000008,0.0640,0.4900,113.236,3.0,3,3,2020.0
4,0.926,0.7620,1.0,-1.887,1.0,0.2050,0.000718,0.000000,0.1170,0.2770,127.931,4.0,4,4,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7670,0.626,0.4150,2.0,-12.073,1.0,0.6030,0.559000,0.000000,0.0931,0.5940,99.609,4.0,7338,1259,0.0
7671,0.672,0.0304,2.0,-26.380,1.0,0.0381,0.288000,0.865000,0.0676,0.0682,90.037,4.0,7339,2637,0.0
7672,0.879,0.6420,0.0,-6.775,1.0,0.0570,0.096800,0.000002,0.1300,0.7420,129.938,4.0,7340,5394,0.0
7673,0.649,0.9310,1.0,-3.150,0.0,0.1810,0.009300,0.000000,0.7510,0.7440,153.645,4.0,7341,20,0.0


In [16]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,stratify=y_train)

In [17]:
# define model
model = BalancedBaggingClassifier()

In [18]:
# run model
model.fit(X_train, y_train)
y_predict = model.predict(X_val)

#scores
print("Scores for the random forest")
print("Training score: {:6.2f}%".format(100*model.score(X_train, y_train)))
print("Test set score: {:6.2f}%".format(100*model.score(X_val, y_val)))

#precision/recall
print("\nPrecision / Recall")
print("Val F1 score: {:6.2f}%".format(100*f1_score(y_predict, y_val)))
print("Val F2 score: {:6.2f}%".format(100*fbeta_score(y_predict, y_val,beta=0.5)))
print("Precision: {:6.2f}%".format(100*precision_score(y_val, y_predict)))
print("Recall: {:6.2f}%".format(100*recall_score(y_val, y_predict)))

Scores for the random forest
Training score:  99.41%
Test set score:  99.15%

Precision / Recall
Val F1 score:  96.69%
Val F2 score:  98.65%
Precision:  93.60%
Recall: 100.00%


Score is essentially the same as the balnced random forest