# Notebook
In this notebook I train a Neural Net

In [1]:
import autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import Classes

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [3]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score, accuracy_score
from collections import Counter
from imblearn.over_sampling import ADASYN

# Pickle in Data

In [4]:
# Pickle in cleaned dataframe

# Designate path

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Song_Predictor\Pickle\df_agg.pkl"

df = pickle.load(open(path,'rb'))
df.head(2)

Unnamed: 0,level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,TikTok Link,Release Date,Position Change,spotify_uri,audio_analysis,feature_analysis,success,year,top_albums,top_artists
0,0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,...,https://www.tiktok.com/music/All-TikTok-Mashup...,2020-08-17,23.0,5TpvLkESnw1g9wDz52efeO,"{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.88, 'energy': 0.501, 'key':...",1,2020.0,Other,Other
1,162,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,...,https://www.tiktok.com/music/WAP-Megan-Thee-St...,2018-03-22,15.0,4Oun2ylbjFKMPTiaSbbCih,"{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.935, 'energy': 0.454, 'key'...",1,2018.0,Other,Cardi B


# 2) Model - class imbalance (directly altering class weight)

In [16]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'year']]

y = df['success']

In [17]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [18]:
# Scale data

ss = StandardScaler()

x_train_scaled = ss.fit_transform(X_train)

x_test_scaled = ss.transform(X_test)

In [19]:
# Analyze imbalance
counts = Counter(y_train)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(y_train)
    )
)

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Number of positive samples in training data: 758 (12.35% of total)


In [20]:
# Our vectorized labels
#y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
#y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

In [21]:
# Define metrics
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

In [22]:
model = keras.Sequential()
model.add(layers.InputLayer(input_shape=(13,)))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               1400      
_________________________________________________________________
dense_4 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 51        
Total params: 6,501
Trainable params: 6,501
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Compile
model.compile(
    optimizer='adam', loss="binary_crossentropy", metrics=metrics
)

In [24]:
callbacks = [keras.callbacks.ModelCheckpoint("models/song_model_at_epoch_{epoch}.h5")]
class_weight = {0: weight_for_0, 1: weight_for_1}

In [25]:
# Fit on training data

history = model.fit(x_train_scaled, y_train,       
                    epochs=30,             
                    validation_split=.2,
                    verbose=2,
                    callbacks=callbacks,
                    class_weight = class_weight,
                   )

Epoch 1/30
154/154 - 2s - loss: 1.9356e-04 - fn: 144.0000 - fp: 1703.0000 - tn: 2609.0000 - tp: 456.0000 - precision: 0.2112 - recall: 0.7600 - val_loss: 0.5491 - val_fn: 10.0000 - val_fp: 488.0000 - val_tn: 582.0000 - val_tp: 148.0000 - val_precision: 0.2327 - val_recall: 0.9367
Epoch 2/30
154/154 - 0s - loss: 1.6740e-04 - fn: 48.0000 - fp: 1968.0000 - tn: 2344.0000 - tp: 552.0000 - precision: 0.2190 - recall: 0.9200 - val_loss: 0.6454 - val_fn: 5.0000 - val_fp: 560.0000 - val_tn: 510.0000 - val_tp: 153.0000 - val_precision: 0.2146 - val_recall: 0.9684
Epoch 3/30
154/154 - 0s - loss: 1.6527e-04 - fn: 46.0000 - fp: 1995.0000 - tn: 2317.0000 - tp: 554.0000 - precision: 0.2173 - recall: 0.9233 - val_loss: 0.5703 - val_fn: 7.0000 - val_fp: 511.0000 - val_tn: 559.0000 - val_tp: 151.0000 - val_precision: 0.2281 - val_recall: 0.9557
Epoch 4/30
154/154 - 0s - loss: 1.6257e-04 - fn: 59.0000 - fp: 1909.0000 - tn: 2403.0000 - tp: 541.0000 - precision: 0.2208 - recall: 0.9017 - val_loss: 0.5867 -

In [26]:
# Predict classes
yhat_classes = model.predict_classes(X_test, verbose=0)



In [27]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)

Accuracy: 0.523127
Precision: 0.205435
Recall: 0.994737
F1 score: 0.340541


# 3 ) Model - Oversampling

In [28]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'year']]

y = df['success']

In [29]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [30]:
# Scale data

ss = StandardScaler()

x_train_scaled = ss.fit_transform(X_train)

x_test_scaled = ss.transform(X_test)

In [31]:
# Oversample training data
ada = ADASYN(random_state=42)
X_adasyn_tr, y_adasyn_tr = ada.fit_resample(x_train_scaled,y_train)

In [32]:
# Our vectorized labels
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

In [33]:
model = keras.Sequential()
model.add(layers.InputLayer(input_shape=(13,)))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 100)               1400      
_________________________________________________________________
dense_7 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 51        
Total params: 6,501
Trainable params: 6,501
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Compile the model
model.compile(optimizer='adam', 
              loss="binary_crossentropy", 
              metrics=['accuracy'])

In [35]:
# Fit on training data

history = model.fit(x_train_scaled, y_train,       
                    epochs=500,             
                    validation_split=.2,
                    verbose=False,
                    callbacks=[keras.callbacks.ModelCheckpoint('models/songs_balanced.{epoch:02d}-{val_loss:.2f}.hdf5',
                                                               save_best_only=True)])

In [36]:
# Accuracy Scores
pred_train= model.predict(X_train)
scores = model.evaluate(X_train, y_train, verbose=0)
print('Accuracy on training data: {}% \n Error on training data: {}'.format(scores[1], 1 - scores[1]))   
 
pred_test= model.predict(X_test)
scores2 = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy on test data: {}% \n Error on test data: {}'.format(scores2[1], 1 - scores2[1]))    

Accuracy on training data: 0.5190553665161133% 
 Error on training data: 0.4809446334838867
Accuracy on test data: 0.5394136905670166% 
 Error on test data: 0.4605863094329834


In [37]:
# Predict classes
yhat_classes = model.predict_classes(X_test, verbose=0)



In [38]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)

Accuracy: 0.539414
Precision: 0.211817
Recall: 1.000000
F1 score: 0.349586
