# Notebook Summary

In this notebook I optimize a KNN model

In [26]:
import autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import Classes

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import pickle
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# Pickle in Data

In [28]:
# Pickle in cleaned dataframe

# Designate path

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Song_Predictor\Pickle\df_agg.pkl"

df = pickle.load(open(path,'rb'))
df.head(2)

Unnamed: 0,level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,TikTok Link,Release Date,Position Change,spotify_uri,audio_analysis,feature_analysis,success,year,top_albums,top_artists
0,0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,...,https://www.tiktok.com/music/All-TikTok-Mashup...,2020-08-17,23.0,5TpvLkESnw1g9wDz52efeO,"{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.88, 'energy': 0.501, 'key':...",1,2020.0,Other,Other
1,162,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,...,https://www.tiktok.com/music/WAP-Megan-Thee-St...,2018-03-22,15.0,4Oun2ylbjFKMPTiaSbbCih,"{'meta': {'analyzer_version': '4.0.0', 'platfo...","{'danceability': 0.935, 'energy': 0.454, 'key'...",1,2018.0,Other,Cardi B


# CV KNN

In [29]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year']]

y = df['success']

In [30]:
#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [31]:
# Oversample

ada = ADASYN(random_state=42)

# Oversample training data
X_adasyn_tr, y_adasyn_tr = ada.fit_resample(X_train,y_train)

In [32]:
# Scale features 
scaler = StandardScaler()
X_adasyn_tr = scaler.fit_transform(X_adasyn_tr)

In [33]:
# specify "parameter distributions" rather than a "parameter grid"
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
param_dist = dict(n_neighbors=k_range, weights=weight_options)

In [34]:
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'weights': ['uniform', 'distance']}


In [35]:
# Define KNN
knn = KNeighborsClassifier()

In [36]:
# n_iter controls the number of searches
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=42)
rand.fit(X_adasyn_tr, y_adasyn_tr)
rand.cv_results_

{'mean_fit_time': array([0.03697374, 0.0260958 , 0.0268012 , 0.02508183, 0.0245018 ,
        0.02511075, 0.02364514, 0.02460833, 0.02400606, 0.02503729]),
 'std_fit_time': array([0.01212002, 0.00277676, 0.00208139, 0.00150602, 0.0007656 ,
        0.00125184, 0.00166795, 0.00130978, 0.00165578, 0.00147555]),
 'mean_score_time': array([0.15117085, 0.12352729, 0.22245226, 0.2016248 , 0.14172611,
        0.22600586, 0.17039297, 0.22586064, 0.15679355, 0.20288796]),
 'std_score_time': array([0.0571168 , 0.01560528, 0.00494172, 0.00792378, 0.00296366,
        0.00793487, 0.0040042 , 0.01486671, 0.00453339, 0.0070317 ]),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'distance', 'uniform', 'distance', 'uniform',
                    'uniform', 'distance'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_neighbors': 

In [37]:
# examine the best model
print(rand.best_score_)
print(rand.best_params_)
print(rand.best_estimator_)

0.8803621322898432
{'weights': 'uniform', 'n_neighbors': 1}
KNeighborsClassifier(n_neighbors=1)


# Applying best params

In [115]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year']]

y = df['success']

In [116]:
#Split data into 3: 60% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [117]:
# Oversample

ada = ADASYN(random_state=42)

# Oversample training data
X_adasyn_tr, y_adasyn_tr = ada.fit_resample(X_train,y_train)
X_adasyn_test, y_adasyn_test = ada.fit_resample(X_test,y_test)

In [118]:
# Scale features 
scaler = StandardScaler()
X_adasyn_tr = scaler.fit_transform(X_adasyn_tr)
X_adasyn_test = scaler.fit_transform(X_adasyn_test)

In [119]:
# run initial logistic regression
# Define KNN
model = KNeighborsClassifier(n_neighbors = 10, weights = 'uniform')
model.fit(X_adasyn_tr, y_adasyn_tr)
y_predict = model.predict(X_adasyn_test)

#scores
print("Scores for the logisitc regression")
print("Training score: {:6.2f}%".format(100*model.score(X_adasyn_tr, y_adasyn_tr)))
print("Val set score: {:6.2f}%".format(100*model.score(X_adasyn_test, y_adasyn_test)))

#precision/recall
print("\nPrecision / Recall val")
print("Val F1 score: {:6.2f}%".format(f1_score(model.predict(X_adasyn_test), y_adasyn_test)))
print("Precision: {:6.2f}%,   Recall: {:6.2f}%".format(precision_score(y_adasyn_test, y_predict), 
                                                     recall_score(y_adasyn_test, y_predict)))

Scores for the logisitc regression
Training score:  86.31%
Val set score:  73.48%

Precision / Recall val
Val F1 score:   0.75%
Precision:   0.72%,   Recall:   0.78%


Our randomized search cv tests seemed to have been to wide - n_neighbors = 10, weights = 'uniform' and result in the best score

# Pickle out model

In [121]:
# Pickle model
path = r"C:\Users\Andrew\Documents\Metis\TikTok_Song_Predictor\Pickle\knn_model.pkl"
pickle.dump(model, open(path, 'wb'))