# Notebook Summary

In this notebook I optimize a KNN model

In [5]:
import autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import Classes

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [314]:
import pickle
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Pickle in Data

In [13]:
# Pickle in factorized data

path = r"C:\Users\Andrew\Documents\Metis\TikTok_Hit_Predictor\Pickle\supervised_factorized.pkl"

df = pickle.load(open(path,'rb'))
df.head(2)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year,spotify_artists,success
0,0.88,0.501,2.0,-6.774,1.0,0.062,0.0494,0.0695,0.436,0.459,120.038,2020.0,0,1.0
1,0.935,0.454,1.0,-7.509,1.0,0.375,0.0194,0.0,0.0824,0.357,133.073,2018.0,1,1.0


# Part 1 Including Artists

# CV KNN

In [386]:
# Seperate features from label

X = df.loc[:,['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo','year','spotify_artists']]

y = df['success']

In [387]:
#Split data into 2: 80% train, 8% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [388]:
# Defining oversample and KNN

ada = ADASYN(random_state=42)

# Define KNN
knn = KNeighborsClassifier()

# Define Standard Scaler 
scaler = StandardScaler()

In [389]:
# Make scorer
f1 = make_scorer(f1_score)

In [390]:
# define the parameter values that should be searched
k_range = list(range(1, 31))

# list of weight options
weight_options = ['uniform', 'distance']

In [391]:
pipeline = Pipeline([('scale', scaler), ('sampling', ada), ('class', knn)])

In [401]:
# create a parameter grid: map the parameter names to the values that should be searched
# dictionary = dict(key=values, key=values)
param_grid = dict(class__n_neighbors=k_range, class__weights=weight_options)
print(param_grid)

{'class__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'class__weights': ['uniform', 'distance']}


In [402]:
# instantiate and fit the grid
# exhaustive grid-search because it's trying every combination
# 10-fold cross-validation is being performed 30 x 2 = 60 times

grid = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1',n_jobs=-1)

In [403]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('sampling', ADASYN(random_state=42)),
                                       ('class', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'class__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                11, 12, 13, 14, 15, 16, 17, 18,
                                                19, 20, 21, 22, 23, 24, 25, 26,
                                                27, 28, 29, 30],
                         'class__weights': ['uniform', 'distance']},
             scoring='f1')

In [404]:
#F1 score
grid.best_score_

0.5245943072911017

In [405]:
grid.best_params_

{'class__n_neighbors': 6, 'class__weights': 'uniform'}