In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, impute
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Load the movie table and discard the 'Unnamed: 0' and 'movieId' columns,
# neither of which is used by the cells below.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an
# earlier DataFrame.to_csv call — confirm against the script that produced it.
df = pd.read_csv("final_data_removed_lowcounts.csv").drop(
    columns=['Unnamed: 0', 'movieId']
)
df

Unnamed: 0,count,avg_rating,director,genre,year,LeadActor,language
0,49695.0,3.921240,John Lasseter,Animation,1995,Tom Hanks,English
1,22243.0,3.211977,Joe Johnston,Adventure,1995,Robin Williams,English
2,12735.0,3.151040,Howard Deutch,Comedy,1995,Walter Matthau,English
3,2756.0,2.861393,Forest Whitaker,Comedy,1995,Whitney Houston,English
4,12161.0,3.064592,Charles Shyer,Comedy,1995,Steve Martin,English
...,...,...,...,...,...,...,...
17225,12.0,2.666667,Jaume Collet-Serra,Action,2015,Liam Neeson,English
17226,21.0,3.404762,Kenneth Branagh,Drama,2015,Cate Blanchett,English
17227,17.0,2.500000,Chris Buck,Animation,2015,Kristen Bell,English
17228,25.0,3.240000,Robert Schwentke,Action,2015,Kate Winslet,English


In [2]:
# Feature matrix: every column of df except the target 'avg_rating', with the
# categorical columns expanded into one-hot indicator columns.
# pd.get_dummies was reported as too large for the full dataframe; it is used
# here on the reduced "removed_lowcounts" CSV.
# 'count' is intentionally kept as a feature: the number of times a movie was
# rated is considered significant.
X = pd.get_dummies(
    df.drop(columns=['avg_rating']),
    columns=['year', 'director', 'genre', 'LeadActor', 'language'],
)
X

Unnamed: 0,count,year_1894,year_1902,year_1903,year_1910,year_1914,year_1915,year_1916,year_1917,year_1918,...,language_Tibetan,language_Tswana,language_Turkish,language_Ukrainian,language_Ukrainian Sign Language,language_Urdu,language_Vietnamese,language_Welsh,language_Wolof,language_Zulu
0,49695.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22243.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12735.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2756.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12161.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17225,12.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17226,21.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17227,17.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17228,25.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Binary target for the classifier: 0.0 when avg_rating is strictly below the
# threshold, 1.0 otherwise.
RATING_THRESHOLD = 3.5

# Vectorised replacement for a per-row `.loc` loop. It gives the same labels
# (a missing rating fails the `<` comparison and is therefore labelled 1.0,
# exactly as the loop did), but it runs in a single pass and does not require
# the index to be exactly 0..n-1. The result keeps df's index and the
# 'avg_rating' column name, with float labels.
Y = pd.DataFrame(
    {'avg_rating': np.where(df['avg_rating'] < RATING_THRESHOLD, 0.0, 1.0)},
    index=df.index,
)

Y

Unnamed: 0,avg_rating
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
17225,0.0
17226,0.0
17227,0.0
17228,0.0


In [4]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
(x_train, x_test,
 y_train_intermediate, y_test_intermediate) = train_test_split(
    X, Y, test_size=0.3, random_state=35
)
# Flatten the single-column label frames into 1-D arrays for the classifiers.
y_train = y_train_intermediate.to_numpy().ravel()
y_test = y_test_intermediate.to_numpy().ravel()

In [5]:
# Baseline model: k-NN with scikit-learn's default hyperparameters, fitted on
# the training split; its test predictions are compared with the tuned model
# further down.
# NOTE(review): 'count' takes values in the tens of thousands while the dummy
# columns are 0/1, so it likely dominates the neighbour distances — consider
# scaling it (fitting the scaler on the training split only); confirm.
knn = KNeighborsClassifier().fit(x_train, y_train)
y_pred_initial = knn.predict(x_test)

In [6]:
# Tune k-NN with an exhaustive grid search (5-fold cross-validation on the
# training split) over the neighbour count 1..9 and both weighting schemes.
knn_params = {
    'n_neighbors': np.arange(1, 10),
    'weights': ['uniform', 'distance'],
}
gscv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
)
gscv.fit(x_train, y_train)
print(gscv.best_params_)

{'n_neighbors': 8, 'weights': 'uniform'}


In [7]:
# Build a fresh k-NN classifier from the winning grid-search hyperparameters
# (gscv.best_params_ maps 'n_neighbors' and 'weights' to their best values),
# fit it on the training split and predict the held-out rows.
new_knn = KNeighborsClassifier(**gscv.best_params_).fit(x_train, y_train)
y_pred = new_knn.predict(x_test)

# Report test accuracy of the default model next to the tuned one.
print("Initial knn accuracy: ", accuracy_score(y_test, y_pred_initial))
print("Final accuracy", accuracy_score(y_test, y_pred))

Initial knn accuracy:  0.673244341265235
Final accuracy 0.6769201005997292


Simple Time Test

In [5]:
%%time
time_knn = KNeighborsClassifier().fit(x_train,y_train)
time_res = time_knn.predict(x_test)

Wall time: 1min 35s
