In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

df = pd.read_csv("final_data_removed_lowcounts.csv")
del df['Unnamed: 0']
del df['movieId']
df

Unnamed: 0,count,avg_rating,director,genre,year,LeadActor,language
0,49695.0,3.921240,John Lasseter,Animation,1995,Tom Hanks,English
1,22243.0,3.211977,Joe Johnston,Adventure,1995,Robin Williams,English
2,12735.0,3.151040,Howard Deutch,Comedy,1995,Walter Matthau,English
3,2756.0,2.861393,Forest Whitaker,Comedy,1995,Whitney Houston,English
4,12161.0,3.064592,Charles Shyer,Comedy,1995,Steve Martin,English
...,...,...,...,...,...,...,...
17225,12.0,2.666667,Jaume Collet-Serra,Action,2015,Liam Neeson,English
17226,21.0,3.404762,Kenneth Branagh,Drama,2015,Cate Blanchett,English
17227,17.0,2.500000,Chris Buck,Animation,2015,Kristen Bell,English
17228,25.0,3.240000,Robert Schwentke,Action,2015,Kate Winslet,English


# WHAT MUST BE DONE
1) create X (all data except movieId and rating) and Y (rating only) dataframes <br>
2) map Y values onto some finite set, e.g. 0-0.5 => 0, 0.5-1 => 1, etc. ('outputs' for neural net) <br>
3) encode X data (specifically director, LeadActor, genre, language) <br>
4) split into testing and training and do np.ravel or something on Y data so it is useable <br>
5) fit neural net to create models and make predictions, check accuracy <br>
6) check accuracy or whatever

In [2]:
#Can't do pd.get_dummies because the dataframe is too large --> maybe can with removed_lowcounts csv
X = df.copy()
del X['avg_rating']
#del X['count'] <--> NOT deleting because amount of times movies are rated has significance

X = pd.get_dummies(X, columns=['year','director','genre','LeadActor','language'])
X

Unnamed: 0,count,year_1894,year_1902,year_1903,year_1910,year_1914,year_1915,year_1916,year_1917,year_1918,...,language_Tibetan,language_Tswana,language_Turkish,language_Ukrainian,language_Ukrainian Sign Language,language_Urdu,language_Vietnamese,language_Welsh,language_Wolof,language_Zulu
0,49695.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22243.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12735.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2756.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12161.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17225,12.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17226,21.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17227,17.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17228,25.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
Y = df[['avg_rating']].copy()
#mapping different ratings to create neural net
#now the net will predict which range the movie rating will fall in
"""
for i in range(0, len(Y)):
    if Y.loc[i, 'avg_rating'] <= 0.5:
        Y.loc[i, 'avg_rating'] = 0
    elif Y.loc[i, 'avg_rating'] <= 1.0:
        Y.loc[i, 'avg_rating'] = 1
    elif Y.loc[i, 'avg_rating'] <= 1.5:
        Y.loc[i, 'avg_rating'] = 2
    elif Y.loc[i, 'avg_rating'] <= 2.0:
        Y.loc[i, 'avg_rating'] = 3
    elif Y.loc[i, 'avg_rating'] <= 2.5:
        Y.loc[i, 'avg_rating'] = 4
    elif Y.loc[i, 'avg_rating'] <= 3.0:
        Y.loc[i, 'avg_rating'] = 5
    elif Y.loc[i, 'avg_rating'] <= 3.5:
        Y.loc[i, 'avg_rating'] = 6
    elif Y.loc[i, 'avg_rating'] <= 4.0:
        Y.loc[i, 'avg_rating'] = 7
    elif Y.loc[i, 'avg_rating'] <= 4.5:
        Y.loc[i, 'avg_rating'] = 8
    else:
        Y.loc[i, 'avg_rating'] = 9
"""

#mapping ratings to create binary conditions for classifier
for i in range(0, len(Y)):
    if Y.loc[i, 'avg_rating'] < 3.5:
        Y.loc[i, 'avg_rating'] = 0
    else:
        Y.loc[i, 'avg_rating'] = 1

Y

Unnamed: 0,avg_rating
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
17225,0.0
17226,0.0
17227,0.0
17228,0.0


In [4]:
#split data into training and testing
#+ then alter the Y data so it can be used correctly
x_train, x_test, y_train_intermediate, y_test_intermediate = train_test_split(X, Y, test_size=0.3, random_state=35)
y_train = np.ravel(y_train_intermediate)
y_test = np.ravel(y_test_intermediate)

In [5]:
#create the classifier and fit it onto the training data
#THESE ARE JUST RANDOM VALUES, WE NEED TO TRY OTHERS TO GET A BETTER FIT
clf1 = MLPClassifier(hidden_layer_sizes=(250,150), random_state=35, activation='logistic')
clf1.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(250, 150), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=35, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [6]:
clf2 = MLPClassifier(hidden_layer_sizes=(4,3), random_state=35, activation='logistic')
clf2.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(4, 3), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=35, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [7]:
#make some predictions and test the accuracy
from sklearn.metrics import accuracy_score

y_pred1 = clf1.predict(x_test)
y_pred2 = clf2.predict(x_test)

print("Accuracy for first classifier: ",accuracy_score(y_test, y_pred1))
print("Accuracy for second classifier: ", accuracy_score(y_test, y_pred2))

Accuracy for first classifier:  0.7252853550009674
Accuracy for second classifier:  0.7341845618107952
