In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load final_data_removed_lowcounts.csv and drop the 'Unnamed: 0' and
# 'movieId' columns in one step before any modelling.
df = pd.read_csv("final_data_removed_lowcounts.csv").drop(columns=['Unnamed: 0', 'movieId'])
df

Unnamed: 0,count,avg_rating,director,genre,year,LeadActor,language
0,49695.0,3.921240,John Lasseter,Animation,1995,Tom Hanks,English
1,22243.0,3.211977,Joe Johnston,Adventure,1995,Robin Williams,English
2,12735.0,3.151040,Howard Deutch,Comedy,1995,Walter Matthau,English
3,2756.0,2.861393,Forest Whitaker,Comedy,1995,Whitney Houston,English
4,12161.0,3.064592,Charles Shyer,Comedy,1995,Steve Martin,English
...,...,...,...,...,...,...,...
17225,12.0,2.666667,Jaume Collet-Serra,Action,2015,Liam Neeson,English
17226,21.0,3.404762,Kenneth Branagh,Drama,2015,Cate Blanchett,English
17227,17.0,2.500000,Chris Buck,Animation,2015,Kristen Bell,English
17228,25.0,3.240000,Robert Schwentke,Action,2015,Kate Winslet,English


In [2]:
# Earlier note: pd.get_dummies was not feasible because the dataframe was too
# large; it may be feasible with the removed_lowcounts CSV used here.
# 'count' is deliberately kept as a feature: how many times a movie was rated
# is considered significant. Only the target column 'avg_rating' is removed.
categorical_columns = ['year', 'director', 'genre', 'LeadActor', 'language']
X = pd.get_dummies(df.drop(columns=['avg_rating']), columns=categorical_columns)
X

Unnamed: 0,count,year_1894,year_1902,year_1903,year_1910,year_1914,year_1915,year_1916,year_1917,year_1918,...,language_Tibetan,language_Tswana,language_Turkish,language_Ukrainian,language_Ukrainian Sign Language,language_Urdu,language_Vietnamese,language_Welsh,language_Wolof,language_Zulu
0,49695.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22243.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12735.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2756.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12161.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17225,12.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17226,21.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17227,17.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17228,25.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Target frame: a single 'avg_rating' column, copied so df itself is untouched.
Y = df[['avg_rating']].copy()

# Map ratings to binary labels for the classifier: 0 when the average rating
# is below 3.5, otherwise 1. This is done as one vectorized operation instead
# of a per-row .loc loop, so it does not depend on the index being 0..n-1 and
# avoids one scalar lookup/assignment per row.
# Missing ratings (NaN) compare False against the threshold and therefore map
# to 1. The labels are kept as floats (0.0 / 1.0).
Y['avg_rating'] = np.where(Y['avg_rating'] < 3.5, 0.0, 1.0)

Y

Unnamed: 0,avg_rating
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
17225,0.0
17226,0.0
17227,0.0
17228,0.0


In [5]:
# 70/30 train/test split of features and labels, seeded so it is repeatable.
(x_train, x_test,
 y_train_intermediate, y_test_intermediate) = train_test_split(
    X, Y, test_size=0.3, random_state=35)

# Flatten the single-column target frames into 1-D arrays for the classifier.
y_train, y_test = (np.ravel(part) for part in (y_train_intermediate, y_test_intermediate))

In [6]:
# Baseline: a random forest with default hyper-parameters, fitted on the
# training split so it can be compared with the tuned model afterwards.
rfc = RandomForestClassifier(random_state=35)
rfc.fit(x_train, y_train)

# Hyper-parameter grid (3 x 4 x 4 = 48 combinations) searched with 3-fold
# cross-validation on the training split.
tree_params = {
    'n_estimators': [10, 100, 150],
    'max_leaf_nodes': [None, 5, 10, 20],
    'max_depth': [None, 10, 20, 50],
}
gscv = GridSearchCV(estimator=rfc, param_grid=tree_params, cv=3)
gscv.fit(x_train, y_train)
print(gscv.best_params_)



{'max_depth': None, 'max_leaf_nodes': None, 'n_estimators': 150}


In [7]:
# Predict the held-out split with the baseline forest and with the grid search.
y_pred_initial = rfc.predict(x_test)
y_pred = gscv.predict(x_test)

# Report test accuracy for both models.
for label, predictions in (("Initial accuracy: ", y_pred_initial),
                           ("Final accuracy: ", y_pred)):
    print(label, accuracy_score(y_test, predictions))

Initial accuracy:  0.7463726059199072
Final accuracy:  0.7626233313987232


## Retrying without the columns that have too many unique values (`director`, `LeadActor`)

In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Reload final_data_removed_lowcounts.csv into a separate frame for the second
# experiment, dropping the 'Unnamed: 0' and 'movieId' columns in one step.
df1 = pd.read_csv("final_data_removed_lowcounts.csv").drop(columns=['Unnamed: 0', 'movieId'])
df1

Unnamed: 0,count,avg_rating,director,genre,year,LeadActor,language
0,49695.0,3.921240,John Lasseter,Animation,1995,Tom Hanks,English
1,22243.0,3.211977,Joe Johnston,Adventure,1995,Robin Williams,English
2,12735.0,3.151040,Howard Deutch,Comedy,1995,Walter Matthau,English
3,2756.0,2.861393,Forest Whitaker,Comedy,1995,Whitney Houston,English
4,12161.0,3.064592,Charles Shyer,Comedy,1995,Steve Martin,English
...,...,...,...,...,...,...,...
17225,12.0,2.666667,Jaume Collet-Serra,Action,2015,Liam Neeson,English
17226,21.0,3.404762,Kenneth Branagh,Drama,2015,Cate Blanchett,English
17227,17.0,2.500000,Chris Buck,Animation,2015,Kristen Bell,English
17228,25.0,3.240000,Robert Schwentke,Action,2015,Kate Winslet,English


In [9]:
# To inspect how many distinct values each column has, uncomment the lines
# below one at a time. A column with far too many unique values is a candidate
# for removal.
#df['director'].value_counts()
#df['genre'].value_counts()
#df['year'].value_counts()
#df['LeadActor'].value_counts()
#df['language'].value_counts()

# Remove the two columns chosen for this experiment.
df1 = df1.drop(columns=['director', 'LeadActor'])

In [10]:
# Earlier note: pd.get_dummies was not feasible because the dataframe was too
# large; it may be feasible with the removed_lowcounts CSV used here.
# 'count' is deliberately kept as a feature: how many times a movie was rated
# is considered significant. Only the target column 'avg_rating' is removed.
categorical_columns1 = ['year', 'genre', 'language']
X1 = pd.get_dummies(df1.drop(columns=['avg_rating']), columns=categorical_columns1)
X1

Unnamed: 0,count,year_1894,year_1902,year_1903,year_1910,year_1914,year_1915,year_1916,year_1917,year_1918,...,language_Tibetan,language_Tswana,language_Turkish,language_Ukrainian,language_Ukrainian Sign Language,language_Urdu,language_Vietnamese,language_Welsh,language_Wolof,language_Zulu
0,49695.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22243.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12735.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2756.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12161.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17225,12.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17226,21.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17227,17.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17228,25.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Target frame: a single 'avg_rating' column, copied so df1 itself is untouched.
Y1 = df1[['avg_rating']].copy()

# Map ratings to binary labels for the classifier: 0 when the average rating
# is below 3.5, otherwise 1. This is done as one vectorized operation instead
# of a per-row .loc loop, so it does not depend on the index being 0..n-1 and
# avoids one scalar lookup/assignment per row.
# Missing ratings (NaN) compare False against the threshold and therefore map
# to 1. The labels are kept as floats (0.0 / 1.0).
Y1['avg_rating'] = np.where(Y1['avg_rating'] < 3.5, 0.0, 1.0)

Y1

Unnamed: 0,avg_rating
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
17225,0.0
17226,0.0
17227,0.0
17228,0.0


In [12]:
# 70/30 train/test split of the reduced feature set, seeded so it is repeatable.
(x_train1, x_test1,
 y_train_intermediate1, y_test_intermediate1) = train_test_split(
    X1, Y1, test_size=0.3, random_state=35)

# Flatten the single-column target frames into 1-D arrays for the classifier.
y_train1, y_test1 = (np.ravel(part) for part in (y_train_intermediate1, y_test_intermediate1))

# The source frame is not used after this point; remove it from the namespace.
del df1

In [13]:
# Baseline on the reduced feature set: default random forest, fitted once on
# the training split.
rfc1 = RandomForestClassifier(random_state=35)
rfc1.fit(x_train1, y_train1)

# The grid is not redefined here; `tree_params` is reused from the earlier
# grid-search cell, whose definition is repeated below for reference only:
#tree_params = {'n_estimators':[10,100,150],'max_leaf_nodes':[None,5,10,20],'max_depth':[None,10,20,50]}
gscv1 = GridSearchCV(estimator=rfc1, param_grid=tree_params, cv=3)
gscv1.fit(x_train1, y_train1)
print(gscv1.best_params_)



{'max_depth': 20, 'max_leaf_nodes': None, 'n_estimators': 150}


In [14]:
# Predict the held-out split with the baseline forest and with the grid search
# (both trained on the reduced feature set).
y_pred_initial1 = rfc1.predict(x_test1)
y_pred1 = gscv1.predict(x_test1)

# Report test accuracy for both models.
for label, predictions in (("Initial accuracy: ", y_pred_initial1),
                           ("Final accuracy: ", y_pred1)):
    print(label, accuracy_score(y_test1, predictions))

Initial accuracy:  0.7237376668601276
Final accuracy:  0.746566066937512


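Removing the 'director' and 'LeadActor' columns did not increase accuracy: the tuned model's test accuracy fell from about 0.763 to 0.747, and the default model's from about 0.746 to 0.724.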

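Simple time test: fit and predict with a default random forest on the full feature set versus the reduced feature set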

In [15]:
%%time
time_rfc = RandomForestClassifier(random_state=35).fit(x_train, y_train)
time_res = time_rfc.predict(x_test)



Wall time: 16.4 s


In [17]:
%%time
time_rfc1 = RandomForestClassifier(random_state=35).fit(x_train1, y_train1)
time_res1 = time_rfc1.predict(x_test1)



Wall time: 650 ms
