In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
ratings = pd.read_csv("Data/title_ratings_cleaned.gz", index_col = False, usecols = ["tconst", "averageRating"])
variables = pd.read_csv("Data/title_basics_for_ratings_analysis.gz", index_col = False, usecols = ["tconst", "runtimeMinutes", "isAdult"])

In [3]:
merged = pd.merge(ratings, variables, on = "tconst")
merged = merged.dropna()
merged = merged.reset_index(drop=True)
merged = merged[merged["runtimeMinutes"] < 300]
merged

Unnamed: 0,tconst,averageRating,isAdult,runtimeMinutes
0,tt0000009,5.9,0,45.0
1,tt0000147,5.2,0,20.0
2,tt0000502,3.8,0,100.0
3,tt0000574,6.1,0,70.0
4,tt0000679,5.2,0,120.0
...,...,...,...,...
224190,tt9914644,8.4,0,120.0
224191,tt9914942,6.8,0,74.0
224192,tt9916132,3.6,0,94.0
224193,tt9916160,6.6,0,72.0


In [4]:
genres = pd.read_csv("Data/title_basics_for_ratings_analysis.gz", index_col = False, usecols = ["tconst", "genres"])
genres = genres.dropna()
genres = genres.reset_index(drop = True)
genres["genres"] = genres["genres"].str.split(",")
genres = genres.explode("genres")
genres = genres.reset_index(drop = True)

#converting genres into into indictor variables
df = pd.get_dummies(genres["genres"])
df2 = pd.concat([genres, df], axis =1)
df2 = df2.drop(columns = "genres")
df2 = df2.groupby("tconst").sum()
df2 = df2.reset_index()
ready_data = pd.merge(merged, df2, on = "tconst")
ready_data

Unnamed: 0,tconst,averageRating,isAdult,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
0,tt0000009,5.9,0,45.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,tt0000147,5.2,0,20.0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,tt0000574,6.1,0,70.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0000679,5.2,0,120.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0001184,3.1,0,58.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218466,tt9914644,8.4,0,120.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
218467,tt9914942,6.8,0,74.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
218468,tt9916132,3.6,0,94.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
218469,tt9916160,6.6,0,72.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
y= ready_data["averageRating"].values.astype("int")
y = y.round()
X = ready_data.drop(columns = ["averageRating", "tconst"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [6]:
#Length of movie broken down into 4 categories based on the quartiles of the data
df = pd.DataFrame(columns = ["runtimeMinutes"])
df["runtimeMinutes"] = ready_data["runtimeMinutes"]

df["< = 81 mins"] = df["runtimeMinutes"] <= 81
df["< = 81 mins"] = df["< = 81 mins"]*1

df["< = 91 mins"] = (df["runtimeMinutes"] <=91) & (df["runtimeMinutes"] > 81)
df["< = 91 mins"] = df["< = 91 mins"]*1

df["< = 102 mins"] = (df["runtimeMinutes"] <=102) & (df["runtimeMinutes"] > 91)
df["< = 102 mins"] = df["< = 102 mins"]*1

df["> 102 mins"] = (df["runtimeMinutes"] > 102)
df["> 102 mins"] = df["> 102 mins"]*1

categorized_data = pd.concat([ready_data, df], axis =1)
categorized_data = categorized_data.drop(
    columns=["tconst","runtimeMinutes"])

y2 = categorized_data["averageRating"].values.astype("int")
y2 = y2.round()
X2 = categorized_data.drop(columns = ["averageRating"], axis=1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2)

# categorized_data

In [7]:
bayes_model = make_pipeline(
    GaussianNB()
)
bayes_model.fit(X_train, y_train)
print(bayes_model.score(X_train, y_train))
print(bayes_model.score(X_test, y_test))

0.022392022117385706
0.022446812406166465


In [8]:
bayes_model2 = make_pipeline(
    GaussianNB()
)
bayes_model2.fit(X_train2, y_train2)
print(bayes_model2.score(X_train2, y_train2))
print(bayes_model2.score(X_test2, y_test2))

0.0101859593660171
0.009795305576916035


In [9]:
knn_model = make_pipeline(
    KNeighborsClassifier(n_neighbors=10)
)
knn_model.fit(X_train, y_train)
print(knn_model.score(X_train, y_train))
print(knn_model.score(X_test, y_test))

0.382635655129903
0.31912556300120837


In [10]:
from sklearn.tree import DecisionTreeClassifier
tree_model = make_pipeline(
    RandomForestClassifier(n_estimators=100,
        max_depth=15, min_samples_leaf=10)
)
tree_model.fit(X_train, y_train)
print(tree_model.score(X_train, y_train))
print(tree_model.score(X_test, y_test))

0.3823915338748757
0.3680471639386283


In [11]:
tree_model2 = make_pipeline(
    RandomForestClassifier(n_estimators=100,
        max_depth=15, min_samples_leaf=10)
)
tree_model2.fit(X_train2, y_train2)
print(tree_model2.score(X_train2, y_train2))
print(tree_model2.score(X_test2, y_test2))

0.3749824537847949
0.3719469771870079


In [12]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(solver='sgd',
    hidden_layer_sizes=(150,100,50), activation='logistic')
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.334867228552422
0.3304954410633857


In [13]:
model2 = MLPClassifier(solver='sgd',
    hidden_layer_sizes=(150,100,50), activation='logistic')
model2.fit(X_train2, y_train2)
print(model2.score(X_train2, y_train2))
print(model2.score(X_test2, y_test2))

0.33262741603754586
0.33721483759932624
