In [1]:
import os
from getpass import getpass

import matplotlib.pyplot as plt
import mysql.connector
import pandas as pd
import sklearn as sk

#needed to check effectiveness of various algorithms
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold #using KFold instead of StratifiedKFold bc some groups in "titles" too small

#needed to check stats of algorithm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#ML algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

#doing one hot encoding
from sklearn.preprocessing import OneHotEncoder

In [2]:
#connect with MySQL
#credentials are read from the environment so the password is never stored in the notebook;
#host/user fall back to the local defaults, and if MYSQL_PASSWORD is unset the password is
#prompted for (getpass does not echo what is typed)
dp = mysql.connector.connect (
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    db="movielensset"
)

In [3]:
#the raw moviesset table exactly as it is stored in MySQL, before any cleaning
moviesset = pd.read_sql_query(sql="Select * from moviesset", con=dp)
moviesset

Unnamed: 0,movieId,titles,genres,userId,ratings,time_ratings,tag,time_tags,age,gender,job
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,1.0,4.0,9.649827e+08,,,24.0,M,technician
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,5.0,4.0,8.474350e+08,,,33.0,F,other
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,7.0,5.0,1.106636e+09,,,57.0,M,administrator
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,15.0,3.0,1.510578e+09,,,49.0,F,educator
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,17.0,5.0,1.305696e+09,,,30.0,M,programmer
...,...,...,...,...,...,...,...,...,...,...,...
102690,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy\r,184.0,4.0,1.537109e+09,,,37.0,M,librarian
102691,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy\r,184.0,4.0,1.537110e+09,,,37.0,M,librarian
102692,193585,Flint (2017),Drama\r,184.0,4.0,1.537110e+09,,,37.0,M,librarian
102693,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation\r,184.0,4.0,1.537110e+09,,,37.0,M,librarian


In [4]:
#working table: only rows that carry a tag, and only the columns used for modelling
#(userId and the two timestamp columns are not selected)
#filtering on tag throws away the large majority of rows (the display below ends at index 3474,
#the raw table above at 102693). really sketchy move; in a pro setting, best not to do this
tagged_rows_query = "select movieId, titles, genres, ratings, tag, age, gender, job from moviesset where tag is not null"
moviessetclean = pd.read_sql_query(sql=tagged_rows_query, con=dp)
moviessetclean

Unnamed: 0,movieId,titles,genres,ratings,tag,age,gender,job
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,4,pixar,23,M,salesman
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,4,pixar,51,M,executive
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy\r,4,fun,24,M,entertainment
3,2,Jumanji (1995),Adventure|Children|Fantasy\r,4,fantasy,27,F,administrator
4,2,Jumanji (1995),Adventure|Children|Fantasy\r,4,magic board game,27,F,administrator
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi\r,4,star wars,27,F,administrator
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi\r,4,anime,37,M,librarian
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi\r,4,comedy,37,M,librarian
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi\r,4,gintama,37,M,librarian


In [5]:
#how many distinct titles each frame holds; a title repeats once per rating/tag row
print(f"Unique movies for uncleaned set: {len(moviesset['titles'].unique())}")
print(f"Unique movies for cleaned set: {len(moviessetclean['titles'].unique())}")

Unique movies for uncleaned set: 9737
Unique movies for cleaned set: 1464


In [6]:
#text clean-up, one expression per column:
#  genres - drop the trailing '\r' (rstrip only trims the right-hand end; strip trims both ends), then lower-case
#  tag    - lower-case
#  gender - trim surrounding spaces, then lower-case
#  job    - trim surrounding spaces (case is left as stored)
moviessetclean = moviessetclean.assign(
    genres=moviessetclean['genres'].str.rstrip('\r').str.lower(),
    tag=moviessetclean['tag'].str.lower(),
    gender=moviessetclean['gender'].str.strip(' ').str.lower(),
    job=moviessetclean['job'].str.strip(' '),
)

In [7]:
#can simply identify movie with its title, so no need for "movieId"
#errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
#because the column has already been removed
moviessetclean = moviessetclean.drop(columns='movieId', errors='ignore')

In [8]:
#turn the delimited strings into python lists, one list per row:
#genres are separated by '|', tags are broken into words on single spaces
moviessetclean = moviessetclean.assign(
    genres=moviessetclean['genres'].str.split('|'),
    tag=moviessetclean['tag'].str.split(' '),
)

In [9]:
#one row per (tag word, genre) combination: unpack the tag lists first, then the genre lists
#NOTE(review): every original row now appears several times with only tag/genre differing, so
#copies of the same review can end up on both sides of the later train/validation split - confirm
#this is acceptable when reading the accuracy figures
moviessetclean = moviessetclean.explode('tag').explode('genres')

In [10]:
#one hot encoder shared by training and by the user-input prediction at the end;
#handle_unknown='ignore' is the scikit-learn option that encodes unseen categories as all zeros instead of raising
enc = OneHotEncoder(
    handle_unknown='ignore',
)

In [11]:
#fit the encoder on the feature columns, selected by name (everything except the 'titles' target),
#so the feature set no longer depends on where each column happens to sit in the frame
#.toarray() densifies the encoder's sparse output
feature_columns = ['genres', 'ratings', 'tag', 'age', 'gender', 'job']
set_encoded = enc.fit_transform(moviessetclean[feature_columns]).toarray()

In [12]:
#peek at the dense one-hot matrix built in the previous cell
set_encoded

array([[0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
#features and target for the models (the actual train/validation split happens in the next cell)
#X: the one-hot encoded feature matrix
#Y: the movie title of each row, read by column name rather than by position so that only
#   that one column is converted to an array
X = set_encoded
Y = moviessetclean['titles'].to_numpy()

In [14]:
#hold out 33% of the rows for validation, with a fixed seed so the split is repeatable
#earlier attempts kept raising "least populated class with size 1";
#test_size was adjusted by hand until the split went through
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [15]:
#candidate classifiers to compare, as (short name, estimator) pairs
#RAND and CART are seeded: both are randomised, so without random_state their scores
#change from run to run and the comparison cannot be reproduced
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('KNN', KNeighborsClassifier()),
    ('RAND', RandomForestClassifier(random_state=1)),
    ('NB', MultinomialNB()),
    ('GNB', GaussianNB()),
    ('CART', DecisionTreeClassifier(random_state=1)),
]

#one shuffled 2-fold splitter shared by every model; with an integer random_state it yields
#the same folds on every use, so building it once outside the loop changes nothing
kfold = KFold(n_splits=2, random_state=1, shuffle=True)

#per-model fold accuracies and the matching names, kept in the same order
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    #mean accuracy over the folds, with the standard deviation in brackets
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

LR: 0.581877 (0.000834)
KNN: 0.473249 (0.004353)
RAND: 0.595835 (0.001222)
NB: 0.332864 (0.009341)
GNB: 0.670057 (0.004797)
CART: 0.547146 (0.003000)


In [16]:
#GaussianNB had the highest cross-validated accuracy in the comparison above,
#so it is refitted on the whole training split and scored on the held-out rows
final_model = GaussianNB().fit(X_train, Y_train)
predictions = final_model.predict(X_validation)


# Share of validation rows whose title was predicted exactly
print(accuracy_score(y_true=Y_validation, y_pred=predictions))

0.7406919275123558


In [17]:
#predictions based on user input
#text answers are trimmed and lower-cased so they line up with the cleaned training columns
#(genres, tag and gender were lower-cased during cleaning); otherwise an answer such as "F"
#or " Musical" never matches a category the encoder has seen and is encoded as all zeros
#age and rating must be whole numbers: int() raises ValueError on anything else

u_age = int(input("Enter your age: "))
u_gender = input("Enter your gender: ").strip().lower()
#job values shown in the tables above are all lower-case - TODO confirm for the full table
u_job = input("Enter your profession: ").strip().lower()
u_genre = input("Enter preferred genre: ").strip().lower()
u_rating = int(input("Enter preferred rating: "))
u_tag = input("Enter preferred key word: ").strip().lower()

Enter your age: 17
Enter your gender: f
Enter your profession: student
Enter preferred genre: musical
Enter preferred rating: 4
Enter preferred key word: fun


In [18]:
#build a one-row frame with the same column names, in the same order, as the frame the
#encoder was fitted on, so each answer is matched to its feature by name instead of by its
#position in a bare list
user_row = pd.DataFrame(
    [{
        'genres': u_genre,
        'ratings': u_rating,
        'tag': u_tag,
        'age': u_age,
        'gender': u_gender,
        'job': u_job,
    }],
    columns=['genres', 'ratings', 'tag', 'age', 'gender', 'job'],
)
user_prediction = enc.transform(user_row).toarray()

#predict returns one label per input row; there is exactly one row here
movie_rec = final_model.predict(user_prediction)[0]

print(movie_rec)

Pulp Fiction (1994)
