In [1]:
#Import dependencies
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import re

In [2]:
#Read movies data csv created in ETL into dataframe
movies_df=pd.read_csv("data/updated_movies.csv")

## Sampling data for model creation

In [3]:
#Removing all movies for which genre is Unknown
samples_movie_df=movies_df[movies_df["genres"]!="Unknown"]

In [4]:
#Checking data after Unknown Genre is removed
samples_movie_df.head()

Unnamed: 0,movieId,title,genres,avg_user_rating,year,comb
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708,1995,Adventure Animation Children Comedy Fantasy 3....
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527,1995,Adventure Children Fantasy 3.2515 1995
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028,1995,Comedy Romance 3.142 1995
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547,1995,Comedy Drama Romance 2.8535 1995
4,5,Father of the Bride Part II (1995),Comedy,3.058434,1995,Comedy 3.0584 1995


In [5]:
#Sampling data furter as we are looking to create a prediction matrix only for good movies below using conduitions:
#1. Rating at least 4 and year of release greater than or equal to 1980
#2. Rating at least 3.2 and year of release greater than or equal to 1995
samples_movie_df=samples_movie_df[((samples_movie_df['avg_user_rating']>=4)& 
                                   (samples_movie_df['year']>=1980)) | 
                                  ((samples_movie_df['avg_user_rating']>=3.2) & (samples_movie_df['year']>=1995))]


In [6]:
#Resting index
samples_movie_df.reset_index(inplace=True, drop = True)

In [7]:
samples_movie_df['avg_user_rating']=samples_movie_df['avg_user_rating'].round(2)

In [8]:
# Saving sample movies dataset to a csv file 
samples_movie_df.to_csv("data/sample_movies_gender_avg_prediction.csv")

## Checking data integrity

In [9]:
#Checking count of records
samples_movie_df.count()

movieId            17905
title              17905
genres             17905
avg_user_rating    17905
year               17905
comb               17905
dtype: int64

In [10]:
#checking count of data for each genre to understand distribution of data in each genres
samples_movie_df.groupby("genres").count()

Unnamed: 0_level_0,movieId,title,avg_user_rating,year,comb
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action,81,81,81,81,81
Action Adventure,15,15,15,15,15
Action Adventure Animation,12,12,12,12,12
Action Adventure Animation Children,3,3,3,3,3
Action Adventure Animation Children Comedy,6,6,6,6,6
...,...,...,...,...,...
Sci-Fi Western,1,1,1,1,1
Thriller,241,241,241,241,241
UnKnown,1276,1276,1276,1276,1276
War,12,12,12,12,12


## Create Feature column

In [11]:
#Combining genres and average user ratings together to create feature for building similarity matrix
samples_movie_df['feature']=samples_movie_df['genres']+' '+\
samples_movie_df['avg_user_rating'].map(str)

## Create Similarity matrix for recommendation

In [12]:
# Import sklearn libraries to create Count Vectorizer and cosine similarity matrix for prediction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
#Creating a count vector for feature column so that it can be used to create a similarity matrix between movies
cv = CountVectorizer()
count_matrix = cv.fit_transform(samples_movie_df['feature'])
#print(count_matrix)

In [14]:
#creating a similarity score matrix for movies within our dataset
sim = cosine_similarity(count_matrix)
print(sim)
#print(movies_df['comb'])

[[1.         0.61237244 0.         ... 0.         0.23570226 0.28867513]
 [0.61237244 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.28867513 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.23570226 0.         0.28867513 ... 0.         1.         0.40824829]
 [0.28867513 0.         0.         ... 0.         0.40824829 1.        ]]


### Movie Recommendation method

In [15]:
# Build a 1-dimensional array with movie titles
indices = pd.Series(samples_movie_df.index, index=samples_movie_df['title'])
print(indices)

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Heat (1995)                           2
Sabrina (1995)                        3
GoldenEye (1995)                      4
                                  ...  
Destination Titan (2011)          17900
Last Days of the Arctic (2011)    17901
Mao Zedong 1949 (2019)            17902
Santosh Subramaniam (2008)        17903
Bad Poems (2018)                  17904
Length: 17905, dtype: int64


In [16]:
# Function that get movie recommendations based on the cosine similarity score of movie genres and average rating combined
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(sim[idx]))
    #print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:21] 
    #print(sim_scores)
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    return samples_movie_df.iloc[movie_indices[1:11],[1,2,3]].values.tolist()

### Saving model for calling in flask app for movie recommendations on website

In [17]:
#Import pickle library
import pickle

In [18]:
#Exporting similariy scores into a .pickl file for use in front-end
pickle.dump(sim,open("genrepredictor-similaritymatrix.pckl","wb"))

In [19]:
#Exporting dataframe into a .pickl file for use in front-end
pickle.dump(samples_movie_df,open("movie_df.pckl","wb"))

In [20]:
#Running model to predict score based on movie
str(genre_recommendations('Titanic (1997)'))

[437, 666, 1144, 1183, 1759, 4184, 4406, 5762, 7377, 8775, 8862, 9206, 12192, 13034, 13845, 14132, 16775, 2487, 2688, 4013, 4836]


'[[\'Oscar and Lucinda (a.k.a. Oscar & Lucinda) (1997)\', \'Drama Romance\', 3.38], [\'Moonlight Mile (2002)\', \'Drama Romance\', 3.38], [\'Crime of Father Amaro, The (Crimen del padre Amaro, El) (2002)\', \'Drama Romance\', 3.38], [\'Suzhou River (Suzhou he) (2000)\', \'Drama Romance\', 3.38], [\'Water for Elephants (2011)\', \'Drama Romance\', 3.38], ["Someone I Loved (Je l\'aimais) (2009)", \'Drama Romance\', 3.38], [\'Saint Laurent (2014)\', \'Drama Romance\', 3.38], [\'A Little Chaos (2014)\', \'Drama Romance\', 3.38], [\'Lootera (2013)\', \'Drama Romance\', 3.38], [\'Freeheld (2015)\', \'Drama Romance\', 3.38]]'