In [1]:
#Import dependencies
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import re

In [2]:
#Read movies data csv created in ETL into dataframe
movies_df=pd.read_csv("data/updated_movies.csv")

## Sampling data for model creation

In [3]:
#Removing all movies for which genre is Unknown
samples_movie_df=movies_df[movies_df["genres"]!="UnKnown"]

In [4]:
samples_movie_df[samples_movie_df["genres"]=="UnKnown"]

Unnamed: 0,movieId,title,genres,avg_user_rating,year,comb


In [5]:
#Checking data after Unknown Genre is removed
samples_movie_df.head()

Unnamed: 0,movieId,title,genres,avg_user_rating,year,comb
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708,1995,Adventure Animation Children Comedy Fantasy 3....
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527,1995,Adventure Children Fantasy 3.2515 1995
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028,1995,Comedy Romance 3.142 1995
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547,1995,Comedy Drama Romance 2.8535 1995
4,5,Father of the Bride Part II (1995),Comedy,3.058434,1995,Comedy 3.0584 1995


In [6]:
#Sampling data furter as we are looking to create a prediction matrix only for good movies below using conduitions:
#1. Rating at least 4 and year of release greater than or equal to 1980
#2. Rating at least 3.2 and year of release greater than or equal to 1995
samples_movie_df=samples_movie_df[((samples_movie_df['avg_user_rating']>=4)& 
                                   (samples_movie_df['year']>=1980)) | 
                                  ((samples_movie_df['avg_user_rating']>=3.2) & (samples_movie_df['year']>=1995))]


In [7]:
#Resting index
samples_movie_df.reset_index(inplace=True, drop = True)

In [8]:
samples_movie_df['avg_user_rating']=samples_movie_df['avg_user_rating'].round(2)

In [9]:
#Check if movies are present multiple time
samples_movie_df[samples_movie_df.duplicated(['title'])]

Unnamed: 0,movieId,title,genres,avg_user_rating,year,comb
1702,26982,Men with Guns (1997),Drama,3.66,1997,Drama 3.6618 1997
3163,65665,Hamlet (2000),Drama,3.21,2000,Drama 3.2121 2000
7139,128862,Casanova (2005),Comedy Drama Romance,4.13,2005,Comedy Drama Romance 4.1316 2005
8768,144450,An Inspector Calls (2015),Action Comedy,3.5,2015,Action Comedy 3.5 2015
8806,144606,Confessions of a Dangerous Mind (2002),Comedy Crime Drama Romance Thriller,3.47,2002,Comedy Crime Drama Romance Thriller 3.4749 2002
9685,154943,Stranded (2015),Comedy,3.33,2015,Comedy 3.3333 2015
10749,164568,Interrogation (2016),Action Thriller,3.69,2016,Action Thriller 3.6875 2016
11407,168866,Free Fall (2014),Action Drama Thriller,4.5,2014,Action Drama Thriller 4.5 2014
11535,169530,Sing (2016),Drama,3.83,2016,Drama 3.8333 2016
13033,180029,Rose (2011),Action Animation Sci-Fi,3.33,2011,Action Animation Sci-Fi 3.3333 2011


In [10]:
#Dropping all duplicate records for a movie based on movie title
samples_movie_df.drop_duplicates(subset ="title", 
                     keep = False, inplace = True) 
  

In [11]:
#Check if movies are present multiple time after dropping duplicate records
samples_movie_df[samples_movie_df.duplicated(['title'])]

Unnamed: 0,movieId,title,genres,avg_user_rating,year,comb


In [12]:
#Resting index
samples_movie_df.reset_index(inplace=True, drop = True)

In [13]:
# Saving sample movies dataset to a csv file 
samples_movie_df.to_csv("data/sample_movies_gender_avg_prediction.csv")

## Checking data integrity

In [14]:
#Checking count of records
samples_movie_df.count()

movieId            16597
title              16597
genres             16597
avg_user_rating    16597
year               16597
comb               16597
dtype: int64

In [15]:
#checking count of data for each genre to understand distribution of data in each genres
samples_movie_df.groupby("genres").count()

Unnamed: 0_level_0,movieId,title,avg_user_rating,year,comb
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action,81,81,81,81,81
Action Adventure,15,15,15,15,15
Action Adventure Animation,12,12,12,12,12
Action Adventure Animation Children,3,3,3,3,3
Action Adventure Animation Children Comedy,6,6,6,6,6
...,...,...,...,...,...
Sci-Fi War,1,1,1,1,1
Sci-Fi Western,1,1,1,1,1
Thriller,241,241,241,241,241
War,12,12,12,12,12


## Create Feature column for predicting genre and rating together

In [16]:
#Combining genres and average user ratings together to create feature for building similarity matrix
samples_movie_df['feature']=samples_movie_df['genres']+' '+\
samples_movie_df['avg_user_rating'].map(str)

## Create Similarity matrix for recommendation

In [17]:
# Import sklearn libraries to create Count Vectorizer and cosine similarity matrix for prediction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
#Creating a count vector for feature column so that it can be used to create a similarity matrix between movies
cv = CountVectorizer()
count_matrix = cv.fit_transform(samples_movie_df['feature'])
#print(count_matrix)

In [19]:
#creating a similarity score matrix for movies within our dataset
sim = cosine_similarity(count_matrix)
print(sim)
#print(movies_df['comb'])

[[1.         0.61237244 0.         ... 0.         0.23570226 0.28867513]
 [0.61237244 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.28867513 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.23570226 0.         0.28867513 ... 0.         1.         0.40824829]
 [0.28867513 0.         0.         ... 0.         0.40824829 1.        ]]


### Movie Recommendation method

In [20]:
# Build a 1-dimensional array with movie titles
indices = pd.Series(samples_movie_df.index, index=samples_movie_df['title'])
print(indices)

title
Toy Story (1995)                                               0
Jumanji (1995)                                                 1
Heat (1995)                                                    2
Sabrina (1995)                                                 3
GoldenEye (1995)                                               4
                                                           ...  
Square Roots: The Story of SpongeBob SquarePants (2009)    16592
Destination Titan (2011)                                   16593
Last Days of the Arctic (2011)                             16594
Santosh Subramaniam (2008)                                 16595
Bad Poems (2018)                                           16596
Length: 16597, dtype: int64


In [21]:
# Function that get movie recommendations based on the cosine similarity score of movie genres and average rating combined
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(sim[idx]))
    #print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:21] 
    #print(sim_scores)
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    return samples_movie_df.iloc[movie_indices[1:11],[1,2,3]].values.tolist()

### Saving model for calling in flask app for movie recommendations on website

In [22]:
#Import pickle library
import pickle

In [23]:
#Exporting similariy scores into a .pickl file for use in front-end
pickle.dump(sim,open("genre_rating_predictor-similaritymatrix.pckl","wb"))

In [24]:
#Exporting dataframe into a .pickl file for use in front-end
pickle.dump(samples_movie_df,open("movie_df.pckl","wb"))

In [25]:
#Running model to predict score based on movie
str(genre_recommendations('The Godfather Family: A Look Inside (1990)'))

[14012, 11223, 12139, 8141, 8551, 1732, 2542, 3103, 5033, 5127, 5159, 6195, 6684, 6715, 6734, 6896, 7008, 7336, 7398, 7485, 7668]


'[[\'The Beckoning Silence (2007)\', \'Action Documentary Drama\', 3.5], [\'McLaren (2016)\', \'Action Documentary Drama\', 3.5], [\'Cartel Land (2015)\', \'Action Documentary Drama\', 3.61], [\'A Brokedown Melody (2004)\', \'Action Adventure Documentary Drama\', 3.5], [\'After the Rain (Ame agaru) (1999) \', \'Action Drama\', 3.8], [\'God Grew Tired of Us (2006)\', \'Documentary Drama\', 3.9], [\'Man Named Pearl, A (2006)\', \'Documentary\', 4.19], ["Rebellion (L\'ordre et la morale) (2011)", \'Action Drama\', 3.7], [\'Human Planet (2011)\', \'Documentary\', 4.19], [\'Ballplayer: Pelotero (2011)\', \'Documentary Drama\', 3.4]]'

## Second model considering only Genre and not rating

In [26]:
#Creating a count vector for feature column so that it can be used to create a similarity matrix between movies
cv = CountVectorizer()
count_matrix = cv.fit_transform(samples_movie_df['genres'])
#print(count_matrix)

In [27]:
#creating a similarity score matrix for movies within our dataset
sim = cosine_similarity(count_matrix)
print(sim)
#print(movies_df['comb'])

[[1.         0.77459667 0.         ... 0.         0.25819889 0.31622777]
 [0.77459667 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.33333333 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.25819889 0.         0.33333333 ... 0.         1.         0.40824829]
 [0.31622777 0.         0.         ... 0.         0.40824829 1.        ]]


#### Saving matrices for prediction using only Genre

In [28]:
#creating a similarity score matrix for movies within our dataset
pickle.dump(sim,open("genrepredictor-similaritymatrix.pckl","wb"))

In [29]:
#Running model to predict score based on movie
str(genre_recommendations('The Godfather Family: A Look Inside (1990)'))

[14012, 11223, 12139, 8141, 8551, 1732, 2542, 3103, 5033, 5127, 5159, 6195, 6684, 6715, 6734, 6896, 7008, 7336, 7398, 7485, 7668]


'[[\'The Beckoning Silence (2007)\', \'Action Documentary Drama\', 3.5], [\'McLaren (2016)\', \'Action Documentary Drama\', 3.5], [\'Cartel Land (2015)\', \'Action Documentary Drama\', 3.61], [\'A Brokedown Melody (2004)\', \'Action Adventure Documentary Drama\', 3.5], [\'After the Rain (Ame agaru) (1999) \', \'Action Drama\', 3.8], [\'God Grew Tired of Us (2006)\', \'Documentary Drama\', 3.9], [\'Man Named Pearl, A (2006)\', \'Documentary\', 4.19], ["Rebellion (L\'ordre et la morale) (2011)", \'Action Drama\', 3.7], [\'Human Planet (2011)\', \'Documentary\', 4.19], [\'Ballplayer: Pelotero (2011)\', \'Documentary Drama\', 3.4]]'

In [30]:
sim[12139][11223]

1.0000000000000002