## Importing neccesary libraries

In [1]:
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
movies_df = pd.read_csv('ml-latest/movies.csv')
ratings_df = pd.read_csv('ml-latest/ratings.csv')

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Store year of movie into new column name as year

In [6]:
# For specifying the parenthesis
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))', expand=False) 

# Removing the parenthesis
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)', expand=False) 

# Removing year from the title column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')

# Apply Strip() to make sure every character end with white spaces
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

movies_df.head()

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [9]:
# converting genre into list
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [10]:
movieWithGenres_df = movies_df.copy()

In [11]:
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        movieWithGenres_df.at[index, genre] = 1

# Filling 0 in the place of NaN values.
movieWithGenres_df = movieWithGenres_df.fillna(0)
movieWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Ratings

In [12]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [13]:
# Removing unnecessary Column
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


# Recommendation System

In [14]:
userInput = [
        {'title':'Carrington', 'rating':4},
        {'title':'Heat', 'rating':5},
        {'title':'Nixon', 'rating':4.5},
        {'title':'Powder', 'rating':3},
        {'title':'Screamers', 'rating':5},
        {'title':"Things to Do in Denver When You're Dead", 'rating':4.5},
        {'title':'Dunston Checks In', 'rating':3.5},
        {'title':'Catwalk', 'rating':2},
    
    ]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Carrington,4.0
1,Heat,5.0
2,Nixon,4.5
3,Powder,3.0
4,Screamers,5.0
5,Things to Do in Denver When You're Dead,4.5
6,Dunston Checks In,3.5
7,Catwalk,2.0


In [16]:
# Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Merging after then we will get movieId.
inputMovies = pd.merge(inputId, inputMovies)

# Removing unneccessary column
inputMovies = inputMovies.drop('genres', 1).drop('year', 1)

inputMovies

Unnamed: 0,movieId,title,rating
0,6,Heat,5.0
1,73608,Heat,5.0
2,131274,Heat,5.0
3,14,Nixon,4.5
4,24,Powder,3.0
5,35,Carrington,4.0
6,76,Screamers,5.0
7,119832,Screamers,5.0
8,81,Things to Do in Denver When You're Dead,4.5
9,87,Dunston Checks In,3.5


In [18]:
# Filtering out the movies from user input
userMovies = movieWithGenres_df[movieWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
5,6,Heat,"[Action, Crime, Thriller]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,14,Nixon,[Drama],1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,24,Powder,"[Drama, Sci-Fi]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,35,Carrington,"[Drama, Romance]",1995,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,76,Screamers,"[Action, Sci-Fi, Thriller]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,81,Things to Do in Denver When You're Dead,"[Crime, Drama, Romance]",1995,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,87,Dunston Checks In,"[Children, Comedy]",1996,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,108,Catwalk,[Documentary],1996,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
14752,73608,Heat,"[Comedy, Drama, Romance]",1972,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25729,119832,Screamers,"[Action, Horror]",1979,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)

# Dropping unnecessary attributes
userGenreTable = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)

userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
inputMovies['rating']

0     5.0
1     5.0
2     5.0
3     4.5
4     3.0
5     4.0
6     5.0
7     5.0
8     4.5
9     3.5
10    2.0
Name: rating, dtype: float64

In [21]:
# Now applying dot product to get weights, the maximum weight will be recommends to the user
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])

userProfile

Adventure              0.0
Animation              0.0
Children               5.0
Comedy                 9.5
Fantasy                0.0
Romance               13.0
Drama                 25.0
Action                13.5
Crime                  9.0
Thriller              10.0
Horror                 3.5
Mystery                0.0
Sci-Fi                 8.0
IMAX                   0.0
Documentary            5.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [23]:
# gets the genre of every movies from out main movie dataset
genreTable = movieWithGenres_df.set_index(movieWithGenres_df['movieId'])

genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)

genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
genreTable.shape

(34208, 20)

In [25]:
#  Multiplying the genre by the weights and take weightage average
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.142857
2    0.049261
3    0.221675
4    0.467980
5    0.093596
dtype: float64

In [26]:
# Sorting ours recommendations into decreasing order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
recommendationTable_df.head()

movieId
127341    0.788177
4719      0.788177
76153     0.788177
75408     0.788177
150268    0.738916
dtype: float64

## This is final recommendation *movies*

In [27]:

movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(30).keys())]

Unnamed: 0,movieId,title,genres,year
19,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995
143,145,Bad Boys,"[Action, Comedy, Crime, Drama, Thriller]",1995
455,459,"Getaway, The","[Action, Adventure, Crime, Drama, Romance, Thr...",1994
1398,1432,Metro,"[Action, Comedy, Crime, Drama, Thriller]",1997
4625,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
4774,4869,Burnt Money (Plata Quemada),"[Action, Crime, Drama, Romance, Thriller]",2000
4861,4956,"Stunt Man, The","[Action, Adventure, Comedy, Drama, Romance, Th...",1980
4932,5027,Another 48 Hrs.,"[Action, Comedy, Crime, Drama, Thriller]",1990
5530,5628,Wasabi,"[Action, Comedy, Crime, Drama, Thriller]",2001
7124,7235,Ichi the Killer (Koroshiya 1),"[Action, Comedy, Crime, Drama, Horror, Thriller]",2001
