# Movie Recommendation System

In [60]:
import pandas as pd
import numpy as np
import sklearn

In [27]:
data = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id' , 'rating', 'timestamp'])
data.head(3)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [28]:
user = pd.read_csv('ml-100k/u.user', sep='|', names=['user_id', 'age',  'gender',  'occupation', 'zip_code'])
user.head(3)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [29]:
item = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', names=['movie_id', 'movie_title', 'release_date', 'video_release_date',
              'IMDb_URL', 'unknown',  'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
item.head(3)

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
item.columns

Index(['movie_id', 'movie_title', 'release_date', 'video_release_date',
       'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

## EDA

- How many users/ movies are there ?
- R: Find the number of users rated any single movie
- v: Find the average rating of a movie
- C: Find the average overall rating

In [23]:
data.shape

(100000, 4)

In [25]:
data['item_id'].nunique()

1682

In [26]:
data['user_id'].nunique()

943

### finding V(avg) of item_id

In [34]:
v = data.groupby('item_id')['rating'].agg('mean')

In [36]:
v 

# V contains Means of rating of movies

item_id
1       3.878319
2       3.206107
3       3.033333
4       3.550239
5       3.302326
          ...   
1678    1.000000
1679    3.000000
1680    2.000000
1681    3.000000
1682    3.000000
Name: rating, Length: 1682, dtype: float64

### finding R

In [38]:
g = data.groupby('item_id')['rating']
R = g.count()

In [39]:
R

item_id
1       452
2       131
3        90
4       209
5        86
       ... 
1678      1
1679      1
1680      1
1681      1
1682      1
Name: rating, Length: 1682, dtype: int64

### Overall rating

In [41]:
C = data['rating'].mean()

In [44]:
m = 100

In [45]:
weighted_ratings = pd.DataFrame()
W = (R*v+ C*m)/(v+m)

weighted_ratings['item_id'] = W.index
weighted_ratings['W'] = W. values

weighted_ratings.head()

Unnamed: 0,item_id,W
0,1,20.273586
1,2,7.489731
2,3,6.075568
3,4,10.574442
4,5,6.166231


### from item_id we can extract the movien names

In [53]:
imdb = weighted_ratings.merge(item, left_on='item_id', right_on='movie_id')
imdb

Unnamed: 0,item_id,W,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,20.273586,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,7.489731,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,...,0,0,0,0,0,0,0,1,0,0
2,3,6.075568,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,10.574442,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,6.166231,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,3.504812,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,3.456175,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,3.480255,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,3.456175,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
imdb.sort_values(by='W', ascending=False)

Unnamed: 0,item_id,W,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
49,50,27.731198,50,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,1,...,0,0,0,0,0,1,1,0,1,0
99,100,23.656799,100,Fargo (1996),14-Feb-1997,,http://us.imdb.com/M/title-exact?Fargo%20(1996),0,0,0,...,0,0,0,0,0,0,0,1,0,0
180,181,22.930818,181,Return of the Jedi (1983),14-Mar-1997,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,1,1,...,0,0,0,0,0,1,1,0,1,0
257,258,22.051137,258,Contact (1997),11-Jul-1997,,http://us.imdb.com/Title?Contact+(1997/I),0,0,0,...,0,0,0,0,0,0,1,0,0,0
173,174,20.517383,174,Raiders of the Lost Ark (1981),01-Jan-1981,,http://us.imdb.com/M/title-exact?Raiders%20of%...,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,1653,3.409390,1653,Entertaining Angels: The Dorothy Day Story (1996),27-Sep-1996,,http://us.imdb.com/M/title-exact?Entertaining%...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
813,814,3.409390,814,"Great Day in Harlem, A (1994)",01-Jan-1994,,http://us.imdb.com/M/title-exact?Great%20Day%2...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1200,1201,3.409390,1201,Marlene Dietrich: Shadow and Light (1996),02-Apr-1996,,http://us.imdb.com/M/title-exact?Marlene%20Die...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1121,1122,3.409390,1122,They Made Me a Criminal (1939),01-Jan-1939,,http://us.imdb.com/M/title-exact?They%20Made%2...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Content Based Filtering

### Co-sine Similarity

In [56]:
v1 = np.array([[1,2,3]])
v2 = np.array([[0,1,2]])
v3 = np.array([[-1,-2,-3]])

In [62]:
# Cosine similarity

cosine_simil = (np.dot(v1,v2.T))/(np.sqrt(np.sum(v1**2))*np.sqrt(np.sum(v2**2)))
cosine_simil

array([[0.95618289]])

In [63]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(v1,v2)

array([[0.95618289]])

### Applying Cosine similarity on Movie Genre

In [66]:
movie_content = item.loc[:,"unknown":]
movie_content.head(3)

Unnamed: 0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [69]:
sim = pd.DataFrame(cosine_similarity(movie_content, movie_content))

In [70]:
#provide some column index 
#using this column index we have to filter the dataframe
#sort the values
#take the top k similar movies

In [88]:
column_idx = 419 #98
k = 10

indices = sim.iloc[:,column_idx].drop(column_idx).sort_values(ascending=False)[:k].index
print(item.iloc[column_idx]['movie_title'])
item.iloc[indices]

Alice in Wonderland (1951)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
623,624,"Three Caballeros, The (1945)",01-Jan-1945,,http://us.imdb.com/M/title-exact?Three%20Cabal...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
417,418,Cinderella (1950),01-Jan-1950,,http://us.imdb.com/M/title-exact?Cinderella%20...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
595,596,"Hunchback of Notre Dame, The (1996)",21-Jun-1996,,http://us.imdb.com/M/title-exact?Hunchback%20o...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
587,588,Beauty and the Beast (1991),01-Jan-1991,,http://us.imdb.com/M/title-exact?Beauty%20and%...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
70,71,"Lion King, The (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Lion%20King,%...",0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
431,432,Fantasia (1940),01-Jan-1940,,http://us.imdb.com/M/title-exact?Fantasia%20(1...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
472,473,James and the Giant Peach (1996),12-Apr-1996,,http://us.imdb.com/M/title-exact?James%20and%2...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
102,103,All Dogs Go to Heaven 2 (1996),29-Mar-1996,,http://us.imdb.com/M/title-exact?All%20Dogs%20...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
988,989,Cats Don't Dance (1997),26-Mar-1997,,http://us.imdb.com/M/title-exact?Cats%20Don%27...,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0


# Item Collabrating Filtering

In [97]:
data = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id' , 'rating', 'timestamp'])
data.head(3)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [99]:
# Using pivot to create item/user matrix

data.pivot(index='item_id',columns='user_id', values='rating', fill_value=0)

TypeError: DataFrame.pivot() got an unexpected keyword argument 'fill_value'