## Model-Based Collaborative Filtering System

In [1]:
import numpy as np
import pandas as pd

In [2]:
import sklearn
from sklearn.decomposition import TruncatedSVD

The MovieLens dataset was collected by the GroupLens Research Project at the University of Minnesota. You can download the dataset for this demonstration at the following URL: https://grouplens.org/datasets/movielens/100k/

**Data Collection and Preparation** 

In [20]:
# Load the user/item rating records; column names are supplied explicitly
# through `names` and the fields are tab-separated.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=columns,
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [21]:
# Load the item (movie) attribute table: identifying fields first,
# then one column per genre label.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Fields are pipe-separated and the file is decoded as latin-1.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=columns,
    encoding='latin-1',
)
movies.head()

Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
# Keep only the two columns needed to map an item_id to its title.
movie_names = movies.loc[:, ['item_id', 'movie title']]
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [35]:
# Count the distinct movie titles in the dataset. This counts unique title
# strings, not item_ids, so items sharing a title are counted once.
print(movie_names['movie title'].nunique(dropna=False))

1664


In [24]:
# Attach the movie title to every rating row by joining on item_id
# (merge defaults to an inner join).
combined = df.merge(movie_names, on='item_id')
combined.tail()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."
99999,655,1641,3,887427810,Dadetown (1995)


**Group Data**

In [26]:
# Number of ratings per item, most-rated first; head() shows the top five.
(
    combined
    .groupby('item_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [31]:
# Print the titles of the five items with the highest rating count.
# The item ids are derived from the data instead of being hard-coded, so the
# cell stays correct if the input data changes, and a single loop replaces
# five copy-pasted print statements.
top_item_ids = (
    combined
    .groupby('item_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
    .index
)

for item_id in top_item_ids:
    # .unique() collapses the many rating rows of one item to its title(s).
    print(combined.loc[combined['item_id'] == item_id, 'movie title'].unique())

['Star Wars (1977)']
['Contact (1997)']
['Fargo (1996)']
['Return of the Jedi (1983)']
['Liar Liar (1997)']


**Build Utility Matrix using Crosstabulation**

In [38]:
# Build the user x movie-title utility matrix. Cells with no rating are
# filled with 0; if several rows fall into the same (user, title) cell they
# are averaged (pivot_table's default aggregation, spelled out here).
rating_crosstab = combined.pivot_table(
    index='user_id',
    columns='movie title',
    values='rating',
    aggfunc='mean',
    fill_value=0,
)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [39]:
# Sanity-check the utility matrix dimensions: (users, distinct movie titles).
rating_crosstab.shape

(943, 1664)

**Transpose Dataframe**

In [42]:
# Flip the utility matrix so that rows are movies and columns are users.
# Despite its name, the result is a plain NumPy array, not a DataFrame.
df_transposed = rating_crosstab.values.transpose()
df_transposed.shape

(1664, 943)

**Decompose Matrix**

In [44]:
# Compress each movie's row of user ratings into 12 latent components.
# random_state is pinned so repeated runs give the same decomposition.
svd = TruncatedSVD(
    n_components=12,
    n_iter=10,
    random_state=10,
)
decomposed_matrix = svd.fit_transform(df_transposed)
decomposed_matrix.shape

(1664, 12)

**Generate Correlation Matrix**

In [46]:
# Pearson correlation between every pair of rows of decomposed_matrix
# (np.corrcoef treats each row as one variable by default), which yields a
# square movie-by-movie similarity matrix.
corr_mat = np.corrcoef(decomposed_matrix)
corr_mat.shape

(1664, 1664)

**Recommend Movies Closely Related to Star Wars**

In [56]:
# Locate the positional index of Star Wars among the utility-matrix columns.
# Note: `movie_names` is rebound here to the column Index of titles; from this
# point on it no longer refers to the item_id/title DataFrame built earlier.
movie_names = rating_crosstab.columns
movies_list = movie_names.tolist()

star_wars = movies_list.index('Star Wars (1977)')
star_wars

1398

In [57]:
# Correlations of Star Wars with every movie. Index with the looked-up
# position `star_wars` rather than a hard-coded row number, so the cell stays
# correct if the column order of the utility matrix ever changes.
corr_star_wars = corr_mat[star_wars]
corr_star_wars.shape

(1664,)

In [59]:
# List movies whose correlation with Star Wars is above 0.9 and below 1.0.
is_similar = (corr_star_wars > 0.9) & (corr_star_wars < 1.0)

# Exclude the query movie explicitly by position: relying on its
# self-correlation being exactly 1.0 is fragile under floating-point rounding
# (a value a hair below 1.0 would let Star Wars recommend itself).
is_similar[star_wars] = False

list(movie_names[is_similar])

['Die Hard (1988)',
 'Empire Strikes Back, The (1980)',
 'Fugitive, The (1993)',
 'Raiders of the Lost Ark (1981)',
 'Return of the Jedi (1983)',
 'Terminator 2: Judgment Day (1991)',
 'Terminator, The (1984)',
 'Toy Story (1995)']