The MovieLens dataset was collected by the GroupLens Research Project at the University of Minnesota.

In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD

In [3]:
df = pd.read_csv("ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
len(df)

25000095

In [5]:
df.dropna()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [6]:
a = pd.read_csv("stats_imp.csv")
a

Unnamed: 0,movieId,title,ratings_mean,ratings_count,genres
0,2072,"'burbs, The (1989)",3.114910,3320,Comedy
1,69757,(500) Days of Summer (2009),3.725344,11498,Comedy|Drama|Romance
2,8169,*batteries not included (1987),3.244816,1736,Children|Comedy|Fantasy|Sci-Fi
3,3420,...And Justice for All (1979),3.639286,1120,Drama|Thriller
4,152077,10 Cloverfield Lane (2016),3.714617,3660,Thriller
...,...,...,...,...,...
3794,57274,[REC] (2007),3.703164,1991,Drama|Horror|Thriller
3795,2600,eXistenZ (1999),3.349642,5580,Action|Sci-Fi|Thriller
3796,5507,xXx (2002),2.786182,5956,Action|Crime|Thriller
3797,33158,xXx: State of the Union (2005),2.337158,1133,Action|Crime|Thriller


In [7]:
movies = a[['movieId', 'title']]
movies

Unnamed: 0,movieId,title
0,2072,"'burbs, The (1989)"
1,69757,(500) Days of Summer (2009)
2,8169,*batteries not included (1987)
3,3420,...And Justice for All (1979)
4,152077,10 Cloverfield Lane (2016)
...,...,...
3794,57274,[REC] (2007)
3795,2600,eXistenZ (1999)
3796,5507,xXx (2002)
3797,33158,xXx: State of the Union (2005)


In [8]:
# movies = pd.read_csv("movies.csv")
# movies.head()

In [9]:
len(set(movies.movieId))

3799

In [10]:
combined_movies_data = pd.merge(df, movies, on = 'movieId')
combined_movies_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,3,296,5.0,1439474476,Pulp Fiction (1994)
2,4,296,4.0,1573938898,Pulp Fiction (1994)
3,5,296,4.0,830786155,Pulp Fiction (1994)
4,7,296,4.0,835444730,Pulp Fiction (1994)


In [11]:
len(combined_movies_data)

22138587

In [12]:
combined_movies_data.dropna()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,3,296,5.0,1439474476,Pulp Fiction (1994)
2,4,296,4.0,1573938898,Pulp Fiction (1994)
3,5,296,4.0,830786155,Pulp Fiction (1994)
4,7,296,4.0,835444730,Pulp Fiction (1994)
...,...,...,...,...,...
22138582,130233,192003,3.5,1533100072,Journey to the Center of the Earth (2008)
22138583,32063,124757,3.0,1487482173,Hostage (2005)
22138584,72315,124757,3.0,1535602355,Hostage (2005)
22138585,111664,124757,4.0,1514502635,Hostage (2005)


In [13]:
combined_movies_data.groupby("movieId")["rating"].count().sort_values(ascending= False).head(10)

movieId
356     81491
318     81482
296     79672
593     74127
2571    72674
260     68717
480     64144
527     60411
110     59184
2959    58773
Name: rating, dtype: int64

## Building a Utility Matrix

In [14]:
rating_crosstab = combined_movies_data.pivot_table(values = 'rating', index = 'userId', columns = 'title', fill_value=0)

In [18]:
rating_crosstab.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,3.5,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Transposing the Matrix

In [19]:
rating_crosstab.shape

(162539, 3790)

In [20]:
X = rating_crosstab.values.T
X.shape

(3790, 162539)

## Decomposing the Matrix

A well-known matrix factorization method is Singular value decomposition (SVD). Collaborative Filtering can be formulated by approximating a matrix X by using singular value decomposition.

In [21]:
SVD = TruncatedSVD(n_components=12, random_state=33)

resultatnt_matrix = SVD.fit_transform(X)
resultatnt_matrix.shape

(3790, 12)

## Generating a Correlation Matrix

In [22]:
corr_mat = np.corrcoef(resultatnt_matrix)
corr_mat.shape

(3790, 3790)

## Isolating Star Wars from the Correlation Matrix

In [39]:
movies_names = rating_crosstab.columns 
movies_list = list(movies_names)
movies_list

["'burbs, The (1989)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)',
 '...And Justice for All (1979)',
 '10 Cloverfield Lane (2016)',
 '10 Things I Hate About You (1999)',
 '10,000 BC (2008)',
 '101 Dalmatians (1996)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 '102 Dalmatians (2000)',
 '12 Angry Men (1957)',
 '12 Years a Slave (2013)',
 '127 Hours (2010)',
 '13 Going on 30 (2004)',
 '13th Warrior, The (1999)',
 '1408 (2007)',
 '15 Minutes (2001)',
 '16 Blocks (2006)',
 '17 Again (2009)',
 '1984 (Nineteen Eighty-Four) (1984)',
 '2 Days in the Valley (1996)',
 '2 Fast 2 Furious (Fast and the Furious 2, The) (2003)',
 '2 Guns (2013)',
 '20,000 Leagues Under the Sea (1954)',
 '200 Cigarettes (1999)',
 '2001: A Space Odyssey (1968)',
 '2010: The Year We Make Contact (1984)',
 '2012 (2009)',
 '2046 (2004)',
 '21 (2008)',
 '21 Grams (2003)',
 '21 Jump Street (2012)',
 '22 Jump Street (2014)',
 '24 Hour Party People (2002)',
 '25th Hour (2002)',
 '27 Dres

In [24]:
name = str(input())

Batman v Superman: Dawn of Justice (2016)


In [25]:
a = movies_list.index(name)
print(a)

316


In [29]:
corr_a = corr_mat[a]
corr_a.shape

(3790,)

## Recommending a Highly Correlated Movie 

In [38]:
b = pd.DataFrame(movies_names[(corr_a < 1.0) & (corr_a >0.95)][:10])
b

Unnamed: 0,title
0,22 Jump Street (2014)
1,300: Rise of an Empire (2014)
2,A Million Ways to Die in the West (2014)
3,Abraham Lincoln: Vampire Hunter (2012)
4,"Adventures of Tintin, The (2011)"
5,After Earth (2013)
6,"Amazing Spider-Man, The (2012)"
7,Ant-Man (2015)
8,Ant-Man and the Wasp (2018)
9,Atomic Blonde (2017)


In [52]:
def recommend(title):
    a = movies_list.index(title)
    corr_a = corr_mat[a]
    b = pd.DataFrame(movies_names[(corr_a < 1.0) & (corr_a >0.95)][:10])
    return b

In [57]:
recommend('Ant-Man (2015)')

Unnamed: 0,title
0,"Amazing Spider-Man, The (2012)"
1,Ant-Man and the Wasp (2018)
2,"Avengers, The (2012)"
3,Avengers: Age of Ultron (2015)
4,Avengers: Infinity War - Part I (2018)
5,Avengers: Infinity War - Part II (2019)
6,Batman v Superman: Dawn of Justice (2016)
7,Black Panther (2017)
8,Captain America: Civil War (2016)
9,Captain America: The First Avenger (2011)
