In [0]:
# SVD - Singular Value Decomposition
# Matrix factorization with SVD

#Importing Libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

In [2]:
# Preparing the data: load the raw ratings file, one tab-separated
# (user_id, item_id, rating, timestamp) record per line.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
frame = pd.read_csv(
    'u.data',
    sep='\t',
    header=None,   # the file has no header row; same behaviour as passing names alone
    names=columns,
)
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# Movie attributes data: descriptive fields first, then one indicator column per genre.
movie_info_columns = [
    'item_id', 'movie_title', 'release_date', 'video release date', 'IMDb URL',
]
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
columns1 = movie_info_columns + genre_columns

# u.item is pipe-delimited and latin-1 encoded.
movies = pd.read_csv('u.item', sep='|', names=columns1, encoding='latin-1')
movies.head()

Unnamed: 0,item_id,movie_title,release_date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Keep only the item_id -> movie_title lookup columns of the movie attributes table.
title_columns = ['item_id', 'movie_title']
movie_names = movies.loc[:, title_columns].copy()
movie_names.head()

Unnamed: 0,item_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [8]:
# Combine the two dataframes: attach the movie title to every rating row
# via an inner join on item_id, so each review carries the title it refers to.
combined_movies_data = frame.merge(movie_names, on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [9]:
# Find which movie had the most reviews: count ratings per item_id,
# then show the five largest counts.
ratings_per_movie = combined_movies_data.groupby('item_id')['rating'].count()
ratings_per_movie.sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [10]:
# Look up the title of the movie with item_id == 50 (the most-reviewed one).
is_item_50 = movie_names['item_id'] == 50
movie_names[is_item_50]

Unnamed: 0,item_id,movie_title
49,50,Star Wars (1977)


## **Building a Utility Matrix**

In [12]:
# Utility matrix holding a value for each user and each movie:
# one row per user_id, one column per movie_title, cells are ratings.
# pivot_table aggregates with its default (mean), so multiple ratings that land
# in the same (user, title) cell are averaged.
ratings_crosstab = combined_movies_data.pivot_table(
    index='user_id',
    columns='movie_title',
    values='rating',
    fill_value=0,   # user/movie pairs with no rating are stored as 0
)
ratings_crosstab.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


# **Transposing the Utility Matrix**

In [13]:
# (number of users, number of movie titles): rows are user_id, columns are movie_title
ratings_crosstab.shape

(943, 1664)

In [14]:
# Flip the utility matrix to movies x users, so each row of X is one
# movie's vector of ratings across all users.
X = np.transpose(ratings_crosstab.values)
X.shape

(1664, 943)

# **Decomposing the Matrix**

In [15]:
# Compress each row of X (one movie) down to 12 latent components.
SVD = TruncatedSVD(
    n_components=12,
    random_state=17,   # fixed seed so the decomposition is repeatable
)

resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(1664, 12)

# **Generating a Correlation Matrix**

In [16]:
# Pearson correlation between every pair of rows of the reduced matrix
# (rowvar=True: each row is treated as one variable, i.e. one movie).
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)
corr_mat.shape

(1664, 1664)

# **Isolating the most popular movie, Star Wars**

In [19]:
# Build the movie-title index from the columns of ratings_crosstab.
# X was formed by transposing that table, so a title's position in this
# index is also its row position in X and in the correlation matrix.
movies_names = ratings_crosstab.columns
movies_list = movies_names.tolist()

# Position of Star Wars among the titles.
star_wars = movies_list.index('Star Wars (1977)')
print(star_wars)

1398


In [22]:
# Row of the correlation matrix for Star Wars: its correlation with every movie.
corr_star_wars = corr_mat[star_wars, :]
corr_star_wars.shape

(1664,)

# **Recommending movies highly correlated with Star Wars**

In [24]:
# Movies whose latent-factor correlation with Star Wars exceeds 0.9.
# Star Wars itself is removed by position rather than with `corr < 1`:
# np.corrcoef does not guarantee that the diagonal is exactly 1 (floating-point
# rounding), so a value test could leak Star Wars into its own recommendations,
# and would also wrongly drop any other movie whose correlation is exactly 1.
similar_mask = corr_star_wars > 0.9
similar_mask[star_wars] = False   # the comparison produced a new array; corr_star_wars is untouched
list(movies_names[similar_mask])

['Die Hard (1988)',
 'Empire Strikes Back, The (1980)',
 'Fugitive, The (1993)',
 'Raiders of the Lost Ark (1981)',
 'Return of the Jedi (1983)',
 'Terminator 2: Judgment Day (1991)',
 'Terminator, The (1984)',
 'Toy Story (1995)']

In [28]:
# Tighter cut-off: movies whose correlation with Star Wars exceeds 0.98.
# Star Wars itself is removed by position rather than with `corr < 1`:
# np.corrcoef does not guarantee that the diagonal is exactly 1 (floating-point
# rounding), so a value test could leak Star Wars into its own recommendations,
# and would also wrongly drop any other movie whose correlation is exactly 1.
closest_mask = corr_star_wars > 0.98
closest_mask[star_wars] = False   # the comparison produced a new array; corr_star_wars is untouched
list(movies_names[closest_mask])

['Return of the Jedi (1983)']

# **So, in essence, machine learning tells me that if I liked the first Star Wars, I will very likely enjoy Return of the Jedi!**