# EDA

In [1]:
import pandas as pd

import numpy as np
from sklearn.decomposition import NMF

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')

In [3]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [4]:
links.head()
# imdbId is an identifier for movies used by http://www.imdb.com.
# tmdbId is an identifier for movies used by https://www.themoviedb.org.

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')

In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies_df['genres'].nunique()

951

In [9]:
genres_list = movies_df['genres'].value_counts() > 10

In [10]:
genres_list

Drama                                                    True
Comedy                                                   True
Comedy|Drama                                             True
Comedy|Romance                                           True
Drama|Romance                                            True
                                                        ...  
Crime|Horror|Sci-Fi                                     False
Comedy|Crime|Horror|Mystery|Thriller                    False
Action|Animation|Comedy|Crime|Drama|Romance|Thriller    False
Adventure|Fantasy|Romance|Sci-Fi|Thriller               False
Adventure|Animation|Children|Sci-Fi|IMAX                False
Name: genres, Length: 951, dtype: bool

In [11]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [12]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
tags = pd.read_csv('ml-latest-small/tags.csv')

In [15]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [16]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Implement a simple recommender

* calculate an ordered top list (with regards to the average rating) of movies for your dataset
* recommend top movies that a user has not seen yet: Implement a function recommend(user_id, top_list, ratings, k) that gets a user_id, the top list of movies and the ratings table and returns a list of k movie_ids as recommondations.

In [17]:
ratings = ratings.set_index('movieId')
movies_df = movies_df.set_index('movieId')

In [18]:
df = ratings.join(movies_df, on='movieId')

In [19]:
df.head()

Unnamed: 0_level_0,userId,rating,timestamp,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
6,1,4.0,964982224,Heat (1995),Action|Crime|Thriller
47,1,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
50,1,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [20]:
df.shape

(100836, 5)

# NMF


Adapt the following code to the MovieLens dataset:

### First, attempt on test data Movies.csv, done in class

In [21]:
df1 = pd.read_csv('Movies.csv')

In [22]:
df1.shape

(121, 3)

In [23]:
df1.head()

Unnamed: 0,name,movie,rating (1-5)
0,Kristian,The Big Lebowski,3
1,Max,The Empire strikes back,5
2,Max,The Big Lebowksi,5
3,Kristian,Memento,4
4,Sara,Green Mile,5


In [24]:
df1['name'].nunique()

12

In [25]:
df1['movie'].nunique()

62

In [26]:
matrix = df1.pivot(index='name', columns='movie', values='rating (1-5)')
matrix.head()

movie,A Million ways to die in the west,American Beauty,Blade Runner,Blade Runner 2049,Blues Brothers,Butterfly Effect,Cats,Diva,Django Unchained,"Dude, where's my car?",...,The Empire strikes back,The Farewell,The Girl With The Dragon Tattoo,The Notebook,The Phantom Menace,The Seventh Seal,The Sixth Sense,The Theory of Everything,Titanic,What happened to Monday
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anastasia,,,,1.0,,,,4.0,,,...,,,,,,,,,,
Basti,,,,,,,,,,,...,,,,,,,,,,
Braulio,,4.0,5.0,4.0,,4.0,,,4.0,2.0,...,,,,,,,,,,
Carina,,,,,,,,,,,...,,,,,,,,,,
Felix,,5.0,,,,,,,4.0,,...,,,,,,,,,,


In [27]:
matrix.shape

(12, 62)

In [28]:
matrix = matrix.fillna(2.5)
matrix.head()

movie,A Million ways to die in the west,American Beauty,Blade Runner,Blade Runner 2049,Blues Brothers,Butterfly Effect,Cats,Diva,Django Unchained,"Dude, where's my car?",...,The Empire strikes back,The Farewell,The Girl With The Dragon Tattoo,The Notebook,The Phantom Menace,The Seventh Seal,The Sixth Sense,The Theory of Everything,Titanic,What happened to Monday
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anastasia,2.5,2.5,2.5,1.0,2.5,2.5,2.5,4.0,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
Basti,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
Braulio,2.5,4.0,5.0,4.0,2.5,4.0,2.5,2.5,4.0,2.0,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
Carina,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
Felix,2.5,5.0,2.5,2.5,2.5,2.5,2.5,2.5,4.0,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5


In [29]:
R = matrix.values

In [30]:
len(R)

12

In [31]:
#create a model and set the hyperparameters
# model assumes R ~ PQ'
model = NMF(n_components=20, init='random', random_state=10)

model.fit(R)

Q = model.components_  # movie-genre matrix

P = model.transform(R)  # user-genre matrix

print(model.reconstruction_err_) #reconstruction error

0.008757101801593847


In [32]:
nR = np.dot(P, Q)
# print(nR) ## The reconstructed matrix!

In [33]:
nR.shape

(12, 62)

In [34]:
matrix.loc['Braulio'].values

array([2.5, 4. , 5. , 4. , 2.5, 4. , 2.5, 2.5, 4. , 2. , 2.5, 2.5, 2.5,
       2.5, 5. , 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 1. , 2.5, 2.5, 2.5, 5. ,
       2.5, 5. , 4. , 4. , 4. , 2.5, 4. , 2.5, 2.5, 5. , 4. , 2.5, 2.5,
       2.5, 2.5, 2.5, 2.5, 5. , 3. , 2.5, 2.5, 2.5, 3. , 2.5, 2.5, 4. ,
       2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5])

In [35]:
# predict the hidden features for a new data point
query = [[0, 4. , 5. , 4. , 0, 4. , 0, 0, 4. , 2. , 0, 0, 0,
       0, 5. , 2.5, 0, 2.5, 0, 2.5, 0, 1. , 0, 0, 0, 5. ,
       0, 5. , 4. , 4. , 4. , 2.5, 4. , 2.5, 2.5, 5. , 4. , 0, 2.5,
       2.5, 0, 0, 0, 5. , 3. , 2.5, 0, 2.5, 3. , 2.5, 0, 4. ,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
#in this case, a new user providing ratings for the 62 movies.
print(model.transform(query))

[[0.         0.         0.20198696 0.         0.         1.45688585
  1.27132001 0.         0.         0.         0.02366149 0.
  0.         0.         0.23029175 0.         0.06594543 0.
  0.         0.10298192]]


In [36]:
user = np.array(query)

In [37]:
user.shape

(1, 62)

In [38]:
profile = model.transform(user)   # how strongly our user likes parameters (6)

In [39]:
profile.shape

(1, 20)

In [40]:
Q.shape

(20, 62)

In [41]:
result = np.dot(profile, Q)          # how strongly our user would like all (62) movies
result

array([[1.2557165 , 4.07968163, 4.2095133 , 3.61743375, 0.24150643,
        2.64816127, 1.51773277, 0.747552  , 3.40430857, 0.81409408,
        0.71056154, 1.40005765, 1.44953383, 1.8160394 , 3.24028653,
        1.75150868, 1.20002354, 0.66829695, 1.41031699, 1.35778558,
        0.57756303, 0.48013429, 1.39938455, 1.12421649, 0.81751304,
        4.74107914, 1.39896335, 4.37901746, 4.0614008 , 4.38503417,
        4.91096773, 1.96565886, 3.2014347 , 1.60809691, 0.4996321 ,
        3.99200693, 3.35946266, 0.18639742, 1.04778477, 1.30359523,
        1.35530988, 1.74345028, 0.52869578, 4.07815007, 2.18698628,
        0.32808654, 1.5054652 , 1.57687116, 0.80730039, 1.71669267,
        1.19185452, 3.86201959, 0.45034567, 0.99838711, 1.41578714,
        1.06127473, 2.11943531, 1.35227236, 1.3682839 , 0.81397906,
        0.59073152, 0.89758052]])

In [42]:
result.shape

(1, 62)

In [43]:
movies = matrix.columns

In [44]:
s = pd.Series(result[0], index=matrix.columns)
s.sort_values(ascending=False).head(5)

movie
Lord of the Rings -- The Two Towers            4.910968
Interstellar                                   4.741079
Lord of the Rings -- The Return of the King    4.385034
Karate Kid (1984)                              4.379017
Blade Runner                                   4.209513
dtype: float64

### Now, attempt on the big dataset

In [45]:
df.shape

(100836, 5)

In [46]:
df.head(3)

Unnamed: 0_level_0,userId,rating,timestamp,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
6,1,4.0,964982224,Heat (1995),Action|Crime|Thriller


In [47]:
mm = df.groupby(['userId', 'title'])['rating'].sum().unstack()

In [48]:
mm.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [49]:
mm.shape

(610, 9719)

In [50]:
mm = mm.fillna(2.5)
mm.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,4.0,2.5
2,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
3,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
4,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5


In [51]:
R = mm.values

In [52]:
len(R)

610

In [53]:
#create a model and set the hyperparameters
# model assumes R ~ PQ'
m = NMF(n_components=400, init='random', random_state=10, max_iter=200)

m.fit(R)

Q = m.components_  # movie-genre matrix

P = m.transform(R)  # user-genre matrix

print(m.reconstruction_err_) #reconstruction error

KeyboardInterrupt: 

In [None]:
nR = np.dot(P, Q)
nR.shape

In [None]:
# predict the hidden features for a new data point
user = {'title' : ["Fight Club (1999)", "Pretty Woman (1990)" , "The Butterfly Effect (2004)", 
                   "Inception (2010)", "(500) Days of Summer (2009)","Devil Wears Prada, The (2006)"],
        'rating' : [5,2,4,4,2,1]}

In [None]:
user = pd.DataFrame(user)
user

In [None]:
ratings = ratings.reset_index()
Ids = ratings['movieId'].unique()

In [None]:
len(Ids)

In [None]:
ID = pd.DataFrame(Ids)

In [None]:
ID

In [None]:
movie_info = pd.merge(ID, movies_df, left_on = 0, right_on = 'movieId')

In [None]:
movie_info.info()

In [None]:
user_merge = pd.merge(movie_info, user, left_on = 'title', right_on = 'title', how = 'left')

In [None]:
user_merge.info()

In [None]:
query = user_merge['rating']

In [None]:
query = query.fillna(2.5)

In [None]:
query = np.array(query)

In [None]:
user = np.array(query)
user = user.reshape(1, -1)
user.shape

In [None]:
#profile = m.transform(user)
#profile.shape

In [None]:
Q.shape

In [None]:
result = np.dot(profile, Q)          # how strongly our user would like all movies
result.shape

In [None]:
s = pd.Series(result[0], index=mm.columns)
s.sort_values(ascending=False).head(5)