# User-Based Collaborative Filtering Test
### Katelyn Stringer, April 14, 2018

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *
%matplotlib inline

### Read in and format the data

In [3]:
ratings = pd.read_csv('data/ratings.csv',header=0)
ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944
20000262,138493,71619,2.5,1255811136


First, this is way too much data to test. Let's grab a subset of the data for testing purposes.

In [4]:
### Sort the values
ratings.sort_values('userId',inplace=True)

In [5]:
### Create a list of unique userId's and select the first 1000.
userlist, userind = np.unique(ratings.userId.values,return_index=True)

In [6]:
### Select the first 1000
small = ratings.iloc[:userind[1000]]

In [7]:
print(len(small))
small.head()

150629


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
112,1,3997,3.5,1112486192
113,1,4011,4.0,1112485406
114,1,4027,4.0,1112485460
115,1,4105,3.5,1094786104


We see that the format of this data is a row per rating with 4 columns. We need to reshape this into a useful matrix format to build our recommendation engine. We can use pivot for this.

In [8]:
test = small.pivot_table(index='movieId',columns='userId',values='rating')
test.head()

userId,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,5.0,,4.0,,4.0,...,,,,,,,,,,
2,3.5,,,,3.0,,,,,,...,,,,,0.5,,,,,
3,,4.0,,,,3.0,3.0,5.0,,,...,,,,,,,,,3.0,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,1.0,


### Calculate similarity between users

Now we have a really sparse matrix indexed by movieId, filled with all of the ratings values for each user (columns). Since we set the users as columns, we can use pandas' "corr" function to determine the correlation between different users.

In [9]:
testcorr = test.corr()

This returned a 1000x1000 matrix showing how the different users are correlated to each other. Note all of the NaN's where there was insufficient overlapping ratings for the same movies to calculate the correlations.

### Predict movie ratings for one user

Now let's try to fill in some of the missing values for each user based on the similarity to other users. First, we extract the first user and the similarity values.

In [10]:
user1 = test[1]
print(user1.head())

### Locate all movies that this user rated and ignore nans.
moviesrated = np.unique(user1[user1.values>0].index)
print(len(user1),len(moviesrated))
print(moviesrated[:5])

movieId
1    NaN
2    3.5
3    NaN
4    NaN
5    NaN
Name: 1, dtype: float64
9746 175
[ 2 29 32 47 50]


  """


In [11]:
### Pull out the rows for the movies not rated by user 1.
unknown_movies = test[~test.index.isin(moviesrated)]
print(unknown_movies.head())
print(len(unknown_movies))

userId   1     2     3     4     5     6     7     8     9     10    ...   \
movieId                                                              ...    
1         NaN   NaN   4.0   NaN   NaN   5.0   NaN   4.0   NaN   4.0  ...    
3         NaN   4.0   NaN   NaN   NaN   3.0   3.0   5.0   NaN   NaN  ...    
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
6         NaN   NaN   NaN   3.0   NaN   NaN   NaN   3.0   NaN   NaN  ...    

userId   991   992   993   994   995   996   997   998   999   1000  
movieId                                                              
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   1.0   NaN  
6         NaN   NaN   NaN   NaN   NaN   

In [20]:
### Pull out similarities to other users for this one user
tempsim = testcorr.iloc[0]
tempsim.head()

userId
1    1.000000
2   -0.069338
3    0.225312
4         NaN
5    0.467677
Name: 1, dtype: float64

In [53]:
### Multiply these movies by the weights from other users
temp_weights = np.transpose(tempsim)*unknown_movies
temp_weights.head()

userId,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,0.901248,,,,,0.0,,2.115042,...,,,,,,,,,,
3,,-0.27735,,,,,0.240854,0.0,,,...,,,,,,,,,1.5,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,0.5,
6,,,,,,,,0.0,,,...,,,,,,,,1.363428,1.5,


In [54]:
### Sum up all of the weights for each row
temp_weights['sum'] = temp_weights.sum(axis=1)
temp_weights.head()

userId,1,2,3,4,5,6,7,8,9,10,...,992,993,994,995,996,997,998,999,1000,sum
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,0.901248,,,,,0.0,,2.115042,...,,,,,,,,,,130.653606
3,,-0.27735,,,,,0.240854,0.0,,,...,,,,,,,,1.5,,13.747725
4,,,,,,,,,,,...,,,,,,,,,,1.839425
5,,,,,,,,,,,...,,,,,,,,0.5,,20.790233
6,,,,,,,,0.0,,,...,,,,,,,1.363428,1.5,,39.622798


In [55]:
### Make a mask of NaNs to apply to sim matrix
nanmask = np.isfinite(temp_weights)*1
nansim = np.transpose(tempsim)*nanmask
print(nansim.head())

### Notice that the Nan values now do not have a similarity weight.

userId     1         2         3   4    5   6         7    8    9        10  \
movieId                                                                       
1        0.0 -0.000000  0.225312 NaN  0.0 NaN  0.000000  0.0  0.0  0.528761   
3        0.0 -0.069338  0.000000 NaN  0.0 NaN  0.080285  0.0  0.0  0.000000   
4        0.0 -0.000000  0.000000 NaN  0.0 NaN  0.000000  0.0  0.0  0.000000   
5        0.0 -0.000000  0.000000 NaN  0.0 NaN  0.000000  0.0  0.0  0.000000   
6        0.0 -0.000000  0.000000 NaN  0.0 NaN  0.000000  0.0  0.0  0.000000   

userId  ...   992  993  994  995  996  997       998  999  1000  sum  
movieId ...                                                           
1       ...   NaN -0.0  0.0 -0.0  0.0  0.0  0.000000  0.0   0.0  NaN  
3       ...   NaN -0.0  0.0 -0.0  0.0  0.0  0.000000  0.5   0.0  NaN  
4       ...   NaN -0.0  0.0 -0.0  0.0  0.0  0.000000  0.0   0.0  NaN  
5       ...   NaN -0.0  0.0 -0.0  0.0  0.0  0.000000  0.5   0.0  NaN  
6       ...   NaN -0

  return this.join(other, how=how, return_indexers=return_indexers)


In [56]:
### Tack on weights to end of movie matrix
temp_weights['sumweights'] = nansim.sum(axis=1)
temp_weights.head()

userId,1,2,3,4,5,6,7,8,9,10,...,993,994,995,996,997,998,999,1000,sum,sumweights
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,0.901248,,,,,0.0,,2.115042,...,,,,,,,,,130.653606,33.123043
3,,-0.27735,,,,,0.240854,0.0,,,...,,,,,,,1.5,,13.747725,5.28543
4,,,,,,,,,,,...,,,,,,,,,1.839425,1.467021
5,,,,,,,,,,,...,,,,,,,0.5,,20.790233,7.085248
6,,,,,,,,0.0,,,...,,,,,,1.363428,1.5,,39.622798,9.772875


In [63]:
### Estimate the ratings for this user and record the statistical "strength" of the estimate
temp_weights['pred'] = [float(a)/b for a,b in zip(temp_weights['sum'],temp_weights['sumweights'])]
temp_weights['pred'].head()
temp_weights['strength'] = temp_weights.dropna().count(axis=1)

  


In [64]:
temp_weights.head()

userId,1,2,3,4,5,6,7,8,9,10,...,995,996,997,998,999,1000,sum,sumweights,pred,strength
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,0.901248,,,,,0.0,,2.115042,...,,,,,,,130.653606,33.123043,3.944493,323
3,,-0.27735,,,,,0.240854,0.0,,,...,,,,,1.5,,13.747725,5.28543,2.601061,85
4,,,,,,,,,,,...,,,,,,,1.839425,1.467021,1.253851,20
5,,,,,,,,,,,...,,,,,0.5,,20.790233,7.085248,2.934298,85
6,,,,,,,,0.0,,,...,,,,1.363428,1.5,,39.622798,9.772875,4.054365,159


In [67]:
### Get movie lists to see if these ratings actually make sense
movienames = pd.read_csv('data/movies.csv',header=0)
movienames.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [77]:
### Get the highest rated movies for the test user
movies = user1.fillna(0)

In [91]:
best = movies.loc[movies==5]
print(movies.loc[movies==5])

movieId
4993    5.0
5952    5.0
7153    5.0
8507    5.0
Name: 1, dtype: float64


In [99]:
test = pd.concat([movies,movienames.shift(1)],axis=1)
test.head()

Unnamed: 0,1,movieId,title,genres
0,,,,
1,0.0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,3.5,2.0,Jumanji (1995),Adventure|Children|Fantasy
3,0.0,3.0,Grumpier Old Men (1995),Comedy|Romance
4,0.0,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance


In [106]:
best = test.loc[test[1]>=5.0]
print(best)

        1  movieId                                 title              genres
4993  5.0   5088.0  Going Places (Valseuses, Les) (1974)  Comedy|Crime|Drama
5952  5.0   6050.0                            Gus (1976)     Children|Comedy
7153  5.0   7264.0              An Amazing Couple (2002)      Comedy|Romance
8507  5.0  25948.0                     Yellow Sky (1948)       Crime|Western


In [90]:
topmovies = movienames.loc[movienames.movieId==]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller
