# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1, 2, 3, 4]})
# assign() returns a copy carrying the extra column, so rebind df to it
df = df.assign(col2=[4, 5, 6, 7])
df

Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6
3,4,7


## 2. Deleting a row in a DataFrame

In [3]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1, 2, 3, 4]}, index=list('abcd'))
# drop() hands back a new frame without the label; df itself is left untouched
df.drop(labels=['d'], axis='index')

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [4]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
# place the Series side by side; `keys` supplies the column labels
df_3 = pd.concat([ser_1, ser_2, ser_3], axis=1, keys=['col1', 'col2', 'col3'])
df_3

Unnamed: 0,col1,col2,col3
0,-0.230525,0.124664,0.3224
1,0.319756,-0.55301,0.705074
2,-1.523321,0.069852,0.148068
3,-0.140973,-1.191655,-0.665034
4,-1.71247,0.93371,-0.909647
5,-0.53938,-0.531547,-0.218762


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [5]:
# given the following DataFrame, try to index into the 'col_2' column
# note: 'col_3' is listed in `columns` but has no entry in `data`
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
# bracket indexing with a column label selects that single column
df['col_2']
#df

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [6]:
# using the same DataFrame, index into the row whose index is 'obs3'
# .loc selects by index label
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 3. Position-based indexing

In [7]:
# using the same DataFrame, index into its first row
# .iloc selects by integer position, so 0 is the first row
df.iloc[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [8]:
import pandas as pd

# The MovieLens .dat files have no header row and separate fields with '::'.
# A multi-character separator is not handled by the C parser, hence engine='python'.
# encoding='latin1' mirrors the movielens_train/movielens_test loads in this notebook.
# NOTE(review): movies.dat is believed to contain Latin-1 titles that fail to decode
# as UTF-8 - confirm against the data files.
users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],
                      engine='python', encoding='latin1')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'],
                       engine='python', encoding='latin1')

ratings = pd.read_table('data/ml-1m/ratings.dat',
                        sep='::', header=None,
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        engine='python', encoding='latin1')

In [9]:
# preview the first rows of the raw ratings table
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 2. How to load the training and testing subsets

In [10]:
# subset version (hosted notebook)
# both CSVs are read with the same options: first column as index, latin1 text
movielens_train, movielens_test = (
    pd.read_csv(csv_path, index_col=0, encoding='latin1')
    for csv_path in ('data/movielens_train.csv', 'data/movielens_test.csv')
)

In [11]:
# preview the first rows of the training subset
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [12]:
# preview the first rows of the testing subset
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [13]:
# user ids of the first test rows (one half of the pairs to estimate)
movielens_test.user_id.head()

693323    4653
24177     2259
202202    3032
262003    3029
777848    4186
Name: user_id, dtype: int64

In [14]:
# movie ids of the first test rows (the other half of the pairs to estimate)
movielens_test.movie_id.head()

693323    2648
24177     1270
202202    1378
262003    2289
777848    2403
Name: movie_id, dtype: int64

In [15]:
# pair each test row's user_id with its movie_id: the (user, movie) pairs to predict
ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
#list(ids_to_estimate)

In [16]:
#li = [(u,i) for (u,i) in ids_to_estimate]


In [17]:
def compute_rmse(y_pred, y_true):
    """Return the square root of the mean squared difference between
    the predictions `y_pred` and the true values `y_true`."""
    squared_errors = (y_pred - y_true) ** 2
    return np.sqrt(np.mean(squared_errors))

In [18]:
# rebuild the (user_id, movie_id) pairs for the test rows
ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
#estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])


In [19]:
def evaluate(estimate_f):
    """Score an estimator on the test set and return its RMSE.

    `estimate_f(user_id, movie_id)` is called once per row of the
    notebook-level `movielens_test` frame; the array of estimates is printed
    before being compared with the true ratings via `compute_rmse`.
    """
    test_pairs = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(*pair) for pair in test_pairs])
    print(estimated)
    return compute_rmse(estimated, movielens_test.rating.values)

Test a dummy solution!

In [20]:
def my_estimate_func(user_id, movie_id):
    """Dummy estimator: ignore both ids and always predict the constant 3.0."""
    constant_guess = 3.0
    return constant_guess

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [21]:
# score the constant dummy estimator on the test set
print('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

[ 3.  3.  3. ...,  3.  3.  3.]
RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [22]:
# toy frame showing that .loc accepts a boolean list and keeps the rows marked True
df = pd.DataFrame([[1, 2], [3, 4]], index=['A', 'B'], columns=['X', 'Y'])
df.loc[[True, False]]

Unnamed: 0,X,Y
A,1,2


In [23]:
#~np.isnan(df['X'])

In [24]:
# training ratings recorded for movie 2648 (only the last expression of a cell is
# displayed, so the discarded head() call that used to precede it was dropped)
movielens_train[movielens_train['movie_id'] == 2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


In [25]:

# number of rows in the training subset
len(movielens_train[['user_id','movie_id']])

5838

In [26]:
# write an 'estimate' function that computes the mean rating of a particular movie
def collab_mean(user_id, movie_id):
    """Estimate a rating as the mean training rating of `movie_id`.

    `user_id` is accepted so the function matches the estimator signature
    expected by `evaluate`, but it does not influence the result.

    Returns 3.0 when the movie has no rating in `movielens_train`; the mean of
    an empty selection would otherwise be NaN and make the RMSE NaN as well.
    """
    # first, index into all ratings of this movie
    movie_cond = movielens_train['movie_id'] == movie_id
    movie_ratings = movielens_train.loc[movie_cond, 'rating']
    if movie_ratings.empty:
        return 3.0
    # second, compute the mean of those ratings
    return movie_ratings.mean()

    
# try it out for a user_id, movie_id pair (the pair of the first test row)
collab_mean(4653, 2648)

4.0

In [27]:
# the training rows behind the estimate above: every rating of movie 2648
movielens_train[movielens_train['movie_id']==2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


In [28]:
# training ratings of movie 2648 given by anyone except user 4653
user_condition = movielens_train['user_id'] !=4653
movie_cond = movielens_train['movie_id']== 2648
# a row is kept only when both boolean masks hold
rating_by_others = movielens_train.loc[user_condition & movie_cond]
rating_by_others

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


In [29]:
def collab_mean(user_id, movie_id):
    """Mean training rating of `movie_id` given by users other than `user_id`.

    Falls back to 3.0 when nobody else has rated the movie in `movielens_train`.
    """
    is_other_user = movielens_train['user_id'] != user_id
    is_this_movie = movielens_train['movie_id'] == movie_id
    others_ratings = movielens_train.loc[is_other_user & is_this_movie, 'rating']
    return 3.0 if others_ratings.empty else others_ratings.mean()

    
# try it out for a user_id, movie_id pair
print(collab_mean(4653, 2648))
# then measure the estimator over the whole test set
print('RMSE %s'%evaluate(collab_mean))

4.0
[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE 1.1234279896


In [30]:
# the test rows for movie 2648, to compare the estimate with the true rating
movielens_test[movielens_test['movie_id'] == 2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False


In [31]:
# show only the first ten rows instead of dumping the whole training frame
movielens_train.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False
916102,1139,3317,5,987819143,M,25,1,93420-2852,Wonder Boys (2000),Comedy|Drama,False
757805,2507,3707,2,974082793,M,25,4,94107,Nine 1/2 Weeks (1986),Drama,False
216517,2664,1036,5,973455698,M,35,7,52402,Die Hard (1988),Action|Thriller,False
28065,5792,527,5,958082168,M,25,17,43201,Schindler's List (1993),Drama|War,False
284940,3395,2959,3,967487438,M,25,7,02134,Fight Club (1999),Drama,False


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, e.g.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [30]:
# index the users table by user_id so a user's attributes can be looked up by id
user_info = users.set_index('user_id')
user_info.head()

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [31]:
# label-based lookup of user 1's gender (.loc replaces the removed .ix indexer)
user_info.loc[1, 'gender']

'F'

In [32]:
#df = movielens_train[movielens_train['movie_id']==2]
#df.groupby('gender')['rating'].mean()

In [33]:
# distinct values of the age column in the training subset
movielens_train['age'].unique()

array([25, 35, 45, 50, 18, 56,  1])

In [35]:
#1. GENDER
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'): sharing a gender.

    Reads the notebook-level `movielens_train` and `user_info` frames.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.means_by_gender`, a pivot table of the training ratings
        with one row per movie_id and one column per gender value.
        """
        self.means_by_gender = movielens_train.pivot_table(
            'rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean training rating of the movie among users of the same gender.

        Returns 3.0 when the movie is absent from the table. When the user's
        gender has no value for the movie (missing column or NaN cell), returns
        the mean of the movie's row instead.
        Raises KeyError if `user_id` is not in `user_info`.
        """
        if movie_id not in self.means_by_gender.index:
            return 3.0

        # .loc replaces the removed .ix indexer
        movie_means = self.means_by_gender.loc[movie_id]
        user_gender = user_info.loc[user_id, 'gender']
        # check membership first so an unseen gender value falls back instead of raising
        if user_gender in movie_means.index and not np.isnan(movie_means.loc[user_gender]):
            return movie_means.loc[user_gender]
        return movie_means.mean()

# build the per-gender table, then score the estimator on the test set
reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

[ 3.          5.          3.66666667 ...,  3.83333333  4.33333333
  2.16666667]
RMSE for CollabGenderReco: 1.17400824171


In [34]:
#Age
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'): sharing an age value.

    NOTE(review): the class name is a copy-paste leftover (this variant groups
    by age, not gender); it is kept because the cell instantiates it by name.
    Reads the notebook-level `movielens_train` and `user_info` frames.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.means_by_age`, a pivot table of the training ratings
        with one row per movie_id and one column per age value.
        """
        self.means_by_age = movielens_train.pivot_table(
            'rating', index='movie_id', columns='age')

    def estimate(self, user_id, movie_id):
        """ Mean training rating of the movie among users with the same age value.

        Returns 3.0 when the movie is absent from the table. When the user's
        age has no value for the movie (missing column or NaN cell), returns
        the mean of the movie's row instead.
        Raises KeyError if `user_id` is not in `user_info`.
        """
        if movie_id not in self.means_by_age.index:
            return 3.0

        # .loc replaces the removed .ix indexer
        movie_means = self.means_by_age.loc[movie_id]
        user_age = user_info.loc[user_id, 'age']
        # check membership first so an unseen age value falls back instead of raising
        if user_age in movie_means.index and not np.isnan(movie_means.loc[user_age]):
            return movie_means.loc[user_age]
        return movie_means.mean()

# build the per-age table, then score the estimator on the test set
# (the class defined in this cell groups by age despite its name)
reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabAgeReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.6        ...,  3.          4.66666667  2.375     ]
RMSE for CollabAgeReco: 1.20520133441


In [35]:
#Zip
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'): sharing a zip code.

    NOTE(review): the class name is a copy-paste leftover (this variant groups
    by zip, not gender); it is kept because the cell instantiates it by name.
    Reads the notebook-level `movielens_train` and `user_info` frames.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.means_by_zip`, a pivot table of the training ratings
        with one row per movie_id and one column per zip value.
        """
        self.means_by_zip = movielens_train.pivot_table(
            'rating', index='movie_id', columns='zip')

    def estimate(self, user_id, movie_id):
        """ Mean training rating of the movie among users with the same zip code.

        Returns 3.0 when the movie is absent from the table. When the user's
        zip has no value for the movie (missing column or NaN cell), returns
        the mean of the movie's row instead.
        Raises KeyError if `user_id` is not in `user_info`.
        """
        if movie_id not in self.means_by_zip.index:
            return 3.0

        # .loc replaces the removed .ix indexer
        movie_means = self.means_by_zip.loc[movie_id]
        user_zip = user_info.loc[user_id, 'zip']
        # check membership first: a zip that never occurs in the training data
        # has no column, and looking it up directly would raise KeyError
        if user_zip in movie_means.index and not np.isnan(movie_means.loc[user_zip]):
            return movie_means.loc[user_zip]
        return movie_means.mean()

# build the per-zip table, then score the estimator on the test set
# (the class defined in this cell groups by zip despite its name)
reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabZipReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE for CollabZipReco: 1.12566403192


In [36]:
#Occupation
class CollabOccupationReco:
    """ Collaborative filtering using an implicit sim(u,u'): sharing an occupation.

    Reads the notebook-level `movielens_train` and `user_info` frames.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.means_by_occupation`, a pivot table of the training ratings
        with one row per movie_id and one column per occupation value.
        """
        self.means_by_occupation = movielens_train.pivot_table(
            'rating', index='movie_id', columns='occupation')

    def estimate(self, user_id, movie_id):
        """ Mean training rating of the movie among users of the same occupation.

        Returns 3.0 when the movie is absent from the table. When the user's
        occupation has no value for the movie (missing column or NaN cell),
        returns the mean of the movie's row instead.
        Raises KeyError if `user_id` is not in `user_info`.
        """
        if movie_id not in self.means_by_occupation.index:
            return 3.0

        # .loc replaces the removed .ix indexer
        movie_means = self.means_by_occupation.loc[movie_id]
        user_occupation = user_info.loc[user_id, 'occupation']
        # check membership first so an unseen occupation falls back instead of raising
        if (user_occupation in movie_means.index
                and not np.isnan(movie_means.loc[user_occupation])):
            return movie_means.loc[user_occupation]
        return movie_means.mean()

# build the per-occupation table, then score the estimator on the test set
reco = CollabOccupationReco()
reco.learn()
print('RMSE for CollabOccupationReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.         ...,  4.          4.14285714
  1.83333333]
RMSE for CollabOccupationReco: 1.20287696436


In [37]:
#Genres
class ContentGeneresReco:
    """ Estimator that predicts a user's own mean training rating.

    NOTE(review): despite the name, genres only serve as the columns of the
    table built in `learn`; `estimate` never looks at the movie's genres.
    Reads the notebook-level `movielens_train` frame.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.means_by_userId`, a pivot table of the training ratings
        with one row per user_id and one column per genres string, missing
        cells filled with 0. `estimate` only uses its index, to tell whether
        a user appears in the training data.
        """
        self.means_by_userId = movielens_train.pivot_table(
            'rating', index='user_id', columns='genres', fill_value=0)

    def estimate(self, user_id, movie_id):
        """ Mean of all training ratings given by `user_id`; `movie_id` is unused.

        Returns 3.0 for a user without training ratings, the same neutral
        fallback the other estimators in this notebook use.
        """
        if user_id not in self.means_by_userId.index:
            return 3.0
        user_condition = movielens_train.user_id == user_id
        return movielens_train.loc[user_condition, 'rating'].mean()
        
        
        

# build the lookup table, then score the estimator on the test set
reco = ContentGeneresReco()
reco.learn()
# the label now names the class actually being evaluated
print('RMSE for ContentGeneresReco: %s' % evaluate(reco.estimate))

[ 3.66666667  4.5         4.28571429 ...,  4.          4.5         4.        ]
RMSE for CollabOccupationReco: 1.23078247597


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, e.g.:

- euclidean
- cosine

In [38]:
def euclidean(s1, s2):
    """Turn the euclidean distance between two pd.Series into a 'similarity'.

    The result is 1 / (1 + distance): 1 for identical inputs, shrinking
    towards 0 as the distance grows.
    """
    squared_gap = (s1 - s2) ** 2
    distance = np.sqrt(np.sum(squared_gap))
    return 1 / (1 + distance)

In [40]:
#all_user_profiles = movielens_train.pivot_table('rating',
                            #index='movie_id', columns='user_id')
#all_user_profiles.head()

In [41]:
#movielens_train.movie_id

In [589]:
class CollabEuclideanReco:
    """ Collaborative filtering using a custom sim(u,u'): euclidean similarity.

    Reads the notebook-level `movielens_train` frame and the `euclidean` function.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.all_user_profiles`, a pivot table of the training ratings
        with one row per movie_id and one column per user_id.
        """
        self.all_user_profiles = movielens_train.pivot_table(
            'rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings of the movie by other users, weighted by euclidean similarity.

        Returns 3.0 when no other user rated the movie. Returns the plain mean
        of the others' ratings when `user_id` has no training profile or when
        no other user has a similarity above 0.
        """
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty:
            return 3.0

        # set_index returns a new frame; the selection is not modified in place
        their_ratings = ratings_by_others.set_index('user_id').rating
        # a user absent from the training data has no profile column to compare with
        if user_id not in self.all_user_profiles.columns:
            return their_ratings.mean()

        their_profiles = self.all_user_profiles[their_ratings.index]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
# build the user profiles, then score the estimator on the test set
reco = CollabEuclideanReco()
reco.learn()
print('RMSE for CollabEuclideanReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE for CollabEuclideanReco: 1.123429476


In [56]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity.

    Returns 0.0 when the product of the two norms is zero, instead of
    dividing by zero.
    """
    norm_product = np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))
    if norm_product == 0:
        return 0.0
    return np.sum(s1 * s2) / norm_product

In [57]:
class CollabCosineReco:
    """ Collaborative filtering using a custom sim(u,u'): cosine similarity.

    Reads the notebook-level `movielens_train` frame and the `cosine` function.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.all_user_profiles`, a pivot table of the training ratings
        with one row per movie_id and one column per user_id.
        """
        self.all_user_profiles = movielens_train.pivot_table(
            'rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings of the movie by other users, weighted by cosine similarity.

        Returns 3.0 when no other user rated the movie. Returns the plain mean
        of the others' ratings when `user_id` has no training profile or when
        no other user has a similarity above 0.
        """
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty:
            return 3.0

        # set_index returns a new frame; the selection is not modified in place
        their_ratings = ratings_by_others.set_index('user_id').rating
        # a user absent from the training data has no profile column to compare with
        if user_id not in self.all_user_profiles.columns:
            return their_ratings.mean()

        their_profiles = self.all_user_profiles[their_ratings.index]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: cosine(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        # only positively similar users contribute to the weighted average
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
# build the user profiles, then score the estimator on the test set
reco = CollabCosineReco()
reco.learn()
print('RMSE for CollabCosineReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE for CollabCosineReco: 1.13439921696


In [523]:
#movielens_train

In [571]:
#movielens_train[movielens_train.movie_id==3798]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
593269,3618,3798,3,966598357,M,56,17,22657,What Lies Beneath (2000),Thriller,False
