# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df

Unnamed: 0,col1
0,1
1,2
2,3
3,4


In [3]:
#answer
df['col2'] = [1,2,3,4]
df

Unnamed: 0,col1,col2
0,1,1
1,2,2
2,3,3
3,4,4


## 2. Deleting a row in a DataFrame

In [7]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df

Unnamed: 0,col1
a,1
b,2
c,3
d,4


In [8]:
#answer
df = df.drop('d',axis=0)
df

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [12]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))

#answer
df = pd.DataFrame(data={'1':ser_1,'2':ser_2,'3':ser_3}, columns=['1','2','3'])
df


Unnamed: 0,1,2,3
0,0.080311,0.08417,-0.164385
1,0.660491,1.021074,-0.328203
2,-1.252693,-0.529206,0.739787
3,0.731415,-0.451606,0.275415
4,-1.297058,-0.856053,-0.267354
5,0.910341,-0.07737,0.449567


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [14]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])

df

Unnamed: 0,col_1,col_2,col_3
obs1,0.12,0.9,
obs2,7.0,9.0,
obs3,45.0,34.0,
obs4,10.0,11.0,


In [15]:
#answer
df.col_2

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [18]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [22]:
# using the same DataFrame, index into into its first row
df.iloc[0,:]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [23]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'])



## 2. How to load the training and testing subsets

In [26]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0)
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3: invalid continuation byte

In [28]:
#error with import so rerun code from pycon notebook

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None, 
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])

ratings = pd.read_table('data/ml-1m/ratings.dat',
                        sep='::', header=None, 
                        names=['user_id', 'movie_id', 'rating', 'timestamp'])

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None, 
                       names=['movie_id', 'title', 'genres'])

movielens = pd.merge(pd.merge(ratings, users), movies)
movielens.head()

movielens = movielens.ix[np.random.choice(movielens.index, size=10000, replace=False)]
user_ids_larger_1 = pd.value_counts(movielens.user_id, sort=False) > 1
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index
movielens = movielens.select(lambda l: movielens.loc[l, 'user_id'] in user_ids_larger_1)
assert(np.all(movielens.user_id.value_counts() > 1))

def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.ix[sampled_ids, 'for_testing'] = True
    return df

movielens['for_testing'] = False
grouped = movielens.groupby('user_id', group_keys=False).apply(assign_to_set)
movielens_train = movielens[grouped.for_testing == False]
movielens_test = movielens[grouped.for_testing == True]



In [29]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
387329,1812,1810,4,975820455,F,25,12,48103,Primary Colors (1998),Drama,False
541795,4049,2968,4,965496713,M,45,11,33133,Time Bandits (1981),Adventure|Fantasy|Sci-Fi,False
381088,4743,1735,4,963263036,M,56,13,63122,Great Expectations (1998),Drama|Romance,False
923399,2592,2803,5,974057005,M,50,7,80004-4448,"Pelican Brief, The (1993)",Thriller,False
247397,3473,224,5,967136863,F,35,16,02472,Don Juan DeMarco (1995),Comedy|Drama|Romance,False


In [30]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
896942,2092,2750,5,978303026,M,56,1,49006,Radio Days (1987),Comedy|Drama,False
231337,4981,1392,4,962599453,M,50,1,55406,Citizen Ruth (1996),Comedy|Drama,False
948179,2073,2635,1,974757389,F,18,4,13148,"Mummy's Curse, The (1944)",Horror,False
921891,1472,2505,2,974752860,M,25,7,90248,8MM (1999),Thriller,False
552840,5107,208,3,962335028,F,45,0,8081,Waterworld (1995),Action|Adventure,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [31]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [42]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [33]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [45]:
print('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.25820659366


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [62]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    movie_condition = movielens_train.movie_id == movie_id
    # second, compute the mean of those ratings
    return movielens_train.loc[movie_condition, 'rating'].mean()

# try it out for a user_id, movie_id pair
print(collab_mean(4653, 2648))
print('RMSE for my estimate function: %s' % evaluate(collab_mean))

#appears that the evaluate function is not working

4.5
RMSE for my estimate function: nan


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [56]:
#why not all?
user_info = users.set_index('user_id')

class CollabAllReco:
    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender')
        self.means_by_age = movielens_train.pivot_table('rating', index='movie_id', columns='age')
        self.means_by_zip = movielens_train.pivot_table('rating', index='movie_id', columns='zip')

    
    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same profile. """
        
        all_means = {} #Dict to store values
        
        #by gender
        if movie_id not in self.means_by_gender.index: 
            all_means['gender'] = None
        
        user_gender = user_info.ix[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.ix[movie_id, user_gender]):
            all_means['gender'] = self.means_by_gender.ix[movie_id, user_gender]
        else:
            all_means['gender'] = self.means_by_gender.ix[movie_id].mean()

        if movie_id not in self.means_by_age.index: 
            all_means['age'] = None
        
        #by age
        user_age = user_info.ix[user_id, 'age']
        if ~np.isnan(self.means_by_age.ix[movie_id, user_age]):
            all_means['age'] = self.means_by_age.ix[movie_id, user_age]
        else:
            all_means['age'] = self.means_by_age.ix[movie_id].mean()

        if movie_id not in self.means_by_zip.index: 
            all_means['zip'] = None
        
        #by zip
        user_zip = user_info.ix[user_id, 'zip']
        if ~np.isnan(self.means_by_zip.ix[movie_id, user_zip]):
            all_means['zip'] = self.means_by_zip.ix[movie_id, user_zip]
        else:
            all_means['zip'] = self.means_by_zip.ix[movie_id].mean()

        #mean of means
        df = pd.Series(all_means)
        df = df.dropna(axis=0)
        return df.mean()
        
reco = CollabAllReco()
reco.learn()
reco.estimate(4653, 2648)

4.666666666666667

In [57]:
# the estimator they gave doesn't work

def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

print('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

KeyError: 2635

# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [60]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

class CollabCosineReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: cosine(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabCosineReco()
reco.learn()
reco.estimate(4653, 2648)

4.5

In [61]:
print('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))

RMSE for CollabPearsonReco: 1.13978278502
