# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [91]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [92]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df['col2'] =[4,5,6,7]
df

Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6
3,4,7


## 2. Deleting a row in a DataFrame

In [93]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df.drop('d',axis=0)

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [94]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
df_3 = pd.DataFrame({'col1':ser_1,'col2':ser_2,'col3':ser_3})
df_3

Unnamed: 0,col1,col2,col3
0,0.091107,0.005918,1.523355
1,0.261565,0.071596,0.349418
2,-1.275108,1.170584,0.632453
3,-0.380555,-0.173604,0.666172
4,0.497172,-2.441218,-0.529677
5,1.238214,0.380713,-0.0228


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [96]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df['col_2']
#df

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [97]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [98]:
# using the same DataFrame, index into into its first row
df.iloc[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [121]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip']
                     ,engine='python')

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'],engine='python')
ratings = pd.read_table('data/ml-1m/ratings.dat',
                       sep='::', header=None,
            names=['user_id', 'movie_id', 'rating', 'timestamp'],engine='python')

In [122]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 2. How to load the training and testing subsets

In [123]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, encoding='latin1')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0, encoding='latin1')

In [124]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [125]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [126]:
movielens_test.user_id.head()

693323    4653
24177     2259
202202    3032
262003    3029
777848    4186
Name: user_id, dtype: int64

In [127]:
movielens_test.movie_id.head()

693323    2648
24177     1270
202202    1378
262003    2289
777848    2403
Name: movie_id, dtype: int64

In [128]:
ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
#list(ids_to_estimate)

In [129]:
#li = [(u,i) for (u,i) in ids_to_estimate]


In [207]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [208]:
ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
#estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])


In [209]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    #print(list(ids_to_estimate))
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    print(estimated)
    #print(len(estimated))
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [210]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [211]:
print('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

[ 3.  3.  3. ...,  3.  3.  3.]
RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [212]:
df = pd.DataFrame([[1, 2], [3, 4]], index=['A', 'B'], columns=['X', 'Y'])
df.loc[[True, False]]

Unnamed: 0,X,Y
A,1,2


In [213]:
#~np.isnan(df['X'])

In [214]:
movielens_train.head()
movielens_train[movielens_train['movie_id'] == 2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


In [230]:

len(movielens_train[['user_id','movie_id']])

5838

In [231]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
    movie_cond = movielens_train['movie_id']==movie_id
    #print(movie_cond.sum())
    #print(pd.DataFrame(ratings))
    mean_rating = movielens_train.loc[movie_cond,'rating'].mean()
    return mean_rating

    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

In [215]:
movielens_train[movielens_train['movie_id']==2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


In [216]:
user_condition = movielens_train['user_id'] !=4653
movie_cond = movielens_train['movie_id']== 2648
rating_by_others = movielens_train.loc[user_condition & movie_cond]
rating_by_others

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


In [234]:
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
    user_condition = movielens_train['user_id'] !=user_id
    movie_cond = movielens_train['movie_id']== movie_id
    #print(movie_cond.sum())
    #print(pd.DataFrame(ratings))
    rating_by_others = movielens_train.loc[user_condition & movie_cond]
    #print(rating_by_others.rating)
    #mean_rating = movielens_train.loc[movie_cond,'rating'].mean()
    if rating_by_others.empty:
        return 3.0
    else:
        return rating_by_others.rating.mean()

    
# try it out for a user_id, movie_id pair
print(collab_mean(4653, 2648))
print('RMSE %s'%evaluate(collab_mean))

4.0
[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE 1.1234279896


In [235]:
movielens_test[movielens_test['movie_id'] == 2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False


In [151]:
movielens_train

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False
916102,1139,3317,5,987819143,M,25,1,93420-2852,Wonder Boys (2000),Comedy|Drama,False
757805,2507,3707,2,974082793,M,25,4,94107,Nine 1/2 Weeks (1986),Drama,False
216517,2664,1036,5,973455698,M,35,7,52402,Die Hard (1988),Action|Thriller,False
28065,5792,527,5,958082168,M,25,17,43201,Schindler's List (1993),Drama|War,False
284940,3395,2959,3,967487438,M,25,7,02134,Fight Club (1999),Drama,False


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [117]:
user_info = users.set_index('user_id')
user_info.head(5)

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [236]:
#1. GENDER
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', 
                                index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.ix[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.ix[movie_id, user_gender]):
            return self.means_by_gender.ix[movie_id, user_gender]
        else:
            return self.means_by_gender.ix[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

[ 3.          5.          3.66666667 ...,  3.83333333  4.33333333
  2.16666667]
RMSE for CollabGenderReco: 1.17400824171


In [238]:
#Age
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_age = movielens_train.pivot_table('rating', 
                                index='movie_id', columns='age')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_age.index: 
            return 3.0
        
        user_age = user_info.ix[user_id, 'age']
        if ~np.isnan(self.means_by_age.ix[movie_id, user_age]):
            return self.means_by_age.ix[movie_id, user_age]
        else:
            return self.means_by_age.ix[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabAgeReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.6        ...,  3.          4.66666667  2.375     ]
RMSE for CollabGenderReco: 1.20520133441


In [240]:
#Zip
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_zip = movielens_train.pivot_table('rating', 
                                index='movie_id', columns='zip')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_zip.index: 
            return 3.0
        
        user_zip = user_info.ix[user_id, 'zip']
        if ~np.isnan(self.means_by_zip.ix[movie_id, user_zip]):
            return self.means_by_zip.ix[movie_id, user_zip]
        else:
            return self.means_by_zip.ix[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabZipReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE for CollabGenderReco: 1.12566403192


In [241]:
#Occupation
class CollabOccupationReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_occupation = movielens_train.pivot_table('rating', index='movie_id', columns='occupation')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same occupation. """
        
        if movie_id not in self.means_by_occupation.index: 
            return 3.0
        
        user_occupation = user_info.ix[user_id, 'occupation']
        if ~np.isnan(self.means_by_occupation.ix[movie_id, user_occupation]):
            return self.means_by_occupation.ix[movie_id, user_occupation]
        else:
            return self.means_by_occupation.ix[movie_id].mean()

reco = CollabOccupationReco()
reco.learn()
print('RMSE for CollabOccupationReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.         ...,  4.          4.14285714
  1.83333333]
RMSE for CollabOccupationReco: 1.20287696436


In [267]:
#Occupation
class ContentGeneresReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_userId = movielens_train.pivot_table('rating',
                            index='user_id', columns='genres',
                                                 fill_value=0)
    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same occupation. """
        
        if user_id not in self.means_by_userId.index: 
            return 0.0
        user_condition = movielens_train.user_id == user_id
        return movielens_train.loc[user_condition, 'rating'].mean()
        
        
        

reco = ContentGeneresReco()
reco.learn()
print('RMSE for CollabOccupationReco: %s' % evaluate(reco.estimate))

[ 3.66666667  4.5         4.28571429 ...,  4.          4.5         4.        ]
RMSE for CollabOccupationReco: 1.23078247597


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [268]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

In [519]:
euclidean(nan,nan)

NameError: name 'nan' is not defined

In [273]:
#all_user_profiles = movielens_train.pivot_table('rating',
                            #index='movie_id', columns='user_id')
#all_user_profiles.head()

IndentationError: unexpected indent (<ipython-input-273-4ea92ce7bc95>, line 2)

In [278]:
#movielens_train.movie_id

In [346]:
all_user_profiles = movielens_train.pivot_table('rating',
                            index='movie_id', columns='user_id')
user_condition = movielens_train.user_id != 4666
movie_condition = movielens_train.movie_id == 3793
ratings_by_others = movielens_train.loc[user_condition & movie_condition]
ratings_by_others.set_index('user_id', inplace=True)
#ratings_by_others.head()
their_ids = ratings_by_others.index
their_ratings = ratings_by_others.rating
their_profiles = all_user_profiles[their_ids]
user_profile = all_user_profiles[4666]
user_profile = all_user_profiles[4666]

sims=their_profiles.apply(lambda profile: euclidean(profile, 
                                            user_profile), axis=0)
#x=lambda profile: euclidean(profile, user_profile)
#sims

                            
#their_ratings
#their_ratings
#ratings_by_others.index

user_id
1051    1.0
2818    1.0
195     1.0
3953    1.0
1733    1.0
5536    1.0
1010    1.0
4117    1.0
271     1.0
dtype: float64

In [518]:
user_profile

movie_id
1      NaN
2      NaN
4      NaN
5      NaN
6      NaN
7      NaN
10     NaN
11     NaN
12     NaN
13     NaN
15     NaN
16     NaN
17     NaN
18     NaN
19     NaN
20     NaN
21     NaN
24     NaN
25     NaN
28     NaN
29     NaN
31     NaN
32     NaN
34     NaN
36     NaN
38     NaN
39     NaN
40     NaN
41     NaN
42     NaN
        ..
3871   NaN
3872   NaN
3873   NaN
3877   NaN
3882   NaN
3886   NaN
3893   NaN
3896   NaN
3897   NaN
3901   NaN
3903   NaN
3908   NaN
3909   NaN
3910   NaN
3916   NaN
3917   NaN
3920   NaN
3921   NaN
3925   NaN
3927   NaN
3928   NaN
3929   NaN
3930   NaN
3932   NaN
3943   NaN
3945   NaN
3947   NaN
3948   NaN
3949   NaN
3952   NaN
Name: 4666, dtype: float64

In [382]:

#len(their_profiles)
their_profiles

user_id,1051,2818,195,3953,1733,5536,1010,4117,271
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,,,,,,,,,
2,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,
10,,,,,,,,,
11,,,,,,,,,
12,,,,,,,,,
13,,,,,,,,,


In [380]:
#their_profiles[their_profiles['user_id']==1051]
#their_profiles.query('movie_id==[500,300,700]')
#table.query('Status == ["pending","won"]')


nan

In [348]:
user_profile = all_user_profiles[1051]
user_profile.unique()

array([ nan,   1.,   5.,   3.,   4.])

In [589]:
class CollabEuclideanReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating',
                                index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        #print(ratings_by_others.rating)
        if ratings_by_others.empty: 
            return 3.0
        #print(ratings_by_others.head())
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)
        #print(sims)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        #print(ratings_sims.head())
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabEuclideanReco()
reco.learn()
print('RMSE for CollabEuclideanReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE for CollabEuclideanReco: 1.123429476


In [590]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

In [591]:
class CollabCosineReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: cosine(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabCosineReco()
reco.learn()
print('RMSE for CollabCosineReco: %s' % evaluate(reco.estimate))

[ 4.          4.          3.57142857 ...,  4.          4.4         2.16666667]
RMSE for CollabCosineReco: 1.13439921696


In [523]:
#movielens_train

In [571]:
#movielens_train[movielens_train.movie_id==3798]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
593269,3618,3798,3,966598357,M,56,17,22657,What Lies Beneath (2000),Thriller,False


In [587]:
all_user_profiles = movielens_train.pivot_table('rating',
                                index='movie_id', columns='user_id')
#all_user_profiles[[1051, 2818, 195, 3953, 1733, 5536, 1010, 4117, 271]]
all_user_profiles.iloc[[1,2,3,4,5]][19]

movie_id
2   NaN
4   NaN
5   NaN
6   NaN
7   NaN
Name: 19, dtype: float64

In [None]:
for j in all_user_profiles.iloc[5]:
    if j == 3:
        print(j)

In [555]:
for i in all_user_profiles:
    for j in all_user_profiles.iloc[i]:
        if j == 3:
            print((i,j))
    
    

(5, 3.0)
(13, 3.0)
(13, 3.0)
(18, 3.0)
(18, 3.0)
(18, 3.0)
(18, 3.0)
(18, 3.0)
(18, 3.0)
(19, 3.0)
(26, 3.0)
(26, 3.0)
(26, 3.0)
(31, 3.0)
(31, 3.0)
(33, 3.0)
(33, 3.0)
(33, 3.0)
(35, 3.0)
(35, 3.0)
(42, 3.0)
(45, 3.0)
(48, 3.0)
(53, 3.0)
(53, 3.0)
(58, 3.0)
(62, 3.0)
(75, 3.0)
(83, 3.0)
(90, 3.0)
(90, 3.0)
(97, 3.0)
(117, 3.0)
(118, 3.0)
(131, 3.0)
(131, 3.0)
(131, 3.0)
(134, 3.0)
(137, 3.0)
(139, 3.0)
(146, 3.0)
(148, 3.0)
(161, 3.0)
(161, 3.0)
(163, 3.0)
(165, 3.0)
(165, 3.0)
(166, 3.0)
(166, 3.0)
(169, 3.0)
(173, 3.0)
(173, 3.0)
(175, 3.0)
(176, 3.0)
(181, 3.0)
(181, 3.0)
(181, 3.0)
(181, 3.0)
(181, 3.0)
(187, 3.0)
(190, 3.0)
(192, 3.0)
(193, 3.0)
(193, 3.0)
(193, 3.0)
(195, 3.0)
(198, 3.0)
(202, 3.0)
(203, 3.0)
(203, 3.0)
(203, 3.0)
(203, 3.0)
(203, 3.0)
(204, 3.0)
(205, 3.0)
(205, 3.0)
(214, 3.0)
(216, 3.0)
(222, 3.0)
(225, 3.0)
(230, 3.0)
(231, 3.0)
(231, 3.0)
(231, 3.0)
(233, 3.0)
(235, 3.0)
(235, 3.0)
(235, 3.0)
(236, 3.0)
(241, 3.0)
(245, 3.0)
(245, 3.0)
(255, 3.0)
(255, 3.0)

IndexError: single positional indexer is out-of-bounds

In [582]:
all_user_profiles = movielens_train.pivot_table('rating',
                                index='movie_id', columns='user_id')
#print(all_user_profiles[10].unique())
def estimate(user_id, movie_id):
    
    """ Ratings weighted by correlation similarity. """
        
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    #print(pd.DataFrame(ratings_by_others.head()))
    if ratings_by_others.empty: 
        return 3.0
    #print(ratings_by_others.head())
    ratings_by_others.set_index('user_id', inplace=True)
    #print(ratings_by_others.head())
    their_ids = ratings_by_others.index
    print(their_ids)
    their_ratings = ratings_by_others.rating
    their_profiles = all_user_profiles[their_ids]
    #print(their_profiles)
    user_profile = all_user_profiles[user_id]
    #print(user_profile)
    sims = their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)
    #print(sims)
    ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
    #print(ratings_sims)
    ratings_sims = ratings_sims[ratings_sims.sim > 0]
    if ratings_sims.empty:
        return their_ratings.mean()
    else:
        return np.average(ratings_sims.rating, weights=ratings_sims.sim)

In [583]:
#print(all_user_profiles)

In [584]:
estimate(4666, 3793)

Int64Index([1051, 2818, 195, 3953, 1733, 5536, 1010, 4117, 271], dtype='int64', name='user_id')


3.7777777777777777

In [515]:
df_46 =pd.DataFrame(all_user_profiles[4666])

df_46[df_46[4666].notnull()]

Unnamed: 0_level_0,4666
movie_id,Unnamed: 1_level_1
1094,3.0


In [516]:
estimate(4666, 3793)

3.7777777777777777

In [501]:
user_condition = movielens_train.user_id != 4666
movie_condition = movielens_train.movie_id == 3793
ratings_by_others = movielens_train.loc[user_condition & movie_condition]
#print(pd.DataFrame(ratings_by_others))
    
#print(ratings_by_others.head())
ratings_by_others.set_index('user_id', inplace=True)
#print(ratings_by_others.head())
their_ids = ratings_by_others.index
print(their_ids)
their_ratings = ratings_by_others.rating
their_profiles = all_user_profiles[their_ids]
user_profile = all_user_profiles[4666]
their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)

Int64Index([1051, 2818, 195, 3953, 1733, 5536, 1010, 4117, 271], dtype='int64', name='user_id')


user_id
1051    1.0
2818    1.0
195     1.0
3953    1.0
1733    1.0
5536    1.0
1010    1.0
4117    1.0
271     1.0
dtype: float64

In [434]:
all_user_profiles = movielens_train.pivot_table('rating',
                                index='movie_id', columns='user_id')
all_user_profiles[[1051, 2818, 195, 3953, 1733, 5536, 1010, 4117, 271]]

user_id,1051,2818,195,3953,1733,5536,1010,4117,271
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,,,,,,,,,
2,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,
10,,,,,,,,,
11,,,,,,,,,
12,,,,,,,,,
13,,,,,,,,,


In [486]:
df_271 =pd.DataFrame(all_user_profiles[4117])
df_271[df_271[4117].notnull()]

Unnamed: 0_level_0,4117
movie_id,Unnamed: 1_level_1
3793,5.0


In [468]:
df_4666 =pd.DataFrame(all_user_profiles[4666])
df_4666[df_4666[4666].notnull()]

Unnamed: 0_level_0,4666
movie_id,Unnamed: 1_level_1
1094,3.0


In [464]:
result = df_271.append(df_4666)
result.dropna()


Unnamed: 0_level_0,271,4666
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [466]:
all_user_profiles[271].unique()

array([ nan,   2.,   4.,   5.])

In [491]:
their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)

user_id
1051    1.000000
2818    0.500000
195     0.500000
3953    1.000000
1733    0.333333
5536    0.500000
1010    0.500000
4117    0.500000
271     0.500000
dtype: float64