In [126]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotly.offline import plot
from plotly.graph_objs import *
import datetime
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import stats

In [127]:
user = pd.read_csv('data/u.user', delimiter='|', header=None, names=['userid', 'age', 'gender', 'occupation', 'zipcode'])
rating_data = pd.read_csv('data/u.data', delimiter='\t', header=None, names=['userid', 'itemid', 'rating', 'timestamp'])

In [128]:
movies = pd.read_csv('data/u.item', delimiter='|', encoding='ISO-8859-1', header=None, usecols=[0, 1])

In [129]:
rating_table = pd.pivot_table(rating_data, columns = 'itemid', index='userid', values='rating')

In [130]:
rating_table=rating_table.fillna(0)
mean_rating_users = rating_table.mean(axis = 1)
mean_rating_users

userid
1      0.583829
2      0.136742
3      0.089774
4      0.061831
5      0.299049
6      0.456005
7      0.950059
8      0.133175
9      0.055886
10     0.460166
11     0.372771
12     0.133175
13     1.171225
14     0.238407
15     0.177765
16     0.360285
17     0.050535
18     0.639120
19     0.042212
20     0.088585
21     0.284185
22     0.255054
23     0.326397
24     0.174792
25     0.187872
26     0.187277
27     0.048157
28     0.174792
29     0.073722
30     0.096314
         ...   
914    0.042212
915    0.048157
916    0.634364
917    0.073722
918    0.205113
919    0.447681
920    0.049941
921    0.214031
922    0.254459
923    0.182521
924    0.183115
925    0.059453
926    0.039239
927    0.263377
928    0.089180
929    0.107610
930    0.111177
931    0.134958
932    0.568371
933    0.289536
934    0.382878
935    0.090963
936    0.316290
937    0.080262
938    0.209869
939    0.124257
940    0.219976
941    0.052913
942    0.200357
943    0.340666
dtype: float64

In [132]:
rating_data.timestamp = rating_data.timestamp.apply(lambda x: datetime.datetime.
                                                    fromtimestamp(x).strftime('%Y-%m-%d'))

### Average rating by occupation

In [133]:
user_data = pd.merge(user, rating_data, on='userid')

In [11]:
occupation_rating = user_data.groupby('occupation').mean()['rating']
data = [Bar(x=occupation_rating.index,y=occupation_rating.values)]
layout = Layout(xaxis = dict(title='Occupation'), yaxis=dict(title='Average Rating'))
fig = Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### Age-wise histogram

In [13]:
#data = [Histogram(x = user.age)]
#layout = Layout(xaxis = dict(title='Age'), yaxis=dict(title='Frequency'))
#fig = Figure(data=data, layout=layout)
data = [Box(y = user.age, boxpoints='all',jitter=0.3,pointpos=-3)]
plot(data)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### Rating histogram

In [14]:
data = [Histogram(x = rating_data.rating)]
layout = Layout(xaxis = dict(title='Rating'), yaxis=dict(title='Frequency'))
fig = Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

In [15]:
user_data.head()

Unnamed: 0,userid,age,gender,occupation,zipcode,itemid,rating,timestamp
0,1,24,M,technician,85711,61,4,1997-11-03
1,1,24,M,technician,85711,189,3,1998-03-01
2,1,24,M,technician,85711,33,4,1997-11-03
3,1,24,M,technician,85711,160,4,1997-09-24
4,1,24,M,technician,85711,20,4,1998-02-14


### Average rating by year

In [134]:
year_rating = user_data.groupby('timestamp').mean()['rating']

In [17]:
data = [Scatter(x=year_rating.index,y=year_rating.values)]
layout = Layout(xaxis = dict(title='Year'), yaxis=dict(title='Average Rating'))
fig = Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

In [18]:
bin_edges = stats.mstats.mquantiles(user_data.age, [1/2])
bin_edges

array([ 30.])

### Average rating  by age and gender

In [19]:
user_data.age.tail(10)

99990    22
99991    22
99992    22
99993    22
99994    22
99995    22
99996    22
99997    22
99998    22
99999    22
Name: age, dtype: int64

In [135]:
user_data['AgeGroup'] = user_data.age.apply(lambda i:'{}-{}'.format(10*int(i/10), 10*(int(i/10)+1)))

In [21]:
user_data.dtypes

userid         int64
age            int64
gender        object
occupation    object
zipcode       object
itemid         int64
rating         int64
timestamp     object
AgeGroup      object
dtype: object

In [22]:
age_gender_rating = user_data.groupby(['AgeGroup', 'gender']).mean()['rating']

In [23]:
age_gender_rating = age_gender_rating.unstack().fillna(0)
age_gender_rating

gender,F,M
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1
0-10,0.0,3.767442
10-20,3.552344,3.455969
20-30,3.420305,3.482504
30-40,3.672275,3.511744
40-50,3.548917,3.60841
50-60,3.535937,3.673972
60-70,3.26087,3.652308
70-80,3.239437,3.880952


In [24]:
trace1 = Bar(x=age_gender_rating.index, y=age_gender_rating.F, name='Female')
trace2 = Bar(x=age_gender_rating.index, y=age_gender_rating.M, name='Male')
data = [trace1, trace2]
layout = Layout(xaxis = dict(title='Age'), yaxis=dict(title='Average Rating'), bargroupgap=0.3)
fig =Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### movie item cosine similarity

In [25]:
#pd.pivot_table(user_data, values='rating', index='userid', columns='itemid')

In [136]:
train_data, test_data = cv.train_test_split(rating_data, test_size=0.25)

In [137]:
n_users = rating_data.userid.unique().size
n_items = rating_data.itemid.unique().size
train_data_matrix = np.zeros((n_users, n_items))
for i in train_data.itertuples():
    train_data_matrix[i[1]-1, i[2]-1] = i[3]

In [157]:
test_data_matrix = np.zeros((n_users, n_items))
for i in test_data.itertuples():
    test_data_matrix[i[1]-1, i[2]-1] = i[3]

In [158]:
test_data_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [140]:
user_similarity = 1-pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = 1-pairwise_distances(train_data_matrix.T, metric='cosine')

In [141]:
user_similarity[2, 3]

0.30593528750122334

In [142]:
def knn(similarity, k):
    top_ten_similarity = []
    for index, row in enumerate(similarity):
        similarity_grouped = list(zip(range(len(row)), row))
        top_ten = sorted(similarity_grouped, key=lambda l: l[1], reverse=True)
        ids, similarity_score = zip(*top_ten)
        top_ten_similarity.append(list(ids[:k+1]))
    return np.array(top_ten_similarity)


In [143]:
user_ten_similar = knn(user_similarity, k=10)
item_ten_similar = knn(item_similarity, k=10)
user_ten_similar

array([[  0, 863, 434, ..., 221, 513, 737],
       [  1, 568, 767, ..., 412, 525, 103],
       [  2, 316, 528, ..., 862, 672, 919],
       ..., 
       [940, 864, 816, ..., 702, 611, 581],
       [941, 453, 162, ..., 498,  81, 675],
       [942, 708, 932, ..., 486, 441, 642]])

In [34]:
item_ten_similar

array([[   0,   49,  180, ...,  236,  221,   68],
       [   1,  232,  384, ...,  553,  567,  549],
       [   2,  824,  272, ...,   66,   41,  942],
       ..., 
       [1678, 1679,  909, ..., 1233,  360, 1061],
       [1671, 1680, 1422, ...,  764, 1182, 1248],
       [1681,  766, 1334, ...,  959, 1004,   45]])

In [144]:
a = [i[1:]for i in item_ten_similar]
b = [i[1:]for i in user_ten_similar]
items_similar_df = pd.DataFrame({'itemid':range(len(item_ten_similar)),
                                 'similar_itemid':a})
user_similar_df = pd.DataFrame({'userid':range(len(user_ten_similar)),
                                 'similar_userid':b})


In [229]:
def predictions(training_data, type):
    sum1 = 0
    sim = 0
    pred = np.zeros(training_data.shape)
    if type=='user':
        for u in range(training_data.shape[0]):
            print('u', u)
            for i in range(training_data.shape[1]):
                print('i', i)
                sum1 = 0
                sim = 0
                if training_data[u, i] == 0:
                    similar_users = np.array(user_similar_df[user_similar_df.userid== u]['similar_userid'])[0]
                    print(similar_users)
                    for v in range(len(similar_users)):
                        print('v', v)
                        neigh = similar_users[v]
                        similar = user_similarity[u, neigh]
                        sum1 +=  similar * (rating_table.ix[similar_users[v]+1, i+1]-mean_rating_users[similar_users[v]+1])
                         
                        sim += similar
                    pred[u, i]=mean_rating_users[u+1]+(sum1/sim)
        return pred                
    
                    
                                        
                                      
                                      

        

In [230]:
testing_data = test_data_matrix[:2][:].copy() 
prediction_matrix = predictions(testing_data, 'user')
#train_data_matrix

u 0
i 0
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 1
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 2
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 3
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 4
i 5
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 6
i 7
i 8
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 9
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 10
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 11
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 12
i 13
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 14
[863 434 888 267 386 456 302 221 513 737]
v 0
v 1
v 2
v 3
v 4
v 5
v 6
v 7
v 8
v 9
i 15
[863 434 888 267 386 456

In [231]:
prediction_matrix[0]

array([ 4.24447716,  2.55790534,  1.05311243, ..., -0.05347171,
       -0.05347171, -0.05347171])

In [241]:
m[0]

'Toy Story (1995)'

In [252]:
movie_predict_user1 = prediction_matrix[0].nonzero()
movieid = []
for i in movie_predict_user1:
    for j in i:
        movieid.append(j)
rating_predicted = prediction_matrix[0][movieid]

movienames = [m[i] for i in movieid]
len(rating_predicted)

1605

In [255]:
prediction_df = pd.DataFrame({'itemid': movieid, 'movie':movienames, 'rating_predicted':rating_predicted})

In [261]:
prediction_df.sort_values(by='rating_predicted', ascending=False)

Unnamed: 0,itemid,movie,rating_predicted
121,172,"Princess Bride, The (1987)",4.745762
66,95,Terminator 2: Judgment Day (1991),4.745391
120,171,"Empire Strikes Back, The (1980)",4.550194
279,356,One Flew Over the Cuckoo's Nest (1975),4.349333
8,11,"Usual Suspects, The (1995)",4.259876
117,167,Monty Python and the Holy Grail (1974),4.250062
137,194,"Terminator, The (1984)",4.246165
0,0,Toy Story (1995),4.244477
125,182,Alien (1979),4.146004
149,209,Indiana Jones and the Last Crusade (1989),4.141217


In [200]:
test_data_matrix[0].nonzero()

(array([  4,   6,   7,  12,  17,  18,  28,  29,  32,  35,  36,  39,  42,
         45,  48,  49,  53,  55,  57,  62,  65,  67,  72,  75,  77,  78,
         84,  88,  89,  97,  98,  99, 105, 109, 110, 114, 122, 123, 129,
        132, 140, 143, 145, 147, 150, 156, 157, 159, 160, 164, 168, 173,
        175, 176, 178, 180, 181, 196, 199, 208, 215, 216, 217, 221, 230,
        231, 234, 236, 241, 243, 245, 252, 257, 262, 265, 267, 269]),)

In [None]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [None]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [None]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

In [207]:
movies.columns=['itemid', 'movie']

In [208]:
m = movies['movie']
m

0                                        Toy Story (1995)
1                                        GoldenEye (1995)
2                                       Four Rooms (1995)
3                                       Get Shorty (1995)
4                                          Copycat (1995)
5       Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6                                   Twelve Monkeys (1995)
7                                             Babe (1995)
8                                 Dead Man Walking (1995)
9                                      Richard III (1995)
10                                   Seven (Se7en) (1995)
11                             Usual Suspects, The (1995)
12                                Mighty Aphrodite (1995)
13                                     Postino, Il (1994)
14                              Mr. Holland's Opus (1995)
15                     French Twist (Gazon maudit) (1995)
16                             From Dusk Till Dawn (1996)
17            

In [92]:
rating_data[rating_data.userid==1]
rating_data['movie_name'] = rating_data.itemid.apply(lambda x:m[x-1])