In [307]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotly.offline import plot
from plotly.graph_objs import *
import datetime
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import stats, linalg

In [308]:
user = pd.read_csv('data/u.user', delimiter='|', header=None, names=['userid', 'age', 'gender', 'occupation', 'zipcode'])
rating_data = pd.read_csv('data/u.data', delimiter='\t', header=None, names=['userid', 'itemid', 'rating', 'timestamp'])

In [309]:
movies = pd.read_csv('data/u.item', delimiter='|', encoding='ISO-8859-1', header=None, usecols=[0, 1])

In [310]:
rating_data.timestamp = rating_data.timestamp.apply(lambda x: datetime.datetime.
                                                    fromtimestamp(x).strftime('%Y-%m-%d'))

### Average rating by occupation

In [311]:
user_data = pd.merge(user, rating_data, on='userid')

In [11]:
occupation_rating = user_data.groupby('occupation').mean()['rating']
data = [Bar(x=occupation_rating.index,y=occupation_rating.values)]
layout = Layout(xaxis = dict(title='Occupation'), yaxis=dict(title='Average Rating'))
fig = Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### Age-wise histogram

In [264]:
#data = [Histogram(x = user.age)]
#layout = Layout(xaxis = dict(title='Age'), yaxis=dict(title='Frequency'))
#fig = Figure(data=data, layout=layout)
data = [Box(y = user.age, boxpoints='all',jitter=0.3,pointpos=-3)]
plot(data)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### Rating histogram

In [14]:
data = [Histogram(x = rating_data.rating)]
layout = Layout(xaxis = dict(title='Rating'), yaxis=dict(title='Frequency'))
fig = Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

In [312]:
user_data.head()

Unnamed: 0,userid,age,gender,occupation,zipcode,itemid,rating,timestamp
0,1,24,M,technician,85711,61,4,1997-11-03
1,1,24,M,technician,85711,189,3,1998-03-01
2,1,24,M,technician,85711,33,4,1997-11-03
3,1,24,M,technician,85711,160,4,1997-09-24
4,1,24,M,technician,85711,20,4,1998-02-14


### Average rating by year

In [313]:
year_rating = user_data.groupby('timestamp').mean()['rating']

In [17]:
data = [Scatter(x=year_rating.index,y=year_rating.values)]
layout = Layout(xaxis = dict(title='Year'), yaxis=dict(title='Average Rating'))
fig = Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### Average rating  by age and gender

In [314]:
user_data['AgeGroup'] = user_data.age.apply(lambda i:'{}-{}'.format(10*int(i/10), 10*(int(i/10)+1)))

In [315]:
age_gender_rating = user_data.groupby(['AgeGroup', 'gender']).mean()['rating']

In [316]:
age_gender_rating = age_gender_rating.unstack().fillna(0)
age_gender_rating

gender,F,M
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1
0-10,0.0,3.767442
10-20,3.552344,3.455969
20-30,3.420305,3.482504
30-40,3.672275,3.511744
40-50,3.548917,3.60841
50-60,3.535937,3.673972
60-70,3.26087,3.652308
70-80,3.239437,3.880952


In [317]:
trace1 = Bar(x=age_gender_rating.index, y=age_gender_rating.F, name='Female')
trace2 = Bar(x=age_gender_rating.index, y=age_gender_rating.M, name='Male')
data = [trace1, trace2]
layout = Layout(xaxis = dict(title='Age'), yaxis=dict(title='Average Rating'), bargroupgap=0.3)
fig =Figure(data=data, layout=layout)
plot(fig)

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

### movie item cosine similarity

In [318]:
train_data, test_data = cv.train_test_split(rating_data, test_size=0.25)

In [319]:
n_users = rating_data.userid.unique().size
n_items = rating_data.itemid.unique().size
train_data_matrix = np.zeros((n_users, n_items))
for i in train_data.itertuples():
    train_data_matrix[i[1]-1, i[2]-1] = i[3]

In [320]:
test_data_matrix = np.zeros((n_users, n_items))
for i in test_data.itertuples():
    test_data_matrix[i[1]-1, i[2]-1] = i[3]

In [321]:
user_similarity = 1-pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = 1-pairwise_distances(train_data_matrix.T, metric='cosine')

In [322]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [323]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [324]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 2.9640864069625428
Item-based CF RMSE: 3.1718404898714523
