In [3]:
import numpy as np
import pandas as pd

import graphlab

# Datasets:

MovieLens + IMDb/Rotten Tomatoes

---

### Description

---

This dataset is an extension of MovieLens10M dataset, published by GroupLeans 
research group.

http://www.grouplens.org 

It links the movies of MovieLens dataset with their corresponding web pages at 
Internet Movie Database (IMDb) and Rotten Tomatoes movie review systems.

http://www.imdb.com 

http://www.rottentomatoes.com 

From the original dataset, only those users with both rating and tagging information 
have been mantained.

---

### Data statistics

---
<table>
<tr style="border-bottom: 2pt solid black;"><th>count</th><th>name</th></tr>
<tr><td>2113</td><td>users</td> </tr>
<tr style="border-bottom: 2pt solid black;"><td>10197</td><td>movies</td></tr>

<tr><td>20</td><td>movie genres</td></tr>
<tr><td>20809</td><td>movie genre assignments</td> </tr>
<tr style="border-bottom: 2pt solid black;"><td></td><td>avg. 2.040 genres per movie</td></tr>

<tr><td>4060</td><td>directors</td></tr>
<tr><td>95321</td><td>actors</td></tr>
<tr><td></td><td>avg. 22.778 actors per movie</td></tr>
<tr style="border-bottom: 2pt solid black;"><td>72</td><td>countries</td></tr>

<tr><td>10197</td><td>country assignments</td></tr>
<tr><td></td><td>avg. 1.000 countries per movie</td></tr>
<tr><td>47899</td><td>location assignments</td></tr>
<tr style="border-bottom: 2pt solid black;"><td></td><td>avg. 5.350 locations per movie</td></tr>

<tr><td>13222</td><td>tags</td></tr>
<tr><td>47957</td><td>tag assignments (tas), i.e. tuples [user, tag, movie]</td></tr>
<tr><td></td><td>avg. 22.696 tas per user</td></tr>
<tr style="border-bottom: 2pt solid black;"><td></td><td>avg. 8.117 tas per movie</td></tr>

<tr><td>855598</td><td>ratings</td></tr>
<tr><td></td><td>avg. 404.921 ratings per user</td></tr>
<tr style="border-bottom: 2pt solid black;"><td></td><td>avg. 84.637 ratings per movie</td></tr>

</table>

\begin{array}
-R & = & R^u \cup R_{unknown} \\
R^u & = & R^u_{train} \cup R^u_{test}
\end{array}

### Metrics:

\begin{array}
-RMSE & =  &\sqrt{\frac{1}{|R_{test}|}\sum_{(u, i) \in R_{test}} (r_{ui} - \widehat{r}_{ui})^2} \\
MAP & = & \frac{\sum_{u \in U} AP^u}{|U|},
\end{array}
где $AP^u = \frac{1}{|Rel^u|} \sum_{(u, i) \in Rel^u} \frac{1}{k_i^u}$

In [16]:
def rmse_metric(R, test):
    value = 0
    for (user_id, movie_id), row in test.iterrows():
        try:
            cur_value = (row['rating'] - R.loc[user_id][movie_id]) ** 2
        except:
            cur_value = 0
        value += cur_value
    value /= test.shape[0]
    return np.sqrt(value)

def map_metric(R, test):
    def ap_metric(user_id, R, test):
        movie_ratings = R.ix[user_id]
        movie_indexs = list(R.ix[user_id].sort_values(ascending=False).index)
        value, count_rel = 0., 0
        for movie_id, _ in test.ix[user_id].iterrows():
            try:
                rating = movie_ratings.loc[movie_id]
            except:
                rating = 0

            if rating >= 3:
                k = movie_indexs.index(movie_id) + 1
                value += 1. / k
                count_rel += 1
        return value / count_rel if count_rel > 0 else 0
    
    value, count_rel = 0., 0
    for user_id in R.index:
        t = ap_metric(user_id, R, test)
        if t > 0:
            value += t
            count_rel += 1
    
    return value / count_rel if count_rel > 0 else 0

# Коллаборативная фильтрация

In [5]:
data = pd.read_table('data/user_ratedmovies-timestamps.dat', encoding='cp1251', index_col=['userID', 'movieID'])
data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userID,movieID,Unnamed: 2_level_1,Unnamed: 3_level_1
75,3,1.0,1162160236000
75,32,4.5,1162160624000
75,110,4.0,1162161008000
75,160,2.0,1162160212000
75,163,4.0,1162160970000


In [6]:
in_train = np.array([True] * data.shape[0])

prev_user_id = None
for i, (index, row) in enumerate(data.iterrows()):
    user_id, movie_id = index
    if prev_user_id != user_id:
        prev_user_id = user_id
        border_timestamp = data.ix[user_id].sort_values('timestamp').iloc[-3]['timestamp']
    
    timestamp = row['timestamp']
    if timestamp >= border_timestamp:
        in_train[i] = False

train = data[in_train]
test = data[np.logical_not(in_train)]
test.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userID,movieID,Unnamed: 2_level_1,Unnamed: 3_level_1
75,2571,4.5,1162161050000
75,5952,3.5,1162161040000
75,7153,3.5,1162161036000
78,8400,4.5,1177224301000
78,44694,2.0,1179550301000
78,50872,4.5,1188713380000


In [7]:
R = pd.pivot_table(train.reset_index(), 
                   columns='movieID', 
                   index='userID', 
                   values='rating', 
                   aggfunc=np.mean, 
                   fill_value=0)
R.shape

(2113, 10084)

# 1. Most popular method

In [8]:
R1 = R.copy(deep=True)
values = R1.apply(np.mean, axis=0).sort_values(ascending=False)

for user_id, row in R1.iterrows():
    for movie_id, rating in R1.loc[user_id].iteritems():
        if rating == 0:
            R1.set_value(user_id, movie_id, values.loc[movie_id])

R1.head()

movieID,1,2,3,4,5,6,7,8,9,10,...,64983,64986,64990,64993,64997,64999,65006,65037,65088,65091
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75,2.212967,1.074539,1.0,0.053242,0.292239,1.06673,0.421912,0.050166,0.068386,1.147184,...,0.005206,0,0.003549,0,0.006862,0.000237,0,0.00284,0.001656,0
78,2.212967,1.074539,0.340748,0.053242,0.292239,1.06673,0.421912,0.050166,0.068386,1.147184,...,0.005206,0,0.003549,0,0.006862,0.000237,0,0.00284,0.001656,0
127,2.212967,1.074539,0.340748,0.053242,0.292239,1.06673,0.421912,0.050166,0.068386,1.147184,...,0.005206,0,0.003549,0,0.006862,0.000237,0,0.00284,0.001656,0
170,3.0,2.0,0.340748,0.053242,0.292239,1.06673,0.421912,0.050166,0.068386,3.5,...,0.005206,0,0.003549,0,0.006862,0.000237,0,0.00284,0.001656,0
175,4.0,1.074539,0.340748,0.053242,0.292239,5.0,0.421912,0.050166,0.068386,1.147184,...,0.005206,0,0.003549,0,0.006862,0.000237,0,0.00284,0.001656,0


In [17]:
map_metric(R1, test)

0.033278868022330835

# 2. Item-based method

In [18]:
model1 = graphlab.recommender.item_similarity_recommender.create(graphlab.SFrame(data=train[['rating']].reset_index()),
                                                                 user_id='userID',
                                                                 item_id='movieID',
                                                                 target='rating',
                                                                 similarity_type='pearson')

This non-commercial license of GraphLab Create for academic use is assigned to fpm.yunusov@bsu.by and will expire on May 02, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1493801037.log


In [19]:
result = model1.predict(graphlab.SFrame(data=test.reset_index()))

In [29]:
R2 = R.copy(deep=True)
values = test.copy()
values['new_rating'] = result

for user_id, row in R2.iterrows():
    for movie_id, rating in R2.loc[user_id].iteritems():
        if rating == 0:
            try:
                value = values.loc[user_id, movie_id]['new_rating']
            except:
                value = 0.0
            R2.set_value(user_id, movie_id, value)
    break

R2.head()

movieID,1,2,3,4,5,6,7,8,9,10,...,64983,64986,64990,64993,64997,64999,65006,65037,65088,65091
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0
78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0
127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0
170,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0
175,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0


In [31]:
values.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp,new_rating
userID,movieID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
75,2571,4.5,1162161050000,4.179417
75,5952,3.5,1162161040000,4.024876
75,7153,3.5,1162161036000,4.088363
78,8400,4.5,1177224301000,3.658329
78,44694,2.0,1179550301000,3.859334
78,50872,4.5,1188713380000,3.990663
127,6013,3.0,1190232446000,1.89792
127,6958,4.0,1190232418000,2.453377
127,30883,2.5,1190232402000,1.980769
170,4251,4.0,1180699501000,3.608696


In [30]:
print('MAP: %f' % map_metric(R2, test))
print('RMSE: %f' % rmse_metric(R2, test))

MAP: 0.066865
RMSE: 3.730742


# Разреженный SVD