In [2]:
!pip install bottleneck

Collecting bottleneck
[?25l  Downloading https://files.pythonhosted.org/packages/05/ae/cedf5323f398ab4e4ff92d6c431a3e1c6a186f9b41ab3e8258dff786a290/Bottleneck-1.2.1.tar.gz (105kB)
[K    100% |████████████████████████████████| 112kB 2.4MB/s 
Building wheels for collected packages: bottleneck
  Running setup.py bdist_wheel for bottleneck ... [?25l- \ | / - \ | / done
[?25h  Stored in directory: /root/.cache/pip/wheels/f2/bf/ec/e0f39aa27001525ad455139ee57ec7d0776fe074dfd78c97e4
Successfully built bottleneck
Installing collected packages: bottleneck
Successfully installed bottleneck-1.2.1


In [0]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd

import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer

import bottleneck as bn

In [5]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2018-12-15 11:02:01--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.34.235
Connecting to files.grouplens.org (files.grouplens.org)|128.101.34.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2018-12-15 11:02:02 (1.52 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [6]:
!ls -hl

total 964K
-rw-r--r-- 1 root root 956K Sep 26 20:59 ml-latest-small.zip
drwxr-xr-x 1 root root 4.0K Dec 10 17:34 sample_data


In [7]:
!mkdir data
!cd data && unzip ../ml-latest-small.zip
!ls -hl data/ml-latest-small/

Archive:  ../ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  
total 3.2M
-rw-r--r-- 1 root root 194K Sep 26 20:50 links.csv
-rw-r--r-- 1 root root 483K Sep 26 20:49 movies.csv
-rw-r--r-- 1 root root 2.4M Sep 26 20:49 ratings.csv
-rw-r--r-- 1 root root 8.2K Sep 26 20:50 README.txt
-rw-r--r-- 1 root root 116K Sep 26 20:49 tags.csv


In [8]:
!head data/ml-latest-small/ratings.csv

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041


In [0]:
### Change `DATA_DIR` to the directory that holds the unpacked MovieLens "ml-latest-small" CSV files.
DATA_DIR = 'data/ml-latest-small/'
# The first line of ratings.csv is the column header (userId, movieId, rating, timestamp).
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), header=0)

In [29]:
# Peek at the first few rows whose rating is strictly below 2.
raw_data.loc[raw_data['rating'] < 2].head()

Unnamed: 0,userId,movieId,rating,timestamp
205,1,3176,1.0,964983504
261,3,31,0.5,1306463578
262,3,527,0.5,1306464275
263,3,647,0.5,1306463619
264,3,688,0.5,1306464228


In [30]:
# Number of rating rows loaded from the CSV.
raw_data.shape[0]

100836

In [0]:
## Keep only ratings above 3.5 (i.e. drop everything lower than 4, assuming half-step rating values).
raw_data = raw_data.loc[raw_data['rating'] > 3.5]

In [32]:
# Number of rows left after the rating threshold.
raw_data.shape[0]

48580

In [0]:
def get_count(tp, id):
    """Count how many rows of `tp` fall under each distinct value of column `id`.

    Parameters
    ----------
    tp : pandas.DataFrame
        Table holding the column to count over.
    id : str
        Name of the column to group by. (The name shadows the builtin `id`,
        but it is kept so that keyword callers keep working.)

    Returns
    -------
    pandas.Series
        Row counts, indexed by the distinct values of `tp[id]`.
    """
    # Group with the key as the index. With `as_index=False`, pandas >= 1.1
    # makes `.size()` return a DataFrame (key column + 'size') on a RangeIndex,
    # which breaks callers that filter with `count.index[count >= threshold]`.
    return tp.groupby(id).size()

In [0]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop interactions of rarely-seen items and of low-activity users.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interactions with at least the columns `userId` and `movieId`.
    min_uc : int
        Keep only users with at least this many rows; values <= 0 disable
        the user filter.
    min_sc : int
        Keep only items with at least this many rows; values <= 0 disable
        the item filter.

    Returns
    -------
    tuple
        `(filtered_tp, usercount, itemcount)`, where both counts are computed
        with `get_count` on the filtered frame.
    """
    # Item filter runs first ...
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        kept_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(kept_items)]

    # ... then the user filter. Removing users can push a few items back
    # below `min_sc`; this is not re-checked.
    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        kept_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(kept_users)]

    # Recompute both counts so they describe the frame that is returned.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [0]:
# Users need at least 5 interactions; no minimum is imposed on items.
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=0)

In [36]:
# Share of the user x item grid that holds an observed interaction.
sparsity = raw_data.shape[0] / float(user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % (
    raw_data.shape[0],
    user_activity.shape[0],
    item_popularity.shape[0],
    sparsity * 100,
))

After filtering, there are 48562 watching events from 603 users and 6298 movies (sparsity: 1.279%)


In [37]:
# Number of interactions left after the user/item filtering.
raw_data.shape[0]

48562

In [0]:
# User ids that survived filtering (the index of the per-user counts).
unique_uid = user_activity.index

# Fix the seed so the shuffle below gives the same user order on every run.
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
# Reorder the user ids according to the random permutation.
unique_uid = unique_uid[idx_perm]

In [0]:
# Create train/validation/test users.
# `unique_uid` is sliced in order: the last `n_heldout_users` ids form the test set,
# the `n_heldout_users` ids before them the validation set, the rest the training set.
n_users = unique_uid.size
n_heldout_users = 100

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [0]:
# Interactions that belong to the training users only.
train_plays = raw_data[raw_data['userId'].isin(tr_users)]

In [0]:
# Items are numbered in order of first appearance in the training interactions,
# users by their position in `unique_uid`.
unique_sid = pd.unique(train_plays['movieId'])
show2id = {sid: i for i, sid in enumerate(unique_sid)}
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [0]:
def numerize(tp):
    """Translate the raw ids in `tp` into the integer indices of the lookup tables.

    Each `userId` is looked up in `profile2id` and each `movieId` in `show2id`;
    an id missing from its table raises `KeyError`. Returns a new DataFrame
    with the columns `uid` and `sid` (in that order) on a fresh default index.
    """
    uid = [profile2id[user] for user in tp['userId']]
    sid = [show2id[movie] for movie in tp['movieId']]
    return pd.DataFrame({'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [0]:
# Convert the training interactions from raw ids to integer (uid, sid) indices.
train_data = numerize(train_plays)

In [54]:
# First five (uid, sid) pairs.
train_data.head()

Unnamed: 0,uid,sid
0,45,0
1,45,1
2,45,2
3,45,3
4,45,4


In [0]:
# Persist the (uid, sid) pairs; index=False keeps the row index out of the file.
train_data.to_csv('train.csv', index=False)

In [0]:
# Distinct users and items present in the training pairs.
n_users = train_data['uid'].nunique()
n_items = train_data['sid'].nunique()

In [56]:
# (number of training users, number of training items)
(n_users, n_items)

(403, 5181)

In [0]:
def load_train_data(csv_file, n_items=None):
    """Load (uid, sid) interaction pairs into a user-by-item CSR matrix.

    Parameters
    ----------
    csv_file : str or file-like
        Anything `pandas.read_csv` accepts; must provide the integer columns
        `uid` and `sid`.
    n_items : int, optional
        Number of columns of the result. When omitted it is inferred as
        `max(sid) + 1`, the same way the number of rows is derived from `uid`.
        Pass it explicitly to force a wider item axis.

    Returns
    -------
    scipy.sparse.csr_matrix
        float64 matrix of shape `(max(uid) + 1, n_items)` with 1.0 at every
        (uid, sid) pair listed in the file (repeated pairs are summed).
    """
    tp = pd.read_csv(csv_file)
    rows, cols = tp['uid'].values, tp['sid'].values

    n_users = int(rows.max()) + 1
    if n_items is None:
        # The width used to come from a notebook-level `n_items` variable, so the
        # function only worked after the cell defining it had run. Deriving it
        # from the data keeps the function self-contained.
        n_items = int(cols.max()) + 1

    ones = np.ones(len(tp), dtype='float64')
    return sparse.csr_matrix((ones, (rows, cols)),
                             dtype='float64', shape=(n_users, n_items))

In [0]:
# Read the saved pairs back as a sparse matrix.
# NOTE: this rebinds `train_data`, which until here held the (uid, sid) DataFrame.
train_data = load_train_data('train.csv')

In [60]:
# Dense view of the top-left 5 x 10 corner of the interaction matrix.
train_data[:5, :10].toarray()

array([[0., 0., 1., 1., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [0]:
from tensorflow import keras

In [0]:
# Import Keras through TensorFlow: a bare `from keras import ...` resolves to the
# separate standalone `keras` package, whose objects should not be mixed with `tf.*`
# losses and ops used below.
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import metrics

import tensorflow as tf

In [0]:
m = models.Sequential()

### Layers
# TODO: add the model layers here, e.g. m.add(layers.Dense(...)).

# TODO: choose a loss from `tf.losses` and an optimizer, then compile the model.
# The keyword argument is `optimizer` (singular), not `optimizers`:
# m.compile(optimizer=..., loss=tf.losses...)

# TODO: train the model:
# m.fit(...)