In [1]:
import pandas as pd
from collections import Counter
import tensorflow as tf
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pprint

Import csv files into dataframes:

In [2]:
# Read the ml-20m ratings and movies CSV files. The first line of each file
# (presumably its header row) is skipped and the column names below are used instead.
ratings = pd.read_csv(
    '~/ml-20m/ratings.csv',
    skiprows=1,
    names=['userid', 'movieid', 'rating', 'timestamp'],
)
genres2 = pd.read_csv(
    '~/ml-20m/movies.csv',
    skiprows=1,
    names=['movieid', 'movienm', 'genreid'],
)

Create a dictionary that maps each movie ID to its genre string, so the genres can be mapped onto the ratings dataframe and used as context

In [3]:
# Lookup table: movie id -> pipe-delimited genre string (e.g. 'Comedy|Romance').
dictionary = dict(zip(genres2['movieid'], genres2['genreid']))

In [4]:
dictionary

{1: 'Adventure|Animation|Children|Comedy|Fantasy',
 2: 'Adventure|Children|Fantasy',
 3: 'Comedy|Romance',
 4: 'Comedy|Drama|Romance',
 5: 'Comedy',
 6: 'Action|Crime|Thriller',
 7: 'Comedy|Romance',
 8: 'Adventure|Children',
 9: 'Action',
 10: 'Action|Adventure|Thriller',
 11: 'Comedy|Drama|Romance',
 12: 'Comedy|Horror',
 13: 'Adventure|Animation|Children',
 14: 'Drama',
 15: 'Action|Adventure|Romance',
 16: 'Crime|Drama',
 17: 'Drama|Romance',
 18: 'Comedy',
 19: 'Comedy',
 20: 'Action|Comedy|Crime|Drama|Thriller',
 21: 'Comedy|Crime|Thriller',
 22: 'Crime|Drama|Horror|Mystery|Thriller',
 23: 'Action|Crime|Thriller',
 24: 'Drama|Sci-Fi',
 25: 'Drama|Romance',
 26: 'Drama',
 27: 'Children|Drama',
 28: 'Drama|Romance',
 29: 'Adventure|Drama|Fantasy|Mystery|Sci-Fi',
 30: 'Crime|Drama',
 31: 'Drama',
 32: 'Mystery|Sci-Fi|Thriller',
 33: 'Adventure|Romance|IMAX',
 34: 'Children|Drama',
 35: 'Drama|Romance',
 36: 'Crime|Drama',
 37: 'Documentary|IMAX',
 38: 'Children|Comedy',
 39: 'Comedy

In [5]:
# Attach the genre string of the rated movie to every rating row.
ratings['genres'] = ratings.movieid.map(dictionary)

In [6]:
# Turn each pipe-delimited genre string into a list of genre names.
ratings['genres'] = ratings['genres'].map(lambda genre_string: genre_string.split('|'))

Unnest the genres column so there is a row for each movie-genre pair

In [7]:
def unnest(df, col, reset_index=True):
    """Expand a list-valued column so that each list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. The flattened values are merged back on index
        labels, so the index is expected to be unique; duplicated labels would
        be cross-matched by the merge.
    col : str
        Name of the column whose values are iterables (each one is passed to
        ``list``).
    reset_index : bool, default True
        If True, the result gets a fresh 0..n-1 index. If False, every output
        row keeps the index label of the row it was expanded from.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col``, inner-merged with the flattened values, so
        ``col`` ends up as the last column. Rows whose list is empty do not
        appear in the result.
    """
    # One (index label, element) pair per list element.
    pairs = [
        (label, element)
        for label, values in df[col].apply(list).items()
        for element in values
    ]
    col_flat = pd.DataFrame(pairs, columns=['I', col]).set_index('I')

    # ``Series.iteritems`` and the positional ``axis`` argument of ``drop`` were
    # removed in pandas 2.0; ``items`` and ``columns=`` work on old and new versions.
    df = df.drop(columns=col)
    df = df.merge(col_flat, left_index=True, right_index=True)
    if reset_index:
        df = df.reset_index(drop=True)
    return df

# Replaces `ratings` with one row per (rating, genre) pair. Do not re-run this cell
# on its own output: unnest() applies list() to every value, which would split the
# already-flattened genre strings into single characters.
ratings=unnest(ratings, 'genres')

ratings.head()

Unnamed: 0,userid,movieid,rating,timestamp,genres
0,1,2,3.5,1112486027,Adventure
1,1,2,3.5,1112486027,Children
2,1,2,3.5,1112486027,Fantasy
3,1,29,3.5,1112484676,Adventure
4,1,29,3.5,1112484676,Drama


In [8]:
#ratings.to_csv('tffm_df.csv')

Drop timestamp column:

In [9]:
# Index by user id and discard the timestamp column.
# `columns=` replaces the positional axis argument (`drop('timestamp', 1)`), which
# pandas 2.0 no longer accepts; the chained form also avoids mutating in place.
ratings = ratings.set_index('userid').drop(columns='timestamp')

** Using ONLY 10 most common users (for now) because it is so computationally expensive **

In [10]:
# Keep only the rows of the 10 user ids that occur most often in the index.
# The count runs over the frame as it is at this point, so if the genres were
# already expanded, a rating of a multi-genre movie counts once per genre.
x = Counter(ratings.index).most_common(10)
top_k = [user_id for user_id, _ in x]
is_top_user = ratings.index.isin(top_k)
ratings = ratings[is_top_user]

In [11]:
ratings.shape

(133214, 3)

In [12]:
ratings.head()

Unnamed: 0_level_0,movieid,rating,genres
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8405,1,5.0,Adventure
8405,1,5.0,Animation
8405,1,5.0,Children
8405,1,5.0,Comedy
8405,1,5.0,Fantasy


create '_userid' column:

In [13]:
# Copy the user id out of the index into a regular column so that it can be
# converted to a categorical and one-hot encoded along with the other features.
ratings['_userid']=ratings.index

In [14]:
ratings.head()

Unnamed: 0_level_0,movieid,rating,genres,_userid
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8405,1,5.0,Adventure,8405
8405,1,5.0,Animation,8405
8405,1,5.0,Children,8405
8405,1,5.0,Comedy,8405
8405,1,5.0,Fantasy,8405


Convert movieid and genre to 'category' so they can be one-hot encoded:

In [15]:
# Cast the three identifier columns to categoricals in a single call so that
# pd.get_dummies() expands each of them into indicator columns.
ratings = ratings.astype({
    'movieid': 'category',
    'genres': 'category',
    '_userid': 'category',
})

** Use Pandas' get_dummies to one-hot-encode the genres and movie ID for each user: **



**takes about 1min to do the next cell (when using 30 most common users) **

In [16]:
# One-hot encode the categorical columns (movieid, genres, _userid); the numeric
# `rating` column is carried over as is. The result has one indicator column per
# category, which makes the frame very wide (its shape is shown two cells below).
trans_ratings=pd.get_dummies(ratings)

In [17]:
# Keep the raw user id as an ordinary column so rows can later be selected per user;
# it is dropped again before the features are passed to the model.
trans_ratings['userid2']=trans_ratings.index

In [18]:
trans_ratings.shape

(133214, 16191)

In [19]:
trans_ratings.head()

Unnamed: 0_level_0,rating,movieid_1,movieid_2,movieid_3,movieid_4,movieid_5,movieid_6,movieid_7,movieid_8,movieid_9,...,_userid_34576,_userid_59477,_userid_74142,_userid_79159,_userid_82418,_userid_118205,_userid_121535,_userid_125794,_userid_131904,userid2
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8405,5.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8405
8405,5.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8405
8405,5.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8405
8405,5.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8405
8405,5.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8405


Alias trans_ratings as 'df' for brevity

In [20]:
#trans_ratings.to_csv('trans_rating_10.csv')

In [21]:
# Plain alias, not a copy: `df` and `trans_ratings` refer to the same DataFrame.
df=trans_ratings

In [22]:
#df.drop(['_userid'], axis=1, inplace=True)

Set X to everything in dataframe (except rating), set y to 'rating'

In [23]:
# Features: every column except the target. `userid2` is still included here and is
# only removed after the train/test split.
X = df.drop(columns=['rating'])
# Target vector. `as_matrix()` was removed in pandas 1.0; `to_numpy()` is the
# supported replacement and already returns a NumPy array.
y = df['rating'].to_numpy()

In [24]:
#X = np.array(X)

In [25]:
#X = np.nan_to_num(X)

** test, train, split **

In [26]:
# Fixed seed so the split -- and therefore the reported test MSE -- is reproducible
# across "Restart & Run All".
# NOTE(review): rows are split individually, but the genre expansion produced several
# rows per (user, movie) rating that differ only in their genre indicator. Siblings
# of a test row can therefore land in the training set, so the test MSE is likely
# optimistic. A group-aware split keyed on (user, movie) would avoid this.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

**X_tr and X_te both contain the raw user ID column (userid2), so create new dataframes without it that can be fed into the model**

In [27]:
X_tr.head()

Unnamed: 0_level_0,movieid_1,movieid_2,movieid_3,movieid_4,movieid_5,movieid_6,movieid_7,movieid_8,movieid_9,movieid_10,...,_userid_34576,_userid_59477,_userid_74142,_userid_79159,_userid_82418,_userid_118205,_userid_121535,_userid_125794,_userid_131904,userid2
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
131904,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,131904
59477,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,59477
34576,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,34576
8405,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8405
79159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,79159


In [28]:
# `userid2` is only a bookkeeping column (the raw user id); strip it from both
# splits so that the model sees indicator features only.
X_train_withoutUsers = X_tr.drop(columns=['userid2'])
X_test_withoutUsers = X_te.drop(columns=['userid2'])

** create np.array from X_train_withoutUsers **

In [29]:
# Convert the feature frames to plain NumPy arrays; the model is configured with
# input_type='dense' and is fed these arrays directly.
X_train_withoutUsersArray=np.array(X_train_withoutUsers)
X_test_withoutUsersArray=np.array(X_test_withoutUsers)

** Run model **

In [30]:
# Build the tffm regressor that is fitted and evaluated in the following cell.
# NOTE(review): the parameter meanings noted below follow the tffm README -- confirm
# them against the installed tffm version.
model = TFFMRegressor(
    order=2,  # presumably: model feature interactions up to pairs
    rank=7,  # presumably: size of the latent factor vectors
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),  # TensorFlow 1.x optimizer API
    n_epochs=30,
    batch_size=-1,  # presumably "whole training set as one batch" -- TODO confirm
    init_std=0.001,
    input_type='dense'  # matches the dense NumPy arrays passed to fit()/predict()
)

In [31]:
# Fit on the training rows, predict the held-out rows and report the test MSE.
model.fit(X_train_withoutUsersArray, y_tr, show_progress=True)
predictions = model.predict(X_test_withoutUsersArray)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))

100%|██████████| 30/30 [13:24<00:00, 26.81s/epoch]


MSE: 0.40096736152587975


## Make predictions:

(this is the messy part - very difficult to predict new movies using such a sparse array)

Checking out how many unique users there are:

In [32]:
ratings._userid.unique()

[8405, 34576, 59477, 74142, 79159, 82418, 118205, 121535, 125794, 131904]
Categories (10, int64): [8405, 34576, 59477, 74142, ..., 118205, 121535, 125794, 131904]

Create DataFrame consisting only of user 125794:

In [33]:
# All feature rows of user 125794, selected via the raw-id helper column `userid2`.
test_df_125794=pd.DataFrame(X[X['userid2']==125794])

In [34]:
# The helper column has served its purpose; remove it so that only the feature
# columns used by the model remain.
test_df_125794 = test_df_125794.drop(columns='userid2')

Create list of unwatched movies for user125794:

In [35]:
# Keep only the movie indicator columns (column names matching the regex "movie.*").
filtered=test_df_125794.filter(regex="movie.*")

In [36]:
# Movie columns that are 0 in every row of this user, i.e. movies for which
# user 125794 has no rating in this data.
unwatched_user125794=list(filtered.columns[(filtered == 0).all()])

Create DataFrame consisting only of user 82418: 

In [48]:
# All feature rows of user 82418, selected via the raw-id helper column `userid2`.
test_df_82418=pd.DataFrame(X[X['userid2']==82418])

In [49]:
# The helper column has served its purpose; remove it so that only the feature
# columns used by the model remain.
test_df_82418 = test_df_82418.drop(columns='userid2')

Create list of unwatched movies for user 82418:

In [50]:
# Keep only the movie indicator columns (column names matching the regex "movie.*").
# Note that this re-binds `filtered`, which is also used for user 125794 above.
filtered=test_df_82418.filter(regex="movie.*")

In [51]:
# Movie columns that are 0 in every row of this user, i.e. movies for which
# user 82418 has no rating in this data.
unwatched_user82418=list(filtered.columns[(filtered == 0).all()])

Create dataframe consisting only of user 118205 (one of the most active users):

In [37]:
# All feature rows of user 118205, selected via the raw-id helper column `userid2`.
test_df_118205=pd.DataFrame(X[X['userid2']==118205])

In [38]:
# The helper column has served its purpose; remove it so that only the feature
# columns used by the model remain.
test_df_118205 = test_df_118205.drop(columns=['userid2'])

In [39]:
test_df_118205.head()

Unnamed: 0_level_0,movieid_1,movieid_2,movieid_3,movieid_4,movieid_5,movieid_6,movieid_7,movieid_8,movieid_9,movieid_10,...,_userid_8405,_userid_34576,_userid_59477,_userid_74142,_userid_79159,_userid_82418,_userid_118205,_userid_121535,_userid_125794,_userid_131904
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
118205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
118205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
118205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
118205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
118205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


User 118205 ---> at position [-4] in arrays

User 125794 ---> at position [-2] in arrays ==== no good with 118205

User 82418 ---> at position [-5] in the arrays === no good with 118205

User 125794 with 82418 ----> [-2] needs to be turned off, [-5] turned on

In [40]:

#temporary=np.array(test_df_118205.loc[test_df_118205['movieid_3']==1])

** Creating dictionary to map movie id back to title: **

In [41]:
# Second lookup table: movie id -> movie title, used to print readable results.
dictionary2 = dict(zip(genres2['movieid'], genres2['movienm']))

In [60]:
# Score movies that user 82418 has not rated by borrowing user 125794's feature rows
# for the same movie (they carry the movie and genre indicators) and switching the
# user indicator from 125794 to 82418. Only movies that 125794 has rated can be scored.
MAX_CANDIDATES = 1000  # cap on how many unwatched movie columns are examined

# Look the user indicator positions up by column name instead of hard-coding -2 / -5,
# so the swap stays correct if the set or order of users changes.
donor_pos = test_df_125794.columns.get_loc('_userid_125794')
target_pos = test_df_125794.columns.get_loc('_userid_82418')

predictList=[]
moviesList=[]

# Slice instead of range(1, 1000): this includes the first candidate and cannot run
# past the end of the list when there are fewer than 1000 unwatched movies.
for movie_col in unwatched_user82418[:MAX_CANDIDATES]:
    # Rows of user 125794 for this movie, as a NumPy matrix.
    donor_rows = np.array(test_df_125794.loc[test_df_125794[movie_col] == 1])
    if len(donor_rows) == 0:
        continue  # user 125794 has no row for this movie, so nothing to borrow
    donor_rows[:, donor_pos] = 0
    donor_rows[:, target_pos] = 1
    pred = model.predict(donor_rows)
    # Average the predictions over all borrowed rows of the movie into one score.
    predictList.append(np.average(pred))
    moviesList.append(int(movie_col.split('_')[1]))

In [61]:
# Pair each averaged prediction with its movie id and order the pairs from the
# highest predicted rating to the lowest.
sanitycheck = sorted(zip(predictList, moviesList), reverse=True)

In [62]:
sanitycheck

[(6.3760576, 2644),
 (5.4335232, 2459),
 (5.3093514, 2660),
 (5.0822992, 1477),
 (5.0349193, 2175),
 (5.0039158, 846),
 (5.0037074, 2743),
 (4.9885402, 841),
 (4.9528494, 2711),
 (4.9525871, 2744),
 (4.9362311, 1415),
 (4.8999758, 1355),
 (4.8732224, 874),
 (4.853281, 1241),
 (4.8459864, 362),
 (4.840693, 2159),
 (4.8343897, 1310),
 (4.8170509, 2440),
 (4.8055592, 1508),
 (4.787127, 1756),
 (4.7777824, 2361),
 (4.7729301, 1365),
 (4.754024, 1123),
 (4.6556993, 1750),
 (4.6451874, 879),
 (4.6438189, 2168),
 (4.6422453, 809),
 (4.6369333, 1695),
 (4.6310458, 2384),
 (4.6285925, 2625),
 (4.6063714, 786),
 (4.6034155, 2057),
 (4.5907888, 2425),
 (4.5877218, 735),
 (4.5303974, 2570),
 (4.520968, 8),
 (4.49933, 993),
 (4.4980044, 1725),
 (4.4943309, 1049),
 (4.4732833, 707),
 (4.439157, 1769),
 (4.4026198, 1181),
 (4.374506, 388),
 (4.3520002, 2606),
 (4.3515549, 335),
 (4.3403654, 2557),
 (4.3279161, 998),
 (4.3266649, 2493),
 (4.3175659, 367),
 (4.2943854, 1722),
 (4.2796278, 2618),
 (4.26

In [64]:
# Print the titles of the (up to) 50 highest-scoring movies. Slicing, rather than
# indexing positions 0..49, avoids an IndexError when fewer than 50 movies were scored.
for _score, movie_id in sanitycheck[:50]:
    print(dictionary2[movie_id])

Dracula (1931)
Texas Chainsaw Massacre, The (1974)
Thing from Another World, The (1951)
Love Jones (1997)
Déjà Vu (1997)
Flirt (1995)
Native Son (1986)
Eyes Without a Face (Yeux sans visage, Les) (1959)
My Life So Far (1999)
Otello (1986)
Thieves (Voleurs, Les) (1996)
Nightwatch (1997)
Killer: A Journal of Murder (1995)
Dead Alive (Braindead) (1992)
Jungle Book, The (1994)
Henry: Portrait of a Serial Killer (1986)
Hype! (1996)
Another Day in Paradise (1998)
Traveller (1997)
Prophecy II, The (1998)
Pink Flamingos (1972)
Ridicule (1996)
Perfect Candidate, A (1996)
Star Kid (1997)
Relic, The (1997)
Dance with Me (1998)
Fled (1996)
Artemisia (1997)
Babe: Pig in the City (1998)
Black Mask (Hak hap) (1996)
Eraser (1996)
Incredible Journey, The (1963)
General, The (1998)
Cemetery Man (Dellamorte Dellamore) (1994)
Walk on the Moon, A (1999)
Tom and Huck (1995)
Infinity (1996)
Education of Little Tree, The (1997)
Ghost and the Darkness, The (1996)
Mulholland Falls (1996)
Replacement Killers, Th