In [None]:
from math import sqrt

import pandas as pd

import json
import numpy as np

from google.colab import drive
drive.mount('/content/drive/')

# Pandas config
def pandas_config(max_rows=20, max_columns=None):
    """Set the notebook-wide pandas display limits.

    Args:
        max_rows: Largest number of rows pandas renders before truncating
            the output. Defaults to 20.
        max_columns: Largest number of columns pandas renders; ``None``
            shows every column. Defaults to ``None``.
    """
    pd.set_option('display.max_rows', max_rows)
    pd.set_option('display.max_columns', max_columns)

    
pandas_config()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Load Data
Plot keywords and metadata from IMDB, ratings from the movielens data. Dataset located here:
https://www.kaggle.com/rounakbanik/the-movies-dataset?select=movies_metadata

Ratings from the movie club's members are collected weekly in a second dataset and added to the MovieLens rating data for personalized predictions.


In [None]:
# All four input CSVs sit in the same Drive folder; read them in a fixed order
# and bind each frame to its own name.
DATA_DIR = '/content/drive/MyDrive/online learning/movierecs/data'

ratings, movieclub, keywords, movie_metadata = (
    pd.read_csv(DATA_DIR + '/' + file_name)
    for file_name in ('ratings.csv', 'movieclub.csv', 'keywords.csv', 'movies_metadata.csv')
)

  interactivity=interactivity, compiler=compiler, result=result)


## Data preparation

Prepare ratings, movieclub and movies data to be merged

In [None]:
# Both rating sources are reduced to the same three columns. Rows of the club
# data with any missing value are dropped before movieId is cast to int.
rating_columns = ["userId", "movieId", "rating"]
ratings = ratings[rating_columns]
movieclub = movieclub[rating_columns].dropna().astype({'movieId': int})
movieclub.head()

# Reduce the metadata to an (integer movieId, title) lookup so it can be
# joined to the rating frames on movieId.
movies = (
    movie_metadata[['id', 'title']]
    .dropna()
    .rename(columns={'id': 'movieId'})
    .astype({'movieId': int})
)
movies.head()

# test
# movies.loc[movies['movieId'] == 480]

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


Reformat the dictionary-style keyword data so that each movie has an array of keywords. Movie keywords will be used for the content-based part of the recommendation algorithm.

In [None]:
## Convert each row's stringified list of keyword dictionaries into a plain list of keyword names
import ast


def _keyword_names(raw):
    """Return the ``name`` of every keyword dictionary encoded in ``raw``.

    ``raw`` is parsed as a Python literal (a list of dicts that each carry a
    ``name`` key).  Parsing it as a literal, instead of swapping quote
    characters and decoding JSON, keeps keywords that themselves contain an
    apostrophe intact.

    Args:
        raw: Cell value from the ``keywords`` column.

    Returns:
        List of keyword names.  A value that cannot be parsed (missing cell,
        malformed text, entries without ``name``) yields an empty list for
        that row only, so one bad row never leaks into another row's list.
    """
    try:
        return [entry['name'] for entry in ast.literal_eval(raw)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        return []


keywords['word_array'] = keywords['keywords'].map(_keyword_names)
keywords['id'] = keywords['id'].astype(int)

# Keep just the join key (renamed to match the rating frames) and the keyword lists.
keywords = keywords.rename(columns={'id': 'movieId'})[['movieId', 'word_array']]
keywords.head()

Unnamed: 0,movieId,word_array
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on childrens..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


Concatenate MovieLens data with friends' rating data.

The MovieLens data is very large, so to improve latency I'm filtering out movies that we haven't watched as a group. The resulting dataset consists of user ratings (both from the MovieLens dataset and from my friends) for movies that have been selected and discussed for movie club.
 

In [None]:
### Alissa    270897
### Byron     270898
### Chelsea   270899
### Hannah    270900
### Harrison  270901
### Martin    270902
### Michael   270903
### Katherine 270904

# Stack the club's ratings underneath the existing ratings; ignore_index
# renumbers the combined rows 0..n-1.
ratings = pd.concat([ratings, movieclub], ignore_index=True)

# Ids of every movie the club has rated, plus id 480, which is always kept.
movies_watched = [*movieclub['movieId'].unique()]
movies_to_keep = [*movies_watched, 480]

# Restrict the title lookup to those ids.
is_kept_movie = movies['movieId'].isin(movies_to_keep)
movies = movies[is_kept_movie]
movies.head()

Unnamed: 0,movieId,title
167,10428,Hackers
232,397,French Kiss
905,8356,An Affair to Remember
1063,9504,Glengarry Glen Ross
1182,279,Amadeus


Also filter out unwatched movies from the keywords dataset

In [None]:
# Drop every movie outside movies_to_keep first, so the string conversion
# below only runs on the rows that are actually kept.
keywords = keywords[keywords['movieId'].isin(movies_to_keep)].copy()

# Flatten each keyword list to its text form without the square brackets,
# e.g. ['paris', 'airport'] -> "'paris', 'airport'".  The pattern is a raw
# string (a plain '\[' is an invalid escape sequence) and is applied to the
# word_array column only instead of to the whole frame.
keywords['word_array'] = (
    keywords['word_array'].astype('str').str.replace(r'[\[\]]', '', regex=True)
)

keywords.head()

Unnamed: 0,movieId,word_array
167,10428,"'female nudity', 'hacker', 'nudity', 'computer..."
232,397,"'paris', 'airport', 'in love with enemy', 'can..."
905,8356,"'painting', 'singer', 'night club', 'cruise sh..."
1063,9504,"'robbery', 'office', 'shop', 'estate agent', '..."
1182,279,"'italy', 'composer', 'opera', 'talent', 'music..."


Merge ratings, movies and keywords into a single dataframe

In [None]:
# Attach the title to every rating, then the keyword string.  Both joins use
# the default (inner) merge on movieId, so a rating whose movieId is missing
# from either lookup is dropped.
# NOTE(review): this assumes the rating files and the metadata/keyword files
# share one movie-id space -- TODO confirm against the dataset description.
ratings2 = ratings.merge(movies, on="movieId")
ratings2.head()

data = ratings2.merge(keywords, on="movieId")

# movieId is handled as text from here on.
data = data.astype({'movieId': 'str'})

data.tail(10)

Unnamed: 0,userId,movieId,rating,title,word_array
158784,270901,11072,5.0,Blazing Saddles,"'gun', 'saloon', 'governor', 'marching band', ..."
158785,270902,11072,3.0,Blazing Saddles,"'gun', 'saloon', 'governor', 'marching band', ..."
158786,270903,11072,3.0,Blazing Saddles,"'gun', 'saloon', 'governor', 'marching band', ..."
158787,270898,14353,4.0,Repo! The Genetic Opera,'dystopia'
158788,270899,14353,4.5,Repo! The Genetic Opera,'dystopia'
158789,270900,14353,4.0,Repo! The Genetic Opera,'dystopia'
158790,270901,14353,2.0,Repo! The Genetic Opera,'dystopia'
158791,270902,14353,2.0,Repo! The Genetic Opera,'dystopia'
158792,270903,14353,1.0,Repo! The Genetic Opera,'dystopia'
158793,270904,14353,4.5,Repo! The Genetic Opera,'dystopia'


## LightFM data inputs

The lightFM algorithm takes up to three inputs: the user-item interaction matrix, the item feature matrix and the user feature matrix. 

In [None]:
#### User-item interaction matrix (movie ratings)
# Rows are users, columns are movies.  Ratings that share the same
# (userId, movieId) pair are summed; pairs without a rating become 0.
df = data[["movieId", "userId", "rating"]].reset_index()
rating_per_pair = df.groupby(['userId', 'movieId'])['rating'].sum()
interactions = rating_per_pair.unstack().fillna(0)
interactions.tail(10)
# df.shape

movieId,1018,10428,11072,12477,1262,1391,1427,14353,169813,19,1946,20375,20992,210479,24650,24804,253306,26670,279,3175,397,436,480,483,5548,5994,612,6615,71859,793,8356,86829,925,9470,9504,9707,97370
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
270894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270897,0.0,1.0,0.0,3.0,4.0,0.0,1.0,0.0,5.0,0.0,0.5,0.25,0.0,4.0,2.5,0.0,0.0,0.0,0.0,2.5,0.0,3.5,0.0,0.0,0.0,3.0,0.0,0.0,1.5,0.5,0.0,1.0,0.0,3.5,1.0,1.5,0.0
270898,4.5,3.5,3.0,3.0,2.5,4.5,2.5,4.0,4.0,0.0,2.0,1.5,4.0,3.5,1.5,5.0,3.0,4.0,2.5,3.5,0.0,0.0,0.0,0.0,3.5,0.0,4.0,4.5,1.5,3.0,4.5,3.5,4.0,1.5,0.0,1.5,5.0
270899,4.0,3.0,4.0,3.5,3.0,4.5,4.0,4.5,4.5,3.5,3.5,3.5,4.5,2.5,1.0,0.0,4.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,1.5,2.5,4.0,4.5,2.0,0.0,3.5,4.5,0.0,1.5,3.0,4.0
270900,5.0,1.0,3.0,1.0,5.0,4.0,2.5,4.0,4.5,3.0,1.5,0.25,0.0,1.5,0.5,1.5,3.5,4.0,3.5,0.0,0.0,4.0,0.0,3.5,4.0,2.0,1.0,3.0,2.0,4.0,4.5,0.25,4.5,3.5,2.0,2.0,0.0
270901,1.5,4.0,5.0,2.5,3.0,1.5,0.5,2.0,1.0,3.5,3.0,5.0,2.5,3.0,0.5,5.0,3.0,3.5,4.0,3.0,4.5,3.0,0.0,2.0,3.5,2.0,3.5,4.0,0.25,1.5,3.0,1.5,3.0,3.5,3.5,2.0,1.5
270902,4.0,3.5,3.0,4.5,3.5,4.5,3.5,2.0,4.0,3.0,2.5,4.0,3.0,3.5,2.0,3.5,3.0,3.0,5.0,4.0,2.0,2.0,0.0,0.0,3.5,2.5,3.5,0.0,3.0,2.5,3.0,2.5,4.0,3.5,3.0,1.5,0.0
270903,0.25,3.0,3.0,4.0,4.5,5.0,4.0,1.0,4.0,3.0,3.0,5.0,4.0,1.5,0.5,1.5,3.5,0.25,5.0,4.0,3.5,4.0,0.0,0.5,5.0,3.5,5.0,4.5,0.25,2.5,3.5,2.0,4.0,4.0,1.0,2.0,5.0
270904,0.25,0.0,0.0,0.0,0.0,0.0,5.0,4.5,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,4.5,0.25,0.0,0.0,0.0,3.5,3.5,0.0,0.0


In [None]:
### Map every user id to its row position in the interaction matrix (used to address users when predicting)
user_id = list(interactions.index)
user_dict = {uid: row_position for row_position, uid in enumerate(user_id)}

### Map every movie id to its title; when a movie id repeats, the last row's title is kept
item_dict = dict(zip(data["movieId"], data["title"]))

item_dict

{'1018': 'Mulholland Drive',
 '10428': 'Hackers',
 '11072': 'Blazing Saddles',
 '12477': 'Grave of the Fireflies',
 '1262': 'Stranger Than Fiction',
 '1391': 'Y Tu Mamá También',
 '1427': 'Perfume: The Story of a Murderer',
 '14353': 'Repo! The Genetic Opera',
 '169813': 'Short Term 12',
 '19': 'Metropolis',
 '1946': 'eXistenZ',
 '20375': 'Raid',
 '20992': 'Brother',
 '210479': 'Locke',
 '24650': 'The Fountainhead',
 '24804': 'Black Dynamite',
 '253306': 'Housebound',
 '26670': 'Noises Off...',
 '279': 'Amadeus',
 '3175': 'Barry Lyndon',
 '397': 'French Kiss',
 '436': 'Maria Full of Grace',
 '480': 'Monsoon Wedding',
 '483': 'Wild at Heart',
 '5548': 'RoboCop',
 '5994': 'The Family Man',
 '612': 'Munich',
 '6615': 'Lars and the Real Girl',
 '71859': 'We Need to Talk About Kevin',
 '793': 'Blue Velvet',
 '8356': 'An Affair to Remember',
 '86829': 'Inside Llewyn Davis',
 '925': 'Do the Right Thing',
 '9470': 'Kung Fu Hustle',
 '9504': 'Glengarry Glen Ross',
 '9707': 'Bubba Ho-tep',
 '973

In [None]:
#### Item feature matrix. Turns each movie's plot keywords into a sparse count matrix that LightFM
#### uses for the content-based part of the recommendation algorithm.
from sklearn.feature_extraction.text import CountVectorizer


def tokens(x):
    """Split a ', '-separated keyword string into its individual keywords."""
    return x.split(', ')


# Exactly one row per movie, ordered like the columns of `interactions`.
# LightFM pairs row i of the item-feature matrix with item i of the
# interaction matrix, so the rows must follow the column order; first-appearance
# order in `data` does not.  De-duplicating on movieId alone guarantees one
# row per movie even if a movie carries more than one keyword string, and
# `.loc` raises a KeyError if a column's movie has no keyword row.
item_features = (
    data[["movieId", "word_array"]]
    .drop_duplicates("movieId")
    .set_index("movieId")
    .loc[interactions.columns]
    .reset_index()
)

# Keep only the 100 most frequent keyword tokens as features.
cv = CountVectorizer(tokenizer=tokens, max_features=100)

item_features_csr = cv.fit_transform(item_features['word_array'])

## LightFM recommendation algorithm

In [None]:
### imports
!pip install lightfm
from lightfm import LightFM
import scipy
from scipy import sparse 
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM



In [None]:
## Algorithms that LightFM accepts as an argument
# logistic: useful when both positive (1) and negative (-1) interactions are present.
# BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.
# WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.
# k-OS WARP: k-th order statistic loss [3]. A modification of WARP that uses the k-th positive example for any given user as a basis for pairwise updates.

## Can also check the collaborative only algorithm accuracy
# model.fit(x,epochs=60,num_threads = 10, item_features=None)

# Hold out 20% of the interactions and fit on the remaining 80% only.
# Fitting on the full matrix `x` would show the model the held-out entries
# and inflate every test metric.  The fixed seed makes the split and the
# fitted model repeatable across runs.
x = sparse.csr_matrix(interactions.values)
train, test = random_train_test_split(
    interactions=x, test_percentage=0.2, random_state=np.random.RandomState(42)
)
model = LightFM(no_components=50, loss='warp', random_state=42)
model.fit(train, epochs=10, item_features=item_features_csr, user_features=None)

<lightfm.lightfm.LightFM at 0x7ff8cc2290d0>

## LightFM model evaluation (collaborative and content-based filtering)

In [None]:
# Precision@5 and ROC AUC, each averaged over users, for both halves of the split.
train_precision = precision_at_k(model, train, k=5, item_features=item_features_csr).mean()
train_auc = auc_score(model, train, item_features=item_features_csr).mean()

# For the held-out half the training interactions are passed as well, so that
# items a user already interacted with in `train` are left out of that user's
# ranking instead of being counted as misses.
test_precision = precision_at_k(
    model, test, train_interactions=train, k=5, item_features=item_features_csr
).mean()
test_auc = auc_score(
    model, test, train_interactions=train, item_features=item_features_csr
).mean()

print('Precision: train %.2f' % (train_precision))
print('AUC: train %.2f' % (train_auc))
print('Precision: test %.2f' % (test_precision))
print('AUC: test %.2f' % (test_auc))


Precision: train 0.30
AUC: train 1.00
Precision: test 0.22
AUC: test 0.99


In [None]:
## model and accuracy for entire dataframe to use for predictions

# Refit on every available interaction; this model replaces the one fitted above.
full_data = sparse.csr_matrix(interactions.values)
model = LightFM(no_components=50, loss='warp')
model.fit(full_data, epochs=10, item_features=item_features_csr)

# Score the refitted model on `x`, averaged over users.
scoring_options = {'item_features': item_features_csr}
precision = precision_at_k(model, x, k=5, **scoring_options).mean()
auc = auc_score(model, x, **scoring_options).mean()

print('Precision: %.2f' % precision)
print('AUC:%.2f' % auc)

Precision: 0.33
AUC:1.00


## Export algorithm and data dictionaries for future recommendations

In [None]:
##save model and dataframes

import pickle
import json
from scipy import sparse
import numpy
# Persist the trained model; the context manager closes the file even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

def convert(o):
    """``default`` hook for ``json.dump``: turn NumPy values into plain Python ones.

    Args:
        o: Object the JSON encoder could not serialise on its own.

    Returns:
        ``int`` for any NumPy integer, ``float`` for any NumPy float, and a
        (nested) list for a NumPy array.

    Raises:
        TypeError: If ``o`` is none of the above, which is how the JSON
            encoder expects an unsupported object to be reported.
    """
    if isinstance(o, numpy.integer):
        return int(o)
    if isinstance(o, numpy.floating):
        return float(o)
    if isinstance(o, numpy.ndarray):
        return o.tolist()
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

# Save the user-rating interaction matrix (sparse)
sparse.save_npz('rating_interaction.npz', x)

### Save the item-feature sparse matrix
sparse.save_npz('item_feature_sparse.npz', item_features_csr)

# Save the dense interactions frame; the context manager closes the file
# handle that a bare open() would leave dangling.
with open('interactions.pkl', 'wb') as interactions_file:
    pickle.dump(interactions, interactions_file)

## Save the item representations of the trained model
# NOTE(review): called without `features`, so this presumably returns one
# representation per keyword feature rather than per movie -- confirm against
# the LightFM docs whether item_features_csr should be passed here.
item_representations = model.get_item_representations()
with open('item_representations.pkl', 'wb') as representations_file:
    pickle.dump(item_representations, representations_file)
# To reload later:
#item_representations = pickle.load(open('item_representations.pkl','rb'))

In [None]:
# Vocabulary fitted by the CountVectorizer (token -> feature index).
item_name_dict = cv.vocabulary_

#### Write each lookup table to its own JSON file, in a fixed order.
#### `convert` is the fallback for values the JSON encoder rejects.
json_exports = (
    ('item_name_dict.json', item_name_dict),
    ('item_dict.json', item_dict),
    ('user_dict.json', user_dict),
)
for json_path, lookup in json_exports:
    with open(json_path, 'w') as fp:
        json.dump(lookup, fp, default=convert)

## Make Predictions for members

In [None]:
### Alissa    270897
### Byron     270898
### Chelsea   270899
### Hannah    270900
### Harrison  270901
### Martin    270902
### Michael   270903
### Katherine 270904

# Print the interaction-matrix row position stored for Katherine (userId 270904).
wanted_user = 270904.0
for user_key, row_position in user_dict.items():
    if user_key != wanted_user:
        continue
    print(user_key, '->', row_position)


270904 -> 95668


In [None]:
n_users, n_items = x.shape

# Club member -> userId in the ratings data.
member_user_ids = {
    'alissa': 270897,
    'byron': 270898,
    'chelsea': 270899,
    'hannah': 270900,
    'harrison': 270901,
    'martin': 270902,
    'michael': 270903,
    'katherine': 270904,
}


def _scaled_scores(user_index):
    """Score every movie for one user and rescale the scores linearly to 0-5.

    Args:
        user_index: Row position of the user in the interaction matrix.

    Returns:
        List with one value per movie, in interaction-matrix column order,
        where the user's lowest raw score maps to 0 and the highest to 5.
    """
    raw_scores = model.predict(
        user_index, np.arange(n_items), item_features=item_features_csr, user_features=None
    )
    return list(np.interp(raw_scores, (raw_scores.min(), raw_scores.max()), (0, +5)))


# Row positions come from user_dict instead of hard-coded numbers, so they stay
# correct when the rating data changes.  Every member is rescaled from their
# own raw scores (previously chelsea's values were derived from byron's).
member_scores = {
    member: _scaled_scores(user_dict[member_id])
    for member, member_id in member_user_ids.items()
}

movieIds = list(interactions.columns)
# Column order of the exported table.
column_order = ['alissa', 'byron', 'hannah', 'chelsea', 'harrison', 'martin', 'michael', 'katherine']
predictions = pd.DataFrame(
    {'movieIds': movieIds, **{member: member_scores[member] for member in column_order}}
)

In [None]:
# Write the per-member predicted scores to predictions.csv (path relative to the working directory).
predictions.to_csv('predictions.csv')