Imports

In [26]:
import pandas as pd
import numpy as np
import random
import scipy
import json
import nltk
import math
import re
import threading
import copy
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.regexp import (WordPunctTokenizer,wordpunct_tokenize)

Links for the datasets

In [2]:
url_links = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/links.csv'
url_movies = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/movies.csv'
url_ratings = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/ratings.csv'
url_tags = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/tags.csv'
url_genres = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/user_genre.json'

Datasets

In [3]:
ds_links = pd.read_csv(url_links, dtype = str)
ds_movies = pd.read_csv(url_movies, dtype = str)
ds_ratings = pd.read_csv(url_ratings)
ds_tags = pd.read_csv(url_tags, dtype = str)
ds_user_genres = pd.read_json(url_genres)

In [4]:
list_movies_id = list(ds_movies['movieId'].unique())
list_title_id = ds_movies['title'].tolist()
list_users_id = list(ds_ratings['userId'].unique())
list_movies_imdbid = ds_links['imdbId'].tolist()

Input ratings

In [6]:
movie_ids=[]
ratings=[]

for i in range(10):
  index=random.randint(0,len(list_movies_id)-1)
  while list_movies_id[index] in movie_ids:
    index=random.randint(0,len(list_movies_id)-1)
  rating = float(input('Rate the movie '+str(list_title_id[index])+' from 0.5 to 5.0 or say 0.0 if you haven\'t seen it :'))
  movie_ids.append(list_movies_id[index])
  ratings.append(rating)
print(movie_ids)
print(ratings)


Rate the movie 13th (2016) from 0.5 to 5.0 or say 0.0 if you haven't seen it :9


KeyboardInterrupt: 

Collaborative filtering

SVD

In [5]:
def getGenresByUser(user_id, ds=ds_user_genres):
  return ds[user_id]['like'], ds[user_id]['dislike']

print(getGenresByUser(1))

(['Film-Noir', 'Animation', 'Musical', 'Children', 'Drama', 'War'], ['Comedy', 'Sci-Fi', 'Mystery', 'Thriller', 'Horror'])


In [6]:
#hyperparams

test_ratio = 0.2 #fraction of data to be used as test set.
no_of_features = [8,10,12,14,17,20] # to test the performance over a different number of features

In [7]:
ds_ratings['userId'] = ds_ratings['userId'].astype('str')
ds_ratings['movieId'] = ds_ratings['movieId'].astype('str')

users = ds_ratings['userId'].unique() #list of all users
movies = ds_ratings['movieId'].unique() #list of all movies

test = pd.DataFrame(columns=ds_ratings.columns)
train = pd.DataFrame(columns=ds_ratings.columns)

for u in users:
  temp = ds_ratings[ds_ratings['userId'] == u]
  n = len(temp)
  test_size = int(test_ratio*n)

  temp = temp.sort_values('timestamp').reset_index()
  temp.drop('index', axis=1, inplace=True)

  dummy_test = temp.loc[n-1-test_size :]
  dummy_train = temp.loc[: n-2-test_size]
    
  test = pd.concat([test, dummy_test])
  train = pd.concat([train, dummy_train])

In [8]:
from scipy.linalg import sqrtm

def create_user_item_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):
    itemField = formatizer['item']
    userField = formatizer['user']
    valueField = formatizer['value']

    
    userList = data.iloc[:,userField].tolist()
    itemList = data.iloc[:,itemField].tolist()
    valueList = data.iloc[:,valueField].tolist()

    users = list(set(data.iloc[:,userField]))
    items = list(set(data.iloc[:,itemField]))

    users_index = {users[i]: i for i in range(len(users))}

    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}

    for i in range(0,len(data)):
      item = itemList[i]
      user = userList[i]
      value = valueList[i]

      pd_dict[item][users_index[user]] = value
    
    X = pd.DataFrame(pd_dict)
    X.index = users

    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}

    # users_index gives us a mapping of user_id to index of user
    # items_index provides the same for items

    return X, users_index, items_index

In [9]:
def svd(train, k):
    utilMat = np.array(train)

    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)

    item_means = np.mean(masked_arr, axis=0)

    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))

    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x

    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)

    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]

    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    return UsV

In [10]:
def rmse(true, pred):
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

In [11]:
uiMat, users_index, items_index = create_user_item_matrix(train)

In [12]:
#svd evaluation
for f in no_of_features:
  svdout = svd(uiMat, k=f)
  pred = [] #to store the predicted ratings
  for _,row in test.iterrows():
    user = row['userId']
    item = row['movieId']
    u_index = users_index[user]
    
    if item in items_index:
      i_index = items_index[item]
      pred_rating = svdout[u_index, i_index]
    else:
      pred_rating = np.mean(svdout[u_index, :])
    pred.append(pred_rating)
  print(rmse(test['rating'], pred))

1.0037878092372208
1.0036410576245884
1.0047280968304317
1.0053854409715488
1.0046710025637098
1.0056567229647215


In [13]:
def filter(u_id, user_genres = ds_user_genres, item_genres = ds_movies["genres"]):
  u_likes = user_genres[u_id]["like"]
  u_detest = user_genres[u_id]["dislike"]

  useritemMat, users_index, items_index = create_user_item_matrix(ds_ratings)
  svdout = svd(useritemMat, k=12)
  user_pos = users_index[str(u_id)]
  result = []
  for i in range(len(svdout[user_pos])):

    if str(i) in items_index:
      i_id = items_index[str(i)]
      i_genre = item_genres[i_id].split(sep="|")

      checkDislike =  any(item in i_genre for item in u_detest)
      if checkDislike is False:
        new_rate = svdout[user_pos][i]

        checkLike =  any(item in i_genre for item in u_likes)
        if checkLike is True:
          new_rate = new_rate + 2
        result.append([ds_movies['movieId'][i_id], ds_movies["title"][i_id], new_rate])
 
  return sorted(result, key=lambda item: item[2], reverse=True)[:20]



In [14]:
check =  any(item in ["Horror", "Comedy"] for item in ["Comedy", "Sci-Fi", "Mystery", "Thriller"])
if check is True:
    print("pass") 

pass


In [16]:
filter(1)

[['25937', 'Easter Parade (1948)', 7.0],
 ['385', 'Man of No Importance, A (1994)', 7.0],
 ['6832', 'Regarding Henry (1991)', 7.0],
 ['6216', 'Nowhere in Africa (Nirgendwo in Afrika) (2001)', 7.0],
 ['64499', 'Che: Part One (2008)', 7.0],
 ['1507', 'Paradise Road (1997)', 7.0],
 ['1415', 'Thieves (Voleurs, Les) (1996)', 7.0],
 ['1401', 'Ghosts of Mississippi (1996)', 7.0],
 ['3000', 'Princess Mononoke (Mononoke-hime) (1997)', 7.0],
 ['5471', 'Perfect (1985)', 7.0],
 ['4896',
  "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
  7.0],
 ['55274', 'Elizabeth: The Golden Age (2007)', 7.0],
 ['7243', "Intolerance: Love's Struggle Throughout the Ages (1916)", 7.0],
 ['851', 'Basquiat (1996)', 7.0],
 ['25753', 'Greed (1924)', 7.0],
 ['918', 'Meet Me in St. Louis (1944)', 7.0],
 ['114795', 'Dracula Untold (2014)', 7.0],
 ['54997', '3:10 to Yuma (2007)', 7.0],
 ['157122', 'The Man Who Knew Infinity (2016)', 7.0],
 ['6777', 'Judgment at Nuremberg (

**Content-based**

In [15]:
def cosine_similarity(u, v):
    return np.dot(u, np.transpose(v)) / np.linalg.norm(u) * np.linalg.norm(v)

In [16]:
def readPlots(movies):
  plots = []
  for movie in movies:
    url = 'https://zrekoj.github.io/hybrid-recommender-system/movies/'+ str(movie)+'.json'
    archivo = pd.read_json(url)
    plots.append([archivo['Plot'].tolist()[0]])
  return plots


Build Frecuency Dictionary

In [17]:
def preprocess_text(doc):
 stopset = set(stopwords.words('english'))
 stemmer = SnowballStemmer('english',ignore_stopwords=True)
 tokens = wordpunct_tokenize(doc)
 clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
 stemmed_text = [stemmer.stem(word) for word in clean]
 return stemmed_text
 
 
def create_freq_dict(movieIds, preprocessed_texts):
 i=0
 print("Creating frequency dictionary")
 freqDict_list=[]
 all_tokens=set()
 #for j in range(0, len(preprocessed_texts)-1):
   #for token in preprocessed_texts[j]:
     #all_tokens.add(token)
 for tokens in preprocessed_texts:
  freq_dict={}
  
  for token in tokens:
    if token in freq_dict:
     freq_dict[token]+=1
    else:
     freq_dict[token]=1
  #for token in all_tokens:
    #if token not in freq_dict:
       #freq_dict[token]=0

  temp={'term_id':i,'movie_id':movieIds[i],'freq_dict':freq_dict,'doc_length':len(tokens)}
  i+=1
  freqDict_list.append(temp)
 return freqDict_list

Calculate TF-IDF

Content-Based application

In [18]:
def computeTF(freqDict_list):
 TF_scores=[]
 print("Calculating TF")
 for tempDict in freqDict_list:
  id=tempDict['term_id']
  name=tempDict['movie_id']
  for k in tempDict['freq_dict']:
   temp={'term_id':id, 'movie_id':name,'TF_score':tempDict['freq_dict'][k]/tempDict['doc_length'],'key':k}
   TF_scores.append(temp)
 return TF_scores
 
def computeIDF(freqDict_list):
  IDF_scores=[]
  print("Calculating IDF")
  counter=-1
  for dict in freqDict_list:
   counter+=1
   for k in dict['freq_dict'].keys():
    count=sum([k in tempDict['freq_dict'] for tempDict in freqDict_list])
    temp= {'term_id': counter, 'IDF_score':math.log(dict['doc_length'])/count,'key':k}
    IDF_scores.append(temp)
  return IDF_scores
 
def computeTFIDF(TF_scores, IDF_scores):
 TF_IDF_scores=[]
 print("Calculating TF-IDF")
 
 for j in IDF_scores:
  for i in TF_scores:
   if j['key']==i['key'] and j['term_id']==i['term_id']:
    temp={'term_id':j['term_id'], 'movie_id':i['movie_id'], 'TFIDF_score': j['IDF_score']*i['TF_score'], 'key':i['key']}
    TF_IDF_scores.append(temp)
 return TF_IDF_scores

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zihao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#zihao

import requests 

user_id = 1
hybrid = False

movie_similarities=[]

#Get imbdIds of user movies
user_movies = list(
    map(
        lambda movie_id : str(ds_links.loc[ds_links['movieId'] == str(movie_id), 'imdbId'].values[0]), 
        ds_ratings.loc[ds_ratings['userId'] == str(user_id), 'movieId'].tolist()
    )
)

#collaborative_filtering_result = filter(user_id)
movies=[]

#If we are using content-based as part of the hybrid filter, it uses the results of the collaborative filtering
#Otherwise it uses all movies
if(hybrid):
    movies = list(
        map(
            lambda result : ds_links.loc[ds_links['movieId'] == str(result[0]), 'imdbId'].values[0], 
            collaborative_filtering_result
        )
    )
else:
    movies=[str(movie_id) for movie_id in list_movies_imdbid if movie_id not in user_movies]
    
response = requests.get('https://zrekoj.github.io/hybrid-recommender-system/user_frequency/'+ str(user_id)+'.json') 
user_frequence_dict = response.json()

def computeTFIDF(user_frequence_dict):
    copied = copy.deepcopy(user_frequence_dict)
    for frequency in copied.values():
        length = len(frequency)
        total_count = sum(frequency.values())
        for key in frequency:
            frequency[key] = (frequency[key] / length) * (math.log(length) / total_count)
    return copied
            
for movie in movies:
    MaxSim = 0
    
    movie_frequence_dict = requests.get('https://zrekoj.github.io/hybrid-recommender-system/frequency/'+ str(movie)+'.json').json() 
    
    user_frequence_dict[str(movie)] = movie_frequence_dict
    TFIDF = computeTFIDF(user_frequence_dict)
    user_frequence_dict.pop(str(movie), None)
    
    TFIDF_highest = TFIDF[str(movie)]
    
    for user_movie in user_movies[:10]:
        TFIDF_user =  TFIDF[str(user_movie)]
        TFIDF_user_values = list(
            map(
                lambda key : TFIDF_user[key] if key in TFIDF_user.keys() else 0,
                TFIDF_highest.keys()
            )
        )
        
        similarity = cosine_similarity(list(TFIDF_highest.values()), TFIDF_user_values)        
        
        if(similarity > MaxSim):
            MaxSim = similarity
    movie_similarities.append([movie, MaxSim])
    print(movie, MaxSim)

sorted(movie_similarities, key=lambda item: item[1], reverse=True)

print(movie_similarities)



0113497 3.543544460041931e-06
0114885 7.309774086299786e-06
0113041 6.757274151956908e-06
0114319 3.2347987238123826e-06
0112302 1.8611607309117507e-05
0114576 6.34765936756794e-06


In [None]:
def content_based(user_id, hybrid=True):

  movie_similarities=[]
  temp_user_movies=[]
  temp_movies=[]
  user_movies=[]
  #Get movies of selected user
  for k in range(0,len(ds_ratings)-1):
    if str(ds_ratings['userId'][k])==str(user_id):
      temp_user_movies.append(ds_ratings['movieId'][k])
  #Get imbdIds of user movies
  for k in range(0,len(ds_links)-1):
    if str(ds_links['movieId'][k]) in temp_user_movies:
      user_movies.append(str(ds_links['imdbId'][k]))
  
  movies=[]
  
  #If we are using content-based as part of the hybrid filter, it uses the results of the collaborative filtering
  #Otherwise it uses all movies
  if(hybrid):
    temp_movies=[str(x[0]) for x in filter(user_id)]
    for k in range(0,len(ds_links)):
      if str(ds_links['movieId'][k]) in temp_movies:
        movies.append(str(ds_links['imdbId'][k]))
  else:
    movies=[str(movie_id) for movie_id in list_movies_imdbid if movie_id not in user_movies]
  #Preprocess user movies
  preprocessed_texts=[preprocess_text(text[0]) for text in readPlots(user_movies)]

  for i in range(0,len(movies)-1):
    MaxSim=0
    user_movies.append(movies[i])
    preprocessed_texts.append(preprocess_text(readPlots([movies[i]])[0][0]))
    freqDict=create_freq_dict(user_movies, preprocessed_texts)
    user_movies.pop(-1)
    preprocessed_texts.pop(-1)
    TFIDF=computeTFIDF(computeTF(freqDict),computeIDF(freqDict))

    TFIDF_highest_temp=[[y["key"],y["TFIDF_score"]] for y in TFIDF if y["movie_id"]==movies[i]]
    TFIDF_highest=[y[1] for y in TFIDF_highest_temp]
    #TFIDF_highest=list(map(lambda y : y["TFIDF_score"] if y["movie_id"]==movies[i] else 0, TFIDF))
    
    for j in range(0,9):
      
      TFIDF_user_temp={x["key"]:x["TFIDF_score"] for x in TFIDF if x["movie_id"]==user_movies[j]}
      TFIDF_user=[]
      for x in TFIDF_highest_temp:
        if x[0] in TFIDF_user_temp.keys():
          TFIDF_user.append(TFIDF_user_temp[x[0]])
        else:
          TFIDF_user.append(0)

      #TFIDF_user=list(map(lambda y : y["TFIDF_score"] if y["movie_id"]==user_movies[i] else 0, TFIDF))
      
      similarity=cosine_similarity(TFIDF_highest,TFIDF_user)
      if(similarity>MaxSim):
        MaxSim=similarity
    movie_similarities.append([movies[i],MaxSim])
  return sorted(movie_similarities, key=lambda item: item[1], reverse=True)

print(content_based(1))





Fuzzy Expert

In [None]:
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import matplotlib.pyplot as plt

In [None]:
inp1 = ctrl.Antecedent(np.arange(0, 5, 1), 'average_rating')
inp2 = ctrl.Antecedent(np.arange(0, 350, 1), 'total_ratings')
inp3 = ctrl.Antecedent(np.arange(0, 1, 1), 'similarity')
out = ctrl.Consequent(np.arange(0, 1, 1), 'importance')

out['low'] = fuzz.trimf(out.universe, [0, 0, .2])
out['medium'] = fuzz.trimf(out.universe, [0, .6, 25])
out['high'] = fuzz.trimf(out.universe, [.6, 1, 25])