Imports

In [1]:
import pandas as pd
import numpy as np
import random
import scipy
import json
import nltk
import math
import re
import threading
import copy
import requests 
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.regexp import (WordPunctTokenizer,wordpunct_tokenize)

Links for the datasets

In [2]:
url_links = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/links.csv'
url_movies = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/movies.csv'
url_ratings = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/ratings.csv'
url_tags = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/tags.csv'
url_genres = 'https://zrekoj.github.io/hybrid-recommender-system/dataset/user_genre.json'

Datasets

In [3]:
ds_links = pd.read_csv(url_links, dtype = str)
ds_movies = pd.read_csv(url_movies, dtype = str)
ds_ratings = pd.read_csv(url_ratings)
ds_tags = pd.read_csv(url_tags, dtype = str)
ds_user_genres = pd.read_json(url_genres)

In [4]:
list_movies_id = list(ds_movies['movieId'].unique())
list_title_id = ds_movies['title'].tolist()
list_users_id = list(ds_ratings['userId'].unique())
list_movies_imdbid = ds_links['imdbId'].tolist()

Input ratings

In [5]:
'''
movie_ids=[]
ratings=[]

for i in range(10):
  index=random.randint(0,len(list_movies_id)-1)
  while list_movies_id[index] in movie_ids:
    index=random.randint(0,len(list_movies_id)-1)
  rating = float(input('Rate the movie '+str(list_title_id[index])+' from 0.5 to 5.0 or say 0.0 if you haven\'t seen it :'))
  movie_ids.append(list_movies_id[index])
  ratings.append(rating)
print(movie_ids)
print(ratings)
'''


"\nmovie_ids=[]\nratings=[]\n\nfor i in range(10):\n  index=random.randint(0,len(list_movies_id)-1)\n  while list_movies_id[index] in movie_ids:\n    index=random.randint(0,len(list_movies_id)-1)\n  rating = float(input('Rate the movie '+str(list_title_id[index])+' from 0.5 to 5.0 or say 0.0 if you haven't seen it :'))\n  movie_ids.append(list_movies_id[index])\n  ratings.append(rating)\nprint(movie_ids)\nprint(ratings)\n"

Collaborative filtering

SVD

In [6]:
def getGenresByUser(user_id, ds=ds_user_genres):
  return ds[user_id]['like'], ds[user_id]['dislike']

print(getGenresByUser(1))

(['Film-Noir', 'Animation', 'Musical', 'Children', 'Drama', 'War'], ['Comedy', 'Sci-Fi', 'Mystery', 'Thriller', 'Horror'])


In [7]:
#hyperparams

test_ratio = 0.2 #fraction of data to be used as test set.
no_of_features = [8,10,12,14,17,20] # to test the performance over a different number of features

In [8]:
ds_ratings['userId'] = ds_ratings['userId'].astype('str')
ds_ratings['movieId'] = ds_ratings['movieId'].astype('str')

users = ds_ratings['userId'].unique() #list of all users
movies = ds_ratings['movieId'].unique() #list of all movies

test = pd.DataFrame(columns=ds_ratings.columns)
train = pd.DataFrame(columns=ds_ratings.columns)

for u in users:
  temp = ds_ratings[ds_ratings['userId'] == u]
  n = len(temp)
  test_size = int(test_ratio*n)

  temp = temp.sort_values('timestamp').reset_index()
  temp.drop('index', axis=1, inplace=True)

  dummy_test = temp.loc[n-1-test_size :]
  dummy_train = temp.loc[: n-2-test_size]
    
  test = pd.concat([test, dummy_test])
  train = pd.concat([train, dummy_train])

In [9]:
from scipy.linalg import sqrtm

def create_user_item_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):
    itemField = formatizer['item']
    userField = formatizer['user']
    valueField = formatizer['value']

    
    userList = data.iloc[:,userField].tolist()
    itemList = data.iloc[:,itemField].tolist()
    valueList = data.iloc[:,valueField].tolist()

    users = list(set(data.iloc[:,userField]))
    items = list(set(data.iloc[:,itemField]))

    users_index = {users[i]: i for i in range(len(users))}

    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}

    for i in range(0,len(data)):
      item = itemList[i]
      user = userList[i]
      value = valueList[i]

      pd_dict[item][users_index[user]] = value
    
    X = pd.DataFrame(pd_dict)
    X.index = users

    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}

    # users_index gives us a mapping of user_id to index of user
    # items_index provides the same for items

    return X, users_index, items_index

In [10]:
def svd(train, k):
    utilMat = np.array(train)

    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)

    item_means = np.mean(masked_arr, axis=0)

    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))

    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x

    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)

    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]

    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    return UsV

In [11]:
def rmse(true, pred):
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

In [12]:
uiMat, users_index, items_index = create_user_item_matrix(train)

In [13]:
#svd evaluation
for f in no_of_features:
  svdout = svd(uiMat, k=f)
  pred = [] #to store the predicted ratings
  for _,row in test.iterrows():
    user = row['userId']
    item = row['movieId']
    u_index = users_index[user]
    
    if item in items_index:
      i_index = items_index[item]
      pred_rating = svdout[u_index, i_index]
    else:
      pred_rating = np.mean(svdout[u_index, :])
    pred.append(pred_rating)
  print(rmse(test['rating'], pred))

1.0037878092372208
1.0036410576245893
1.0047280968304313
1.005385440971549
1.0046710025637098
1.0056567229647213


In [14]:
def filter(u_id, user_genres = ds_user_genres, item_genres = ds_movies["genres"]):
  u_likes = user_genres[u_id]["like"]
  u_detest = user_genres[u_id]["dislike"]

  useritemMat, users_index, items_index = create_user_item_matrix(ds_ratings)
  svdout = svd(useritemMat, k=12)
  user_pos = users_index[str(u_id)]
  result = []
  for i in range(len(svdout[user_pos])):

    if str(i) in items_index:
      i_id = items_index[str(i)]
      i_genre = item_genres[i_id].split(sep="|")

      checkDislike =  any(item in i_genre for item in u_detest)
      if checkDislike is False:
        new_rate = svdout[user_pos][i]

        checkLike =  any(item in i_genre for item in u_likes)
        if checkLike is True:
          new_rate = new_rate + 2
        result.append([ds_movies['movieId'][i_id], ds_movies["title"][i_id], new_rate])
 
  return sorted(result, key=lambda item: item[2], reverse=True)[:20]



In [15]:
check =  any(item in ["Horror", "Comedy"] for item in ["Comedy", "Sci-Fi", "Mystery", "Thriller"])
if check is True:
    print("pass") 

pass


In [16]:
filter(1)

[['108192',
  "Hotel Chevalier (Part 1 of 'The Darjeeling Limited') (2007)",
  7.0],
 ['2304', 'Love Is the Devil (1998)', 7.0],
 ['3173', 'Any Given Sunday (1999)', 7.0],
 ['2077', 'Journey of Natty Gann, The (1985)', 7.0],
 ['7167', 'Japanese Story (2003)', 7.0],
 ['979', 'Nothing Personal (1995)', 7.0],
 ['49', 'When Night Is Falling (1995)', 7.0],
 ['1027', 'Robin Hood: Prince of Thieves (1991)', 7.0],
 ['7054', 'Little Women (1949)', 7.0],
 ['4260', 'Visit, The (2000)', 7.0],
 ['85213', 'Sunset Limited, The (2011)', 7.0],
 ['4078', 'Amazing Grace and Chuck (1987)', 7.0],
 ['4334', 'Yi Yi (2000)', 7.0],
 ['347', 'Bitter Moon (1992)', 7.0],
 ['6886', 'Beyond Borders (2003)', 7.0],
 ['26903', 'Whisper of the Heart (Mimi wo sumaseba) (1995)', 7.0],
 ['1844', 'Live Flesh (Carne trémula) (1997)', 7.0],
 ['146', 'Amazing Panda Adventure, The (1995)', 7.0],
 ['3792', 'Duel in the Sun (1946)', 7.0],
 ['156607', "The Huntsman Winter's War (2016)", 7.0]]

**Content-based**

In [17]:
def cosine_similarity(u, v):
    return np.dot(u, np.transpose(v)) / (np.linalg.norm(u) * np.linalg.norm(v))

Calculate TF-IDF

In [32]:
 def computeTFIDF(user_frequence_dict):
        copied = copy.deepcopy(user_frequence_dict)
        total_movies = len(copied)
        total_word_movies_count = {}

        for frequency in copied.values():
            for word in frequency:
                total_word_movies_count[word] = total_word_movies_count.get(word, 0) + 1

        for frequency in copied.values():
            total_words_count = len(frequency)
            for word in frequency:
                #                TF                                 IDF
                frequency[word] = (frequency[word] / total_words_count) * (math.log(1 + (total_movies / total_word_movies_count[word])))
        return copied

Content-Based application

In [39]:
def content_based(user_id, hybrid=True):

    movie_similarities=[]

    #Get imbdIds of user movies
    user_movies = list(
        map(
            lambda movie_id : str(ds_links.loc[ds_links['movieId'] == str(movie_id), 'imdbId'].values[0]), 
            ds_ratings.loc[ds_ratings['userId'] == str(user_id), 'movieId'].tolist()
        )
    )

    #collaborative_filtering_result = filter(user_id)

    collaborative_filtering_result = [['108192',
      "Hotel Chevalier (Part 1 of 'The Darjeeling Limited') (2007)",
      7.0],
     ['2304', 'Love Is the Devil (1998)', 7.0],
     ['3173', 'Any Given Sunday (1999)', 7.0],
     ['2077', 'Journey of Natty Gann, The (1985)', 7.0],
     ['7167', 'Japanese Story (2003)', 7.0],
     ['979', 'Nothing Personal (1995)', 7.0],
     ['49', 'When Night Is Falling (1995)', 7.0],
     ['1027', 'Robin Hood: Prince of Thieves (1991)', 7.0],
     ['7054', 'Little Women (1949)', 7.0],
     ['4260', 'Visit, The (2000)', 7.0],
     ['85213', 'Sunset Limited, The (2011)', 7.0],
     ['4078', 'Amazing Grace and Chuck (1987)', 7.0],
     ['4334', 'Yi Yi (2000)', 7.0],
     ['347', 'Bitter Moon (1992)', 7.0],
     ['6886', 'Beyond Borders (2003)', 7.0],
     ['26903', 'Whisper of the Heart (Mimi wo sumaseba) (1995)', 7.0],
     ['1844', 'Live Flesh (Carne trémula) (1997)', 7.0],
     ['146', 'Amazing Panda Adventure, The (1995)', 7.0],
     ['3792', 'Duel in the Sun (1946)', 7.0],
     ['156607', "The Huntsman Winter's War (2016)", 7.0]]

    movies=[]

    #If we are using content-based as part of the hybrid filter, it uses the results of the collaborative filtering
    #Otherwise it uses all movies
    if(hybrid):
        movies = list(
            map(
                lambda result : ds_links.loc[ds_links['movieId'] == str(result[0]), 'imdbId'].values[0], 
                collaborative_filtering_result
            )
        )
    else:
        movies=[str(movie_id) for movie_id in list_movies_imdbid if movie_id not in user_movies]

    response = requests.get('https://zrekoj.github.io/hybrid-recommender-system/user_frequency/'+ str(user_id)+'.json') 
    user_frequence_dict = response.json()

    for movie in movies:
        MaxSim = 0

        movie_frequence_dict = requests.get('https://zrekoj.github.io/hybrid-recommender-system/frequency/'+ str(movie)+'.json').json() 
        user_frequence_dict[str(movie)] = movie_frequence_dict
        TFIDF = computeTFIDF(user_frequence_dict)
        user_frequence_dict.pop(str(movie), None)

        no_exist_string = "***###***"
        
        TFIDF_highest = TFIDF[str(movie)]
        for user_movie in user_movies:
            TFIDF_user =  TFIDF[str(user_movie)]
            
            TFIDF_highest_keys = TFIDF_highest.keys()
            TFIDF_user_keys = TFIDF_user.keys()
            
            coincidence = list(set(TFIDF_highest_keys) & set(TFIDF_user_keys))
            TFIDF_highest_values_diff = list(set(TFIDF_highest_keys) - set(TFIDF_user_keys))
            TFIDF_user_values_diff = list(set(TFIDF_user_keys) - set(TFIDF_highest_keys))
            
            TFIDF_highest_values = coincidence + TFIDF_highest_values_diff + ([no_exist_string] * len(TFIDF_user_values_diff))
            TFIDF_user_values = coincidence + ([no_exist_string] * len(TFIDF_highest_values_diff)) + TFIDF_user_values_diff
            
            TFIDF_highest_values = list(
                map(
                    lambda value : TFIDF_highest[value] if value != no_exist_string else 0,
                    TFIDF_highest_values
                )
            )
            
            TFIDF_user_values = list(
                map(
                    lambda value : TFIDF_user[value] if value != no_exist_string else 0,
                    TFIDF_user_values
                )
            )
            
            similarity = cosine_similarity(TFIDF_highest_values, TFIDF_user_values)        

            if(similarity > MaxSim):
                MaxSim = similarity
                
        movie_similarities.append([movie, MaxSim])
        
        print(movie, MaxSim)

    return sorted(movie_similarities, key=lambda item: item[1], reverse=True)

print(content_based(1))

1094249 0.07950933188193429
0119577 0.03278998241794748
0146838 0.06455013019546886
0089385 0.058154713046292245
0304229 0.0855014873973386
0114007 0.05510535815109053
0114916 0.059594410993000585
0102798 0.1730762940149346
0041594 0.0705700332960813
0199129 0.12967858576323768
1510938 0.07786330345947903
0092545 0.07533071561265856
0244316 0.05788941191858763
0104779 0.05918078229511197
0294357 0.12049458341852527
0113824 0.125902194245651
0118819 0.08952145915043482
0112342 0.09077668893012351
0038499 0.06793468001299899
2381991 0.06856206481457756
[['0102798', 0.1730762940149346], ['0199129', 0.12967858576323768], ['0113824', 0.125902194245651], ['0294357', 0.12049458341852527], ['0112342', 0.09077668893012351], ['0118819', 0.08952145915043482], ['0304229', 0.0855014873973386], ['1094249', 0.07950933188193429], ['1510938', 0.07786330345947903], ['0092545', 0.07533071561265856], ['0041594', 0.0705700332960813], ['2381991', 0.06856206481457756], ['0038499', 0.06793468001299899], ['014

Fuzzy Expert

In [None]:
import skfuzzy as fuzz
from skfuzzy import control as ctrl
import matplotlib.pyplot as plt

In [None]:
inp1 = ctrl.Antecedent(np.arange(0, 5, 1), 'average_rating')
inp2 = ctrl.Antecedent(np.arange(0, 350, 1), 'total_ratings')
inp3 = ctrl.Antecedent(np.arange(0, 1, 1), 'similarity')
out = ctrl.Consequent(np.arange(0, 1, 1), 'importance')

out['low'] = fuzz.trimf(out.universe, [0, 0, .2])
out['medium'] = fuzz.trimf(out.universe, [0, .6, 25])
out['high'] = fuzz.trimf(out.universe, [.6, 1, 25])