In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy as sp
from functools import reduce
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.stats
import scipy.spatial
from math import sqrt
import math

In [2]:
movieActors = pd.read_table('./additional_files/movie_actors.dat', sep='\t', header=0, error_bad_lines=False)
movieDirectors = pd.read_table('./additional_files/movie_directors.dat', sep='\t', header=0, error_bad_lines=False)
movieGenres = pd.read_table('./additional_files/movie_genres.dat', sep='\t', header=0, error_bad_lines=False)
movieTags = pd.read_table('./additional_files/movie_tags.dat', sep='\t', header=0, error_bad_lines=False)
userTaggedMovies = pd.read_table('./additional_files/user_taggedmovies.dat', sep=' ', header=0, error_bad_lines=False)
tags = pd.read_table('./additional_files/tags.dat', sep='\t', header=0, error_bad_lines=False)
test = pd.read_table('./additional_files/test.dat', sep=' ', header=0, error_bad_lines=False)
train = pd.read_table('./additional_files/train.dat', sep=' ', header=0)

In [3]:
def arrange(table, by, joining, column):
    field = table.groupby(by=by).apply(lambda x:[','.join(x[joining])])
    fieldDF = pd.DataFrame(field, columns=[column])
    fieldDF.reset_index(inplace=True)
    return fieldDF

In [4]:
topActors = movieActors.sort_values(['movieID', 'ranking'], ascending=True).groupby('movieID').head(3)
actors = topActors.drop(['actorID', 'ranking'], axis=1)
actorsDF = arrange(actors, 'movieID', 'actorName', 'actors')
directorsDF = arrange(movieDirectors, 'movieID', 'directorName', 'directors')
genreDF = arrange(movieGenres, 'movieID', 'genre', 'genres')

movieTag = pd.merge(movieTags, tags, left_on=['tagID'], right_on=['id']).drop(['id'], axis=1)
movieTagsDF = arrange(movieTag, 'movieID', 'value', 'usertags')

userTaggedMovie = pd.merge(userTaggedMovies, tags, left_on=['tagID'], right_on=['id']).drop(['id'], axis=1)
userTaggedMoviesDF = arrange(userTaggedMovie, 'movieID', 'value', 'tags')

In [5]:
movies = reduce(lambda x, y: pd.merge(x, y, on='movieID', how='outer'), [actorsDF, directorsDF, genreDF, userTaggedMoviesDF, movieTagsDF])

In [6]:
def cleanData(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    else:
        return str.lower(x.replace(' ', '')) if isinstance(x, str) else ''

def mergeDetails(x):
    return ' '.join(x['actors']) + ' ' + ' '.join(x['directors']) + ' ' +' '.join(x['genres']) + ' ' + ' '.join(x['tags'])+ ' ' + ' '.join(x['usertags'])

In [7]:
columns = ['actors', 'directors', 'genres', 'tags', 'usertags']

for column in columns:
    movies[column] = movies[column].apply(cleanData)
    
movies['details'] = movies.apply(mergeDetails, axis=1)

In [8]:
def predict(userId, movieId, similarityValue, values, rate, isCF = True):
    ratingList, weightList = [], []
    predicted = 3.0
    
    for j, i in enumerate(similarityValue):
        try:
            if isCF == True: 
                rating = rate.loc[i, movieId]
                similarity = values[j]
            elif isCF == False:
                rating = rate.loc[userId, i]
                similarity = values[j]
            if np.isnan(rating):
                continue
            elif not np.isnan(rating):
                ratingList.append(rating * similarity)
                weightList.append(similarity)
        except KeyError:                                               
            pass
        
    if sum(weightList) != 0:
        predicted = round(sum(ratingList)/sum(weightList))
        
    if predicted < 0:
        predicted = 0.0
    elif predicted > 5:
        predicted = 5.0
        
    return predicted

In [9]:
def tfidfVectorizer(movies):
    tfidf = TfidfVectorizer(stop_words='english')
    movies['details'] = movies['details'].fillna('')
    return tfidf.fit_transform(movies['details'])

In [10]:
def contentBased(trainDF, testDF, movies, K):
    rate = trainDF.pivot_table(index=['userID'], columns=['movieID'], values='rating')
    tfidf = tfidfVectorizer(movies)
    movieCosineSimilarity = cosine_similarity(tfidf, tfidf) 
    movies = movies.reset_index()
    index = pd.Series(movies.index, index=movies['movieID'])
    output = []
    
    for i, r in tqdm(testDF.iterrows()):
        userId = testDF.loc[i]['userID']
        movieId = testDF.loc[i]['movieID']
        similarityScores = list(enumerate(movieCosineSimilarity[index[movieId]]))
        similarityScores = sorted(similarityScores, key=lambda x: x[1], reverse=True)
        similarityScores = similarityScores[1: K]
        similarityValues = [i[1] for i in similarityScores]
        similarityMovies = [i[0] for i in similarityScores]
        predicted = predict(userId, movieId, similarityMovies, similarityValues, rate, isCF=False)
        output.append(predicted)
        
    with open('cb.txt', 'w') as file:
        for rating in output:
            file.write(str((int(rating))))
            file.write('\n')
            
    return output

In [11]:
def getSimilarity(normalizedRate, sparseRate, normalizedUsers):
    userCosineSimilarity = np.zeros((normalizedUsers, normalizedUsers))
    userCosineSimilarity = cosine_similarity(sparseRate.T)
    userPearsonCorelationSimilarity = np.zeros((normalizedUsers, normalizedUsers))
    
    for i in tqdm(range(normalizedUsers)):
        for j in range(normalizedUsers):
            if np.count_nonzero(normalizedRate.iloc[i, :]) and np.count_nonzero(normalizedRate.iloc[j, :]):
                try:
                    if not math.isnan(scipy.stats.pearsonr(normalizedRate.iloc[i, :], normalizedRate.iloc[j, :])[0]):
                        userPearsonCorelationSimilarity[i][j] = scipy.stats.pearsonr(normalizedRate.iloc[i, :], normalizedRate.iloc[j, :])[0]
                    else:
                        userPearsonCorelationSimilarity[i][j] = 0
                except:
                    userPearsonCorelationSimilarity[i][j] = 0
                    
    return userCosineSimilarity, userPearsonCorelationSimilarity

In [12]:
def preprocess(rate):
    normalizedRate = rate.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
    normalizedRate.fillna(0, inplace=True)
    normalizedRate = normalizedRate.T
    sparseRate = sp.sparse.csr_matrix(normalizedRate.values)
    return normalizedRate, sparseRate

In [13]:
def correlativeFiltering(trainDF, testDF, k):
    
    normalizedUsers = trainDF.userID.unique().shape[0]
    rate = trainDF.pivot_table(index=['userID'], columns=['movieID'], values='rating')
    normalizedRate, sparseRate = preprocess(rate)
    userCosineSimilarity, userPearsonCorelationSimilarity = getSimilarity(normalizedRate, sparseRate, normalizedUsers)
    userSimilarityDF = pd.DataFrame(userCosineSimilarity, index=normalizedRate.columns, columns=normalizedRate.columns)
    #userSimilarityDF = pd.DataFrame(userPearsonCorelationSimilarity, index=normalizedRate.columns, columns=normalizedRate.columns)
    output = []
    
    for i, r in testDF.iterrows():
        userId = testDF.loc[i]['userID']
        movieId = testDF.loc[i]['movieID']
        userSimilarity = userSimilarityDF.sort_values(by=userId, ascending=False).index[1: k]
        similarityValues = userSimilarityDF.sort_values(by=userId, ascending=False).loc[:, userId].tolist()[1: k]
        predicted = predict(userId, movieId, userSimilarity, similarityValues, rate)
        output.append(predicted)
        
    with open('cf.txt', 'w') as file:
        for rating in output:
            file.write(str((int(rating))))
            file.write('\n')
            
    return output

In [14]:
def crossValidation(train, movies, isCF = True):
    rmse, actual = 0.0, []
    rating = train.loc[:, ['rating']]
    kf = KFold(n_splits=10, shuffle=True, random_state=None)
    
    for k in (50, 500, 1500):
        for trainIndex, testIndex in kf.split(train):
            trainCV = train.loc[trainIndex, :]
            testCV = train.loc[testIndex, :].drop('rating', axis=1)
            actual = rating.iloc[testIndex].values
            
            if isCF == True:
                predict = correlativeFiltering(trainCV, testCV, k)
            elif isCF == False:
                predict = contentBased(trainCV, testCV, movies, k)
                
            rmse += sqrt(mean_squared_error(predict, actual))
            
        print("RMSE for K = " + str(k) + " is " + str(rmse/10))
        rmse = 0.0

In [None]:
crossValidation(train, movies, isCF=True)

100%|██████████| 2113/2113 [33:19<00:00,  1.06it/s]
100%|██████████| 2113/2113 [33:59<00:00,  1.04it/s]
100%|██████████| 2113/2113 [34:21<00:00,  1.03it/s]
100%|██████████| 2113/2113 [33:24<00:00,  1.05it/s]
 64%|██████▍   | 1360/2113 [21:44<12:02,  1.04it/s]

In [None]:
crossValidation(train, movies, isCF=False)

64170it [05:52, 182.29it/s]
64170it [05:52, 181.99it/s]
64170it [05:40, 188.23it/s]
64170it [05:59, 178.70it/s]
64170it [05:53, 181.66it/s]
64170it [05:53, 181.77it/s]
64170it [05:50, 183.20it/s]
64170it [05:56, 179.97it/s]
64170it [05:38, 189.55it/s]
64169it [05:43, 186.63it/s]


RMSE for K = 50 is 1.1675861255291846


64170it [15:38, 68.35it/s]
64170it [15:59, 66.88it/s]
64170it [16:13, 65.94it/s]
64170it [15:33, 68.73it/s]
64170it [15:34, 68.66it/s]
64170it [15:24, 69.38it/s]
64170it [15:20, 69.72it/s]
64170it [15:34, 68.69it/s]
64170it [15:18, 69.90it/s]
64169it [15:21, 69.65it/s]


RMSE for K = 500 is 1.0176048287159696


64170it [37:06, 28.82it/s]
64170it [37:05, 28.83it/s]
64170it [36:51, 29.02it/s]
16786it [10:00, 27.12it/s]

In [15]:
correlativeFiltering(train, test, 500)

100%|██████████| 2113/2113 [33:36<00:00,  1.05it/s]


[3,
 4,
 3,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 4,
 3,
 4,
 4,
 3,
 4,
 2,
 4,
 2,
 4,
 3,
 4,
 4,
 4,
 4,
 3.0,
 4,
 3,
 2,
 2,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 4,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 3,
 4,
 4,
 4,
 2,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 1,
 4,
 4,
 4,
 3,
 3,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 3,
 3,
 4,
 4,
 4,
 3,
 4,
 3,
 3,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 2,
 4,
 3,
 4,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 2,
 2,
 2,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 3,
 3,
 4,
 3,
 3,
 3,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 2,
 4,
 3,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 3,
 4,
 3,
 3,
 4,
 2,
 3,
 4,
 2,
 4