In [1]:
import pandas as pd
import numpy as np
import logging
import warnings
import time
from IPython.core.interactiveshell import InteractiveShell

import plots
from Old_Versions import Original_splitData as sd
import metrics

# Display the value of every top-level expression in a cell, not only the last
# one (several cells below rely on this to show two shapes at once).
InteractiveShell.ast_node_interactivity = "all"

# Suppress every Python warning for the rest of the session so that they do
# not clutter the log output.
warnings.filterwarnings("ignore")

# Root logger: INFO level, each message prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)


In [13]:
# Tunable parameters for the neighborhood computation below.
k = 10  # Number of nearest neighbors kept per user
bounds = (1, 5)  # (min, max) boundaries — presumably the valid rating range; TODO confirm where it is applied
threshold = 0  # Threshold for similarity neighborhood — NOTE(review): confirm where it is applied
popular_movie = 50  # Minimum number of ratings a movie needs; movies below this count are filtered out

In [3]:
logging.info('Loading Data Set')

# Names assigned to the columns of the raw file, in file order. Because the
# names are passed explicitly, pandas reads the first line of the file as data.
headers = [
    'userId', 'movieId', 'movie_categoryId',
    'reviewId', 'movieRating', 'reviewDate',
]
# Subset of columns actually loaded (reviewId and reviewDate are skipped).
columns = ['userId', 'movieId', 'movie_categoryId', 'movieRating']

data_set = pd.read_csv(
    'Dataset/movie-ratings.txt',
    sep=',',
    names=headers,
    usecols=columns,
    # The three identifier columns are parsed as integers; movieRating keeps
    # whatever dtype pandas infers.
    dtype={name: 'int' for name in ('userId', 'movieId', 'movie_categoryId')},
)
data_set.shape

2020-05-04 23:30:17,723 - Loading Data Set


(72665, 4)

In [4]:
# Per-movie summary, indexed by movieId:
#   movieRating        mean of the ratings the movie received
#   ratings_per_movie  number of (non-null) ratings the movie received
# Both statistics come from a single groupby pass.
ratings = (
    data_set.groupby('movieId')['movieRating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'movieRating', 'count': 'ratings_per_movie'})
)
#ratings

In [5]:
# Movies with fewer than `popular_movie` ratings are considered unpopular.
# `ratings` is indexed by movieId, so the selected index holds the ids to discard.
is_unpopular = ratings['ratings_per_movie'] < popular_movie
unpopular_movies = ratings.loc[is_unpopular].index

# Keep only the rows whose movie is not in that set. A boolean mask plus
# rebinding replaces drop(..., inplace=True): the same rows (and row index)
# survive, but no frame is mutated in place, and .copy() makes the result an
# independent frame rather than a slice of the loaded one.
data_set = data_set.loc[~data_set['movieId'].isin(unpopular_movies)].copy()
data_set.shape

(16030, 4)

In [6]:
# Build the user x movie rating table: one row per userId, one column per
# movieId. Combinations with no rating are NaN; if a (user, movie) pair occurs
# more than once, pivot_table averages the values (its default aggregation).
rating_table = data_set.pivot_table(
    values='movieRating',
    index='userId',
    columns='movieId',
)

# Keep the labels separately, because the matrix itself is used as a bare
# NumPy array from here on.
index_names, columns_names = rating_table.index, rating_table.columns
dense_data = rating_table.values
dense_data.shape

(7632, 179)

In [7]:
# Split the dense rating matrix into a train and a test matrix with the
# project's split helper. The recorded outputs show both results with the same
# shape as the input matrix.
# NOTE(review): 0.2 is presumably the fraction of ratings held out for testing
# — confirm against sd.split_train_test. Also confirm whether the split is
# seeded; no seed is set in this notebook.
train,test = sd.split_train_test(dense_data,0.2)
train.shape
test.shape

(7632, 179)

(7632, 179)

In [8]:
# Mean of each row of the training matrix, ignoring NaN entries; the result
# has one value per row (presumably one per user, in the same order as
# index_names — TODO confirm the split preserves row order).
# A row that is entirely NaN yields NaN here (numpy would normally emit a
# RuntimeWarning, but warnings are silenced at the top of the notebook).
user_avg = np.nanmean(train, axis=1)
user_avg.shape

(7632,)

In [9]:
# User-user similarity matrix.
# The training matrix is transposed so that every user becomes a column
# (labelled with its userId); DataFrame.corr() then computes the Pearson
# correlation for each pair of columns, using only the rows where both
# columns are non-NaN. Pairs with no usable overlap come out as NaN.
start = time.time()
ratings_by_user = pd.DataFrame(train.T, columns=index_names)
pearson_corr = ratings_by_user.corr(method='pearson')
elapsed = time.time() - start
logging.info("Process done in: {0:.2f} seconds".format(elapsed))
    

2020-05-04 23:30:23,987 - Process done in: 5.39 seconds


In [10]:
#pearson_corr

In [14]:
# For every user, store the ids of the k users with the highest Pearson
# correlation (most similar first). Row r of `neighbors` belongs to the r-th
# entry of index_names.
qou = len(index_names)  # quantity of users
neighbors = np.zeros((qou, k), dtype=int)

start = time.time()
for row, user_id in enumerate(index_names):
    # Correlations of this user with everyone else (self-correlation removed),
    # best match first.
    # NOTE(review): sort_values puts NaN correlations last, so a user with
    # fewer than k valid correlations is padded with NaN-correlated users in
    # index order — the identical rows at the bottom of the table displayed
    # below look like this case. Confirm downstream code tolerates it.
    ranked = pearson_corr[user_id].drop([user_id]).sort_values(ascending=False)
    neighbors[row] = ranked.index.values[:k]
elapsed = time.time() - start
logging.info("Process done in: {0:.2f} seconds".format(elapsed))

2020-05-04 23:31:20,322 - Process done in: 9.82 seconds


In [15]:
# Display the neighbor table: one row per userId, columns 0..k-1 holding the
# userIds of that user's neighbors, ordered from most to least correlated.
pd.DataFrame(neighbors, index = index_names)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,8334,369,161,11,4,8518,432,2,5,6
2,35,228,823,141,380,350,157,225,27,61
4,1,9030,11,8460,8334,8150,3957,3870,3802,168
5,810,2582,7610,6417,6199,6179,5665,3923,3879,3765
6,3887,2615,796,322,12,42,3795,205,5665,24
...,...,...,...,...,...,...,...,...,...,...
17251,1,2,4,5,6,7,8,9,10,11
17252,1,2,4,5,6,7,8,9,10,11
17253,1,2,4,5,6,7,8,9,10,11
17254,1,2,4,5,6,7,8,9,10,11
