### Recommendation System to compare with BSI 
This notebook recommends items for a given user, based on the items rated highly by that user's most similar users.

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import heapq
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, IntegerType, StructType, FloatType

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Charles")
    .getOrCreate()
)

In [4]:
# Explicit schema for the ratings file: every column is nullable, ids are
# integers and the rating is a float.
data_schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('userId', IntegerType()),
        ('movieId', IntegerType()),
        ('rating', FloatType()),
    )
]
final_structure = StructType(fields=data_schema)

# The first row of the CSV is a header; the schema above is applied as given.
data = spark.read.csv(
    'data/ratings_movie (1).csv',
    header=True,
    schema=final_structure,
)

In [5]:
# Collect only the two id columns, and only once: converting the whole Spark
# frame separately for each count would pull every rating to the driver twice.
id_columns = data.select('userId', 'movieId').toPandas()
print("Unique users: {}".format(id_columns['userId'].nunique()))
print("Unique items: {}".format(id_columns['movieId'].nunique()))

Unique users: 610
Unique items: 9724


In [19]:
# One row per movie, one column per user; a cell holds that user's average
# rating for the movie, and user/movie pairs with no rating are filled with 0.
pivot_data = (
    data.groupBy("movieId")
    .pivot("userId")
    .avg("rating")
    .fillna(0)
)

# Bring the pivoted table to pandas and index it by movie id, so that the
# remaining columns are exactly the users.
utility_matrix = pivot_data.toPandas().set_index('movieId')

In [31]:
def get_similar_users(df, TOP_K, USER_ID):
    """Return the labels of the TOP_K users most similar to USER_ID.

    Each column of ``df`` is treated as one user's rating vector (rows are
    items). Every user is scored as the cosine similarity between their
    vector and the query user's vector, multiplied by the square root of the
    L2 norm of their own vector, so users with larger rating vectors are
    boosted.

    Parameters
    ----------
    df : pandas.DataFrame
        Utility matrix with items as rows and users as columns.
    TOP_K : int
        Number of similar users to return.
    USER_ID : int or str
        Label of the query user's column. Labels are compared as strings, so
        ``1`` matches a column named ``'1'``.

    Returns
    -------
    numpy.ndarray
        Column labels of the best-scoring users, best first, never including
        the query user. Shorter than TOP_K when there are fewer other users.

    Raises
    ------
    KeyError
        If no column of ``df`` is labelled USER_ID.
    """
    user_factors = df.values.T
    labels = list(df.columns)

    # Resolve the query user by column label, not by assuming that user n
    # lives at position n-1 (only true for contiguous ids starting at 1).
    wanted = str(USER_ID)
    query_idx = next(
        (pos for pos, label in enumerate(labels) if str(label) == wanted),
        None,
    )
    if query_idx is None:
        raise KeyError("user {!r} is not a column of the utility matrix".format(USER_ID))

    user_vector = user_factors[query_idx].reshape(1, -1)
    similarity = cosine_similarity(user_factors, user_vector)

    norms = np.sqrt(np.linalg.norm(user_factors, axis=1).reshape(-1, 1))
    dist = np.multiply(similarity, norms).reshape(-1)

    # Because of the norm weighting the query user is not guaranteed to have
    # the highest score, so exclude it explicitly instead of dropping
    # whichever user happens to rank first.
    candidates = (idx for idx in range(len(dist)) if idx != query_idx)
    close_idx = heapq.nlargest(TOP_K, candidates, key=dist.take)
    return np.array([labels[idx] for idx in close_idx])

def get_similar_items(users, utility_matrix, RATING_THRESHOLD, TOP_K_THRESHOLD):
    """Return the items that enough of the given users rated highly.

    An item is selected when at least TOP_K_THRESHOLD of ``users`` gave it a
    rating strictly greater than RATING_THRESHOLD.

    Parameters
    ----------
    users : iterable
        Column labels of ``utility_matrix`` identifying the users to consult.
        Labels are compared as strings, so ``414`` matches ``'414'``.
    utility_matrix : pandas.DataFrame
        Items as rows (the index holds the item ids), users as columns.
    RATING_THRESHOLD : number
        A rating counts only if it is strictly above this value.
    TOP_K_THRESHOLD : int
        Minimum number of the given users whose rating must count.

    Returns
    -------
    list
        Item ids taken from the index of ``utility_matrix``, in row order.
        Empty when ``users`` is empty.

    Raises
    ------
    KeyError
        If one of ``users`` is not a column of ``utility_matrix``.
    """
    # Look users up by column label rather than assuming that user n is
    # stored at position n-1.
    column_position = {
        str(label): pos for pos, label in enumerate(utility_matrix.columns)
    }
    positions = []
    for user in users:
        key = str(user)
        if key not in column_position:
            raise KeyError("user {!r} is not a column of the utility matrix".format(key))
        positions.append(column_position[key])

    if not positions:
        return []

    # (n_items, n_selected_users) slice of the ratings.
    ratings = utility_matrix.values[:, positions]

    # For every item, count how many of the selected users rated it above
    # the threshold, then keep the items that reach the required count.
    liked_counts = (ratings > RATING_THRESHOLD).sum(axis=1)
    selected = liked_counts >= TOP_K_THRESHOLD
    return list(utility_matrix.index.values[selected])

In [34]:
# Query user and neighbourhood size.
USER_ID = 1
TOP_K = 10

# An item is recommended when at least TOP_K_THRESHOLD of the TOP_K similar
# users rated it strictly above RATING_THRESHOLD (ratings are on a 0-5 scale).
RATING_THRESHOLD = 3
TOP_K_THRESHOLD = 7

similar_user = get_similar_users(df=utility_matrix, TOP_K=TOP_K, USER_ID=USER_ID)
similar_item = get_similar_items(
    users=similar_user,
    utility_matrix=utility_matrix,
    RATING_THRESHOLD=RATING_THRESHOLD,
    TOP_K_THRESHOLD=TOP_K_THRESHOLD,
)

for label, value in (
    ("User ID:", USER_ID),
    ("Similar User:", similar_user),
    ("Recommended Items:", similar_item),
):
    print(label, value)

User ID: 1
Similar User: ['414' '288' '599' '480' '590' '91' '274' '380' '57' '469']
Recommended Items: [1580, 1127, 858, 1270, 1265, 588, 296, 2396, 593, 1653, 1198, 8961, 1201, 596, 1276, 1210, 2804, 1148, 1208, 2502, 1291, 924, 2791, 111, 1197, 47, 1206, 912, 1266, 1304, 1, 1682, 1089, 1242, 4993, 923, 1517, 1028, 4226, 4027, 2571, 1222, 1220, 1136, 6333, 1732, 1240, 1527, 3471, 318, 3671, 2403, 3499, 594, 2716, 2947, 3740, 6874, 2174, 1036, 2959, 2918, 1200, 2951, 1234, 7438, 589, 750, 1214, 50, 527, 293, 364, 356, 1213, 2529, 480, 1221, 1079, 8636, 3081, 32, 457, 608, 4306, 4963, 1617, 5952, 1196, 3362, 110, 541, 260, 2028, 1080, 2762, 1954, 2115, 1193, 3578, 1090, 2997, 551, 1278, 32587, 1097, 1258, 1387]
