### Recommendation System CF using 

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
import numpy as np
import pandas as pd
import heapq
from sklearn import preprocessing
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [2]:
spark = SparkSession.builder.appName("Charles").getOrCreate()
df = spark.read.csv('./data/train.dat', header=True)

In [3]:
data_schema = [StructField('UserID',IntegerType(),True),
              StructField('ItemID',IntegerType(),True),
              StructField('Rating',IntegerType(),True)]
final_structure = StructType(fields=data_schema)
df = spark.read.csv('./data/train.dat', header=True, schema=final_structure)

In [5]:
print("Unique users: {}".format(df.toPandas()['UserID'].nunique()))
print("Unique items: {}".format(df.toPandas()['ItemID'].nunique()))

Unique users: 943
Unique items: 1659


In [6]:
pivotDF = df.groupBy("ItemID").pivot("UserID").avg("Rating")
pivotDF = pivotDF.fillna(0)
pd_ = pivotDF.toPandas()
pd_ = pd_.set_index('ItemID')
pd_

Unnamed: 0_level_0,57,58,59,60,61,62,63,64,65,66,...,990,991,992,993,994,995,996,997,998,999
ItemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1645,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
1342,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
1580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(pd_) #normalised numpy array
# Covert to dataframe
pd_norm = pd.DataFrame(x_scaled, columns=pd_.columns)

In [8]:
pd_norm

Unnamed: 0,57,58,59,60,61,62,63,64,65,66,...,990,991,992,993,994,995,996,997,998,999
0,5.627404,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,6.944980,-0.490589,-0.341386,-0.283335,-0.105240,-0.160207,-0.181468,3.274580
1,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,2.082927,-0.107837,-0.170708,-0.490589,3.810813,-0.283335,-0.105240,-0.160207,-0.181468,-0.389610
2,-0.222867,-0.175095,-0.08391,1.289587,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,-0.490589,-0.341386,-0.283335,4.153137,-0.160207,-0.181468,-0.389610
3,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,-0.490589,-0.341386,-0.283335,-0.105240,-0.160207,-0.181468,0.343228
4,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,-0.490589,-0.341386,-0.283335,-0.105240,-0.160207,-0.181468,-0.389610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1654,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,-0.490589,-0.341386,-0.283335,-0.105240,-0.160207,4.160668,-0.389610
1655,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,2.488501,-0.341386,-0.283335,-0.105240,-0.160207,-0.181468,-0.389610
1656,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,-0.490589,-0.341386,-0.283335,-0.105240,-0.160207,-0.181468,-0.389610
1657,-0.222867,-0.175095,-0.08391,-0.188937,-0.134393,-0.176035,-0.120238,-0.21387,-0.103169,-0.243348,...,-0.322346,-0.107837,-0.170708,-0.490589,-0.341386,-0.283335,-0.105240,-0.160207,-0.181468,-0.389610


In [9]:
def get_recoms(df, topn):
    item_factors = df.values
    item_mappings = dict(zip(np.arange(pd_.shape[0]), df.index.values))
    num = topn
    recoms = {}

    for i, item_factor in enumerate(item_factors):
        # user vector
        user_vector = item_factor.reshape(-1,1)
        itemid = item_mappings[i]

        # cosine similarity
        distance = cosine_similarity(item_factors, user_vector.T )
    
        norms = np.sqrt(np.linalg.norm(item_factors, axis=1).reshape(-1, 1))
        dist = np.multiply(distance, norms).reshape(-1)
        
        # closest ids
        close_idx = heapq.nlargest(num+1, range(len(dist)), dist.take)
        close_pids = np.vectorize(item_mappings.__getitem__)(close_idx)
        close_pids = close_pids[(close_pids != itemid)]
        close_pids = close_pids.tolist()

        distance = distance.tolist()

        similarity_measure = []
        for x in close_idx[1::]:
            similarity_measure.append(distance[x])
        sim_list = [item for sublist in similarity_measure for item in sublist]
        recoms[itemid] = [close_pids, sim_list]
        
    return recoms
similar_items = get_recoms(pd_, 3)

In [10]:
test = spark.read.csv('./data/test.dat', header=True, inferSchema=True)

In [11]:
df_pd = df.toPandas()
test_pd = test.toPandas()

User_meanratings = df_pd.groupby('UserID', as_index=False)['Rating'].mean()
Item_meanratings = df_pd.groupby('ItemID', as_index=False)['Rating'].mean()
mean_rating = df_pd.Rating.mean()
print("Mean rating of all movies {}".format(mean_rating))

Mean rating of all movies 3.5303532266343147


In [12]:
def get_ratings(userid, closest_items):
    rating = []
    for i in range(len(closest_items)):
        try:
            item_rating = df_pd.loc[(df_pd['UserID'] == userid) & (df_pd['ItemID'] == closest_items[i]), 'Rating'].iloc[0]
        except:
            item_rating = Item_meanratings.loc[Item_meanratings['ItemID'] == closest_items[i], 'Rating'].iloc[0]
        rating.append(item_rating)
    return rating

def compute_ratings(similarity_scores, all_ratings):
    weighted_sum = 0
    for i in range(len(similarity_scores)):
        weighted_sum += similarity_scores[i]*all_ratings[i]
        value = weighted_sum/sum(similarity_scores)
    return value

In [13]:
def predictRating():
    predictions = []
    avg_user_rating, avg_item_rating = 0, 0

    for i, row in test_pd.iterrows():
        try:
            avg_user_rating = User_meanratings.loc[User_meanratings['UserID'] == row['UserID'], 'Rating'].iloc[0]
            closest_items = similar_items[row['ItemID']][0]
            similarity_scores = similar_items[row['ItemID']][1]
            
            avg_item_rating = Item_meanratings.loc[Item_meanratings['ItemID'] == closest_items[i], 'Rating'].iloc[0]
            item_topredict_avg_rating = Item_meanratings.loc[Item_meanratings['ItemID'] == row['ItemID'], 'Rating'].iloc[0]

            userid = row['UserID']
            # to get the user's ratings of the similar items
            all_ratings = get_ratings(userid, closest_items)
            # compute the rating
            predicted_rating = compute_ratings(similarity_scores, all_ratings)

        except:
            # if the item does not exist in the train data. Return the avg rating of the user
            predicted_rating = avg_user_rating
        # if the user has not rated the similar items then average rating of the item to be predicted
        if predicted_rating == 0:
            predicted_rating = item_topredict_avg_rating

        predictions.append(predicted_rating)
    return predictions

predictions = predictRating()

In [14]:
predictions

[3.817777778279101,
 4.342535621290238,
 4.359766922747771,
 3.206896551724138,
 4.0,
 3.920245398773006,
 3.0114942528735633,
 3.130434782608696,
 2.735632183908046,
 3.340659340659341,
 3.72,
 2.7107438016528924,
 4.004424778761062,
 3.8045977011494254,
 3.0338345864661656,
 2.473684210526316,
 3.8,
 3.3656716417910446,
 3.8659217877094973,
 3.391891891891892,
 3.581151832460733,
 3.248407643312102,
 3.597122302158273,
 2.4210526315789473,
 3.533333333333333,
 3.4216867469879517,
 3.375,
 3.3759398496240602,
 3.6133333333333333,
 3.4242424242424243,
 3.75,
 3.1143695014662756,
 3.0035714285714286,
 3.340659340659341,
 3.3962264150943398,
 3.6052631578947367,
 3.9923664122137406,
 3.881720430107527,
 3.7976190476190474,
 3.662037037037037,
 3.7791411042944785,
 3.4879227053140096,
 3.7491749174917492,
 4.10752688172043,
 3.4390243902439024,
 3.465686274509804,
 3.5344827586206895,
 2.7107438016528924,
 4.135483870967742,
 4.222222222222222,
 3.0,
 3.1196172248803826,
 3.83157894736842