In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pickle
from statistics import mean 
from sklearn.cluster import SpectralClustering
from nltk.cluster import KMeansClusterer, cosine_distance
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
!pip install pyamg
import warnings
warnings.filterwarnings('ignore')

Collecting pyamg
[?25l  Downloading https://files.pythonhosted.org/packages/28/a5/cffa0f1ca92f0b145f0989212a559ba616911354bf9551070954d7c83166/pyamg-4.1.0.tar.gz (749kB)
[K     |▍                               | 10kB 14.8MB/s eta 0:00:01[K     |▉                               | 20kB 14.2MB/s eta 0:00:01[K     |█▎                              | 30kB 10.4MB/s eta 0:00:01[K     |█▊                              | 40kB 8.8MB/s eta 0:00:01[K     |██▏                             | 51kB 5.5MB/s eta 0:00:01[K     |██▋                             | 61kB 5.4MB/s eta 0:00:01[K     |███                             | 71kB 6.1MB/s eta 0:00:01[K     |███▌                            | 81kB 6.8MB/s eta 0:00:01[K     |████                            | 92kB 6.7MB/s eta 0:00:01[K     |████▍                           | 102kB 7.1MB/s eta 0:00:01[K     |████▉                           | 112kB 7.1MB/s eta 0:00:01[K     |█████▎                          | 122kB 7.1MB/s eta 0:00:01[K   

In [3]:
users_df = pd.read_csv("/content/drive/MyDrive/CF/ml-100k/u.user",sep="|",header=None,names=("user id","age","gender","occupation","zip code"),index_col=False)
occupations_df = pd.read_csv("/content/drive/MyDrive/CF/ml-100k/u.occupation",header=None)
genre_df = pd.read_csv("/content/drive/MyDrive/CF/ml-100k/u.genre",sep="|",header=None,names=("genre","genre id"),index_col=False)
movies_df = pd.read_csv("/content/drive/MyDrive/CF/ml-100k/u.item",sep='|',header=None,names=('item id','movie name','release date','video release date',
              'IMDb URL','unknown','Action','Adventure','Animation',
              'Children\'s','Comedy','Crime','Documentary','Drama','Fantasy',
              'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi',
              'Thriller','War','Western'),encoding='unicode_escape',index_col=False)

In [4]:
def gen_folds(users,num_folds):
  user_count = users.shape[0]
  user_perfold = int(user_count / num_folds)
  user_ids = users.index.values
  folds_list = []
  for _ in range(num_folds):
    fold = np.random.choice(user_ids,user_perfold,replace=False)
    user_ids = np.setdiff1d(user_ids,fold)
    folds_list.append(fold.tolist())
  return folds_list


In [5]:
folds_list = gen_folds(users_df,5)

filename = "/content/drive/MyDrive/CF/" + "/cf_ass2_folds_list"
filedir = open(filename,"wb")
pickle.dump(folds_list,filedir)
filedir.close()

In [6]:
filename = "/content/drive/MyDrive/CF/" + "/cf_ass2_folds_list"
filedir = open(filename,"rb")
folds_list = pickle.load(filedir)
filedir.close()

In [7]:
ord_enc = OrdinalEncoder()
users_df_encoded = users_df.copy()
users_df_encoded["gender"] = ord_enc.fit_transform(users_df[["gender"]])
users_df_encoded["occupation"] = ord_enc.fit_transform(users_df[["occupation"]])
users_df_encoded["zip code"] = ord_enc.fit_transform(users_df[["zip code"]])

In [8]:
data_df = pd.read_csv('/content/drive/MyDrive/CF/ml-100k/u.data',sep='\t',header=None,names=('user id','item id','rating','timestamp'),index_col=False)

In [9]:
users = set(data_df['user id'].tolist())
user_dict = dict.fromkeys(users,None)
added_users = []
genre_ratings = []

for index,row in data_df.iterrows():
  user = row['user id']
  if user in added_users:
    continue
  else:
    added_users.append(user)
  genre_list = genre_df["genre"]
  genre_rating_freqs = dict.fromkeys(genre_list, 0)
  genre_rating_cummulative = dict.fromkeys(genre_list,0)
  genre_rating_average = dict.fromkeys(genre_list,0)
  user_df = data_df.loc[data_df['user id'] == user]
  for index,row in user_df.iterrows():
    movie_id = row["item id"]
    user_id = row["user id"]
    rating = row["rating"]
    movie = movies_df.loc[movies_df['item id'] == movie_id]
    for g in genre_list:
      if int(movie[g])==1:
        genre_rating_cummulative[g]+=rating
        genre_rating_freqs[g]+=1
  for g in genre_list:
    if genre_rating_freqs[g]>0:
      genre_rating_average[g] = genre_rating_cummulative[g] / genre_rating_freqs[g]
  user_dict[user] = genre_rating_average

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
filename = "/content/drive/MyDrive/CF/" + "/cf_ass2_user_dict"
filedir = open(filename,"rb")
user_dict = pickle.load(filedir)
filedir.close()
user_dict_df = pd.DataFrame(user_dict)
user_dict_df = user_dict_df.T
user_dict_df['user id'] = user_dict_df.index
combined_user_df = users_df_encoded.merge(user_dict_df, on="user id", how = 'inner')
users_df_encoded =  combined_user_df.copy()


In [17]:
'''
Technique 1 -KMeans with Euclidean Distance
'''


fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]



  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)


  default_rating = 3.5


  kmeans = KMeans(n_clusters=12,verbose=False)
  kmeans.fit(X_scaled)


  preds = kmeans.predict(X_scaled)

  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds
  

  test_preds = kmeans.predict(Xtest_scaled)
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
 
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.813804650958729, 0.8323435909722634, 0.8025621413677342, 0.8314086524322438, 0.7883115016055947]


In [18]:
'''
Technique 2 -KMeans++ with Euclidean Distance

'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]



  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)



  default_rating = 3.5


  kmeans = KMeans(n_clusters=14,verbose=False,init='k-means++')
  kmeans.fit(X_scaled)


  preds = kmeans.predict(X_scaled)


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  


  test_preds = kmeans.predict(Xtest_scaled)
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']
    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:

        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.8107732719305643, 0.835159211214934, 0.7955377989931669, 0.8322455095753855, 0.791495887081952]


In [19]:
'''
TECHNIQUE 3:
Algorithm: KMeans
Distance Measure: Cosine Distance

'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)

  default_rating = 3.5


  clusterer = KMeansClusterer(12, cosine_distance, repeats=5)
  preds = clusterer.cluster(X_scaled, True)


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  
  test_preds = clusterer.cluster(Xtest_scaled, True)
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:

        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.8399202774491609, 0.8732371135491324, 0.8566691945437009, 0.8755805937804125, 0.8254191268736647]


In [20]:
'''
Technique 4 -Spectral Clustering with eigen_solver=arpack
'''

fold_wise_mae = []
for fold in folds_list:
  
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)


  default_rating = 3.5


  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize").fit(X_scaled)


  preds = clustering.labels_


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds

  

  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds
  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.9698475515669187, 0.9854951338891925, 0.8900400069803968, 0.9607069844469714, 0.9468070457831682]


In [21]:
'''
Technique 5 -MiniBatch KMeans 
'''


fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)


  default_rating = 3.5


  kmeans = MiniBatchKMeans(n_clusters=14,batch_size=6)
  kmeans.fit(X_scaled)



  preds = kmeans.predict(X_scaled)


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds



  

  

  test_preds = kmeans.predict(Xtest_scaled)
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.8140023388898382, 0.8291831459741974, 0.7971730250735681, 0.8190424382206846, 0.7984012631374358]


In [14]:
'''
Technique 6 -Affinity Propagation with Affinity: Euclidean
'''


fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)

  default_rating = 3.5

  clustering = AffinityPropagation().fit(X_scaled)


  preds = clustering.predict(X_scaled)

  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds



  

  test_preds = clustering.predict(Xtest_scaled)
  test_df['cluster'] = test_preds
  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']
    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
   
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)
  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.8991114648151045, 0.8683248168609337, 0.8502429760903342, 0.791191020333896, 0.8358304003630737]


In [23]:
'''
Technique 7-DBSCAN
'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]



  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(-1)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)


  default_rating = 3.5

  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize",eigen_solver='amg').fit(X_scaled)

  preds = clustering.labels_

  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  
  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():
    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])


    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.9644043036055877, 1.1168826567763779, 0.946274901251775, 0.9673690360319873, 0.9821405833003015]


In [24]:
''' 
Technique 8 -Spectral Clustering with affinity=‘rbf’ and eigen_solver=amg

'''

fold_wise_mae = []
for fold in folds_list:

  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)



  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)




  default_rating = 3.5


  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize",affinity='rbf',eigen_solver='amg').fit(X_scaled)


  preds = clustering.labels_


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  
  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])
 
    family_users = train_df.loc[train_df['cluster'] == test_cluster]
 
    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:

        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.9599135569084977, 1.034979488554565, 0.8530972327203165, 0.9467892257383255, 0.9307938618414672]


In [25]:
''' 
Technique 9 -Spectral Clustering with affinity=‘rbf’and eigen_solver=arpack

'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)



  default_rating = 3.5

  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize",affinity='rbf',eigen_solver='arpack').fit(X_scaled)

  preds = clustering.labels_

  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds



  
  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():
    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']
    
    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])


    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
        
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.9697970225741458, 1.0608266036774663, 1.0093419585317152, 0.9420039176974044, 1.057755428389918]


In [26]:
''' 
Technique 10 -Spectral Clustering with affinity=‘laplacian’ and eigen_solver=arpack
'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]



  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)




  default_rating = 3.5


  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize",affinity='laplacian',eigen_solver='arpack').fit(X_scaled)


  preds = clustering.labels_


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds



  

  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])


    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:

        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [1.0964462963723651, 0.9518038201374702, 0.9253991953826707, 0.9813854968603333, 0.9450244996295847]


In [27]:
''' 
Technique 11-Spectral Clustering with affinity=‘laplacian’ and eigen_solver=amg

'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]



  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)



  default_rating = 3.5

  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize",affinity='laplacian',eigen_solver='amg').fit(X_scaled)


  preds = clustering.labels_

  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds



  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():
  
    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
      
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.9806919419405222, 1.0136013378097017, 0.9554112386155827, 0.9490965821691021, 0.9320855729695119]


In [11]:
''' 
Technique 12-Spectral Clustering with affinity=‘nearest_neighbors’ and eigen_solver=arpack

'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)

  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)


  default_rating = 3.5


  clustering = SpectralClustering(n_clusters=14,assign_labels="discretize",affinity='nearest_neighbors',eigen_solver='arpack').fit(X_scaled)


  preds = clustering.labels_


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds

  
  clustering.fit(Xtest_scaled)
  test_preds = clustering.labels_
  test_df['cluster'] = test_preds



  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']
    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])


    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:

        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)
  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [1.0105053934445523, 0.9579353344578693, 0.9203325082714068, 0.9396898879269633, 0.9318787814932672]


In [31]:
'''
Technique 13-AgglomerativeClustering

'''

fold_wise_mae = []
for fold in folds_list:

  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]



  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)



  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)




  default_rating = 3.5


  clustering = AgglomerativeClustering(n_clusters=14).fit(X_scaled)

  preds = clustering.labels_


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  

  clustering.fit(Xtest_scaled) 
  test_preds =  clustering.labels_
  test_df['cluster'] = test_preds



  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']
    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.9845702749185244, 0.9829342703832048, 1.0636163047242009, 1.003536631916903, 0.9234744670729821]


In [32]:
'''
Technique 14-DBSCAN

'''

fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]


  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)


  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)


  default_rating = 3.5


  clustering = DBSCAN().fit(X_scaled)
  preds = clustering.labels_

  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  clustering.fit(Xtest_scaled) 
  test_preds =  clustering.labels_
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])


    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:

        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  
  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.8267611772567701, 0.8264920720669157, 0.7958471711997138, 0.8382609677832437, 0.8012188094839464]


In [13]:
'''
Technique 15-MeanShift

'''
from sklearn.cluster import MeanShift
fold_wise_mae = []
for fold in folds_list:
  train_df = users_df_encoded.loc[~users_df_encoded['user id'].isin(fold)]
  test_df = users_df_encoded.loc[users_df_encoded['user id'].isin(fold)]

  train_ratings = data_df.loc[~data_df['user id'].isin(fold)]
  test_ratings = data_df.loc[data_df['user id'].isin(fold)]

  pivot_table_df = train_ratings.pivot_table(values='rating',columns='item id',index='user id')
  pivot_table_df_nn = pivot_table_df.fillna(0)

  scale = StandardScaler()
  X_scaled = scale.fit_transform(train_df)
  Xtest_scaled = scale.fit_transform(test_df)




  default_rating = 3.5


  clustering = MeanShift().fit(X_scaled)

  preds = clustering.predict(X_scaled)


  pca = PCA(n_components=2)
  X_pca = pca.fit_transform(X_scaled)

  train_df['cluster'] = preds


  

  
  test_preds = clustering.predict(Xtest_scaled)
  test_df['cluster'] = test_preds

  test_mae = []
  for index, row in test_ratings.iterrows():

    test_user = row['user id']
    test_movie = row['item id']
    test_rating = row['rating']

    test_cluster = int(test_df.loc[test_df['user id'] == test_user]["cluster"])

    family_users = train_df.loc[train_df['cluster'] == test_cluster]

    family_users_ids = family_users['user id'].tolist()
    count_fam = 0
    cum_fam = 0
    for f_user in family_users_ids:
      if test_movie in pivot_table_df.columns:
        
        if pivot_table_df[test_movie][f_user] > -1:
          count_fam+=1
          cum_fam+=pivot_table_df[test_movie][f_user]
    if count_fam != 0:
      average_rating = cum_fam / count_fam
    else:
      average_rating = default_rating
    error = abs(average_rating - test_rating)
    test_mae.append(error)

  mae = mean(test_mae)
  fold_wise_mae.append(mae)

print("FOLD WISE MAE",fold_wise_mae)

FOLD WISE MAE [0.8542659184604061, 0.8400035281527659, 0.8048024392812169, 0.7950289426775804, 0.8008692547294297]
