In [1]:
import pandas as pd  
from sklearn.model_selection import GroupShuffleSplit

In [2]:
import os
import os.path
from google.colab import drive

# Make sure the shared "Web Mining" folder on Google Drive is reachable:
# mount the drive when the folder is not visible yet, otherwise just report
# that it is already there. Either way, continue from inside that folder so
# the relative paths used by later cells resolve.
if not os.path.isdir("/content/drive/MyDrive/Web Mining"):
  drive.mount('/content/drive', force_remount=True)
else:
  print("Google Drive already mounted")
os.chdir("/content/drive/MyDrive/Web Mining")

Mounted at /content/drive


In [3]:
# Load the user table; the "Group" column drives the group-aware split below.
users = pd.read_csv("./Abhay/users_grouped.csv", skipinitialspace=True)

# Hold out 20% of the groups for testing so that no group is shared between
# train and test. Only the first of the generated splits is consumed.
gs = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
train_ix, test_ix = next(gs.split(users, groups=users["Group"]))

# split() yields positional row indices, so select by position (.iloc).
# Label-based .loc only gives the same rows while the index happens to be the
# default 0..n-1 range.
train = users.iloc[train_ix]
test = users.iloc[test_ix]

# Only `users` loses these columns; `train` and `test` were sliced above and
# still contain them.
users = users.drop(columns=['Gender', 'Age', 'Occupation'])

In [4]:
# Read the ratings and discard the timestamp column.
ratings = pd.read_csv("./ratings_new.csv").drop(columns='Timestamp')
print(ratings.head())

# Actual ratings restricted to users that belong to the test split,
# re-indexed from zero.
act_ratings_test = (
    ratings
    .loc[ratings['UserID'].isin(test['UserID'])]
    .reset_index(drop=True)
)

print(len(act_ratings_test))

   UserID  MovieID  Rating
0       1     1193       5
1       1      661       3
2       1      914       3
3       1     3408       4
4       1     2355       5
209646


In [36]:
# hybrid by 4 methods 
def hybrid(method, path='./Final Data'):
  """Merge the 'cf' and 'cb' prediction files for one strategy and combine them.

  Reads ``user_ratings_cf_<method>.csv`` and ``user_ratings_cb_<method>.csv``
  from ``path``, renames their ``Rating`` columns to ``Rating_cf`` and
  ``Rating_cb``, joins the two frames on (UserID, MovieID) and passes the
  result to ``aggregation`` to compute the combined rating.

  Args:
    method: Strategy name. It is used both to build the two file names and as
      the ``method`` argument of ``aggregation``.
    path: Directory containing the two CSV files. Defaults to './Final Data'.

  Returns:
    The value returned by ``aggregation`` for the merged frame.
  """
  input_file_cf = os.path.join(path, 'user_ratings_cf_' + method + '.csv')
  input_file_cb = os.path.join(path, 'user_ratings_cb_' + method + '.csv')

  cf_df = pd.read_csv(input_file_cf).rename(columns={'Rating': 'Rating_cf'})
  cb_df = pd.read_csv(input_file_cb).rename(columns={'Rating': 'Rating_cb'})

  # Default (inner) merge: only (UserID, MovieID) pairs present in both files
  # are kept.
  merged_df = pd.merge(cf_df, cb_df, on=['UserID', 'MovieID'])

  # Where the 'cb' rating is missing, fall back to the 'cf' rating. The result
  # is assigned back to the column: an in-place fillna on `merged_df.Rating_cb`
  # acts on an intermediate object and does not update the frame when pandas
  # Copy-on-Write is active.
  merged_df['Rating_cb'] = merged_df['Rating_cb'].fillna(merged_df['Rating_cf'])

  return aggregation(merged_df, method)

In [49]:
def aggregation(Database, method, threshold=2):
  """Combine ``Rating_cf`` and ``Rating_cb`` into a ``Final_rating`` column.

  Supported values of ``method``:
    * 'average'                - mean of the two ratings.
    * 'least_misery'           - the smaller of the two ratings.
    * 'most_pleasure'          - the larger of the two ratings.
    * 'average_without_misery' - a rating below ``threshold`` is replaced by
      the other rating before averaging; when both are below ``threshold``
      both are set to 0, so the final rating is 0. In this mode the returned
      ``Rating_cf`` / ``Rating_cb`` columns hold the adjusted values.

  Args:
    Database: DataFrame with numeric ``Rating_cf`` and ``Rating_cb`` columns.
    method: One of the strategy names listed above.
    threshold: Cut-off used by 'average_without_misery'. Defaults to 2.

  Returns:
    A copy of ``Database`` with ``Final_rating`` added. The input frame is
    never modified, whichever strategy is chosen.

  Raises:
    ValueError: If ``method`` is not one of the supported names (previously
      this case silently returned None).
  """
  result = Database.copy()
  cf = result['Rating_cf']
  cb = result['Rating_cb']

  if method == 'average':
    result['Final_rating'] = (cf + cb) / 2
  elif method == 'least_misery':
    # Row-wise minimum; a missing value in one column yields the other one.
    result['Final_rating'] = result[['Rating_cf', 'Rating_cb']].min(axis=1)
  elif method == 'most_pleasure':
    # Row-wise maximum; a missing value in one column yields the other one.
    result['Final_rating'] = result[['Rating_cf', 'Rating_cb']].max(axis=1)
  elif method == 'average_without_misery':
    # All masks come from the untouched input ratings, so the outcome does not
    # depend on the order in which the columns are patched.
    cf_low = cf < threshold
    cb_low = cb < threshold
    both_low = cf_low & cb_low
    # One rating too low -> borrow the other; both too low -> penalise with 0.
    adjusted_cf = cf.mask(cf_low, cb).mask(both_low, 0)
    adjusted_cb = cb.mask(cb_low, cf).mask(both_low, 0)
    result['Rating_cf'] = adjusted_cf
    result['Rating_cb'] = adjusted_cb
    result['Final_rating'] = (adjusted_cf + adjusted_cb) / 2
  else:
    raise ValueError(
        "Unknown aggregation method %r; expected 'average', 'least_misery', "
        "'most_pleasure' or 'average_without_misery'" % (method,))

  return result

In [39]:
from sklearn.metrics import mean_squared_error 

def evaluation(database, verbose=True):
  """Return the RMSE of ``Final_rating`` against the actual test ratings.

  ``database`` is inner-joined with the notebook-level ``act_ratings_test``
  frame on (UserID, MovieID); the error is computed over the matching rows
  between the actual ``Rating`` column and the predicted ``Final_rating``.

  Args:
    database: DataFrame with ``UserID``, ``MovieID`` and ``Final_rating``.
    verbose: When True (the default) the joined frame is printed first.

  Returns:
    The root mean squared error as a float.
  """
  calculation = pd.merge(database, act_ratings_test, on=['UserID', 'MovieID'])
  if verbose:
    print(calculation)
  realVals = calculation['Rating']
  predictedVals = calculation['Final_rating']
  # Take the square root ourselves instead of passing squared=False: that
  # keyword was deprecated in scikit-learn 1.4 and is rejected by newer
  # releases, while the plain MSE call works on every version.
  return mean_squared_error(realVals, predictedVals) ** 0.5

In [40]:
# Build the hybrid predictions for the 'average' strategy, then print the RMSE
# returned by evaluation() (which also prints the joined frame it scores).
merged_ave = hybrid('average') 
print(evaluation(merged_ave))

        UserID  MovieID  Rating_cf  Rating_cb  Final_rating  Rating
0            2     1357   3.650502   3.465516      3.558009       5
1            2     3068   3.589013   3.318433      3.453723       4
2            2     1537   3.635573   3.380338      3.507955       4
3            2      647   3.565670   3.362192      3.463931       3
4            2     2194   3.860406   3.441492      3.650949       4
...        ...      ...        ...        ...           ...     ...
209641    6038     1387   3.796086   3.180861      3.488473       2
209642    6038     2700   3.666036   3.485532      3.575784       1
209643    6038     2716   3.774065   3.427263      3.600664       3
209644    6038     3396   3.645155   3.645155      3.645155       3
209645    6038     1079   3.736976   3.460307      3.598641       5

[209646 rows x 6 columns]
1.0667663472767426


In [41]:
# Build the hybrid predictions for the 'least_misery' strategy, show them, and
# print the RMSE returned by evaluation().
merged_lm = hybrid('least_misery') 
print(merged_lm)
print(evaluation(merged_lm))

        UserID  MovieID  Rating_cf  Rating_cb  Final_rating
0            2     1357   3.650502   3.465516      3.465516
1            2     3068   3.589013   3.318433      3.318433
2            2     1537   3.635573   3.380338      3.380338
3            2      647   3.565670   3.362192      3.362192
4            2     2194   3.860406   3.441492      3.441492
...        ...      ...        ...        ...           ...
209641    6038     1387   3.796086   3.180861      3.180861
209642    6038     2700   3.666036   3.485532      3.485532
209643    6038     2716   3.774065   3.427263      3.427263
209644    6038     3396   3.645155   3.645155      3.645155
209645    6038     1079   3.736976   3.460307      3.460307

[209646 rows x 5 columns]
        UserID  MovieID  Rating_cf  Rating_cb  Final_rating  Rating
0            2     1357   3.650502   3.465516      3.465516       5
1            2     3068   3.589013   3.318433      3.318433       4
2            2     1537   3.635573   3.380338    

In [44]:
# Build the hybrid predictions for the 'most_pleasure' strategy; the RMSE
# returned by evaluation() is displayed as the cell's result.
merged_mp = hybrid('most_pleasure') 
evaluation(merged_mp)

        UserID  MovieID  Rating_cf  Rating_cb  Final_rating  Rating
0            2     1357   3.650502   3.465516      3.650502       5
1            2     3068   3.589013   3.318433      3.589013       4
2            2     1537   3.635573   3.380338      3.635573       4
3            2      647   3.565670   3.362192      3.565670       3
4            2     2194   3.860406   3.441492      3.860406       4
...        ...      ...        ...        ...           ...     ...
209641    6038     1387   3.796086   3.180861      3.796086       2
209642    6038     2700   3.666036   3.485532      3.666036       1
209643    6038     2716   3.774065   3.427263      3.774065       3
209644    6038     3396   3.645155   3.645155      3.645155       3
209645    6038     1079   3.736976   3.460307      3.736976       5

[209646 rows x 6 columns]


1.0566940474685238

In [50]:
# Build the hybrid predictions for the 'average_without_misery' strategy; the
# RMSE returned by evaluation() is displayed as the cell's result.
merged_awm = hybrid('average_without_misery') 
evaluation(merged_awm)

        UserID  MovieID  Rating_cf  Rating_cb  Final_rating  Rating
0            2     1357   3.650502   3.465516      3.558009       5
1            2     3068   3.589013   3.318433      3.453723       4
2            2     1537   3.635573   3.380338      3.507955       4
3            2      647   3.565670   3.362192      3.463931       3
4            2     2194   3.860406   3.441492      3.650949       4
...        ...      ...        ...        ...           ...     ...
209641    6038     1387   3.796086   3.180861      3.488473       2
209642    6038     2700   3.666036   3.485532      3.575784       1
209643    6038     2716   3.774065   3.427263      3.600664       3
209644    6038     3396   3.645155   3.645155      3.645155       3
209645    6038     1079   3.736976   3.460307      3.598641       5

[209646 rows x 6 columns]


1.0677142577829206

Baselines (no grouping or aggregation: single-user predictions)

In [57]:
# Baseline predictions from the Single-CF file; the rating column is renamed
# to 'Pred_Rating' so it does not clash with the actual 'Rating' column when
# the two frames are joined for scoring.
single_cf = (
    pd.read_csv('./Final Data/Single-CF-test.csv')
    .rename(columns={'Rating': 'Pred_Rating'})
)
single_cf

Unnamed: 0,UserID,MovieID,Pred_Rating
0,2,1357,3.650502
1,2,3068,3.589013
2,2,1537,3.635573
3,2,647,3.565670
4,2,2194,3.860406
...,...,...,...
209641,6038,1387,3.796086
209642,6038,2700,3.666036
209643,6038,2716,3.774065
209644,6038,3396,3.645155


In [56]:
# Baseline predictions from the single_bert file: rename the rating column to
# 'Pred_Rating' and discard the 'Ranking' column, which is not needed here.
single_bert = (
    pd.read_csv('./Final Data/single_bert_test.csv')
    .rename(columns={'Rating': 'Pred_Rating'})
    .drop(columns=['Ranking'])
)

In [58]:
def evaluation_single(database, verbose=True):
  """Return the RMSE of ``Pred_Rating`` against the actual test ratings.

  ``database`` is inner-joined with the notebook-level ``act_ratings_test``
  frame on (UserID, MovieID); the error is computed over the matching rows
  between the actual ``Rating`` column and the predicted ``Pred_Rating``.

  Args:
    database: DataFrame with ``UserID``, ``MovieID`` and ``Pred_Rating``.
    verbose: When True (the default) the joined frame is printed first.

  Returns:
    The root mean squared error as a float.
  """
  calculation = pd.merge(database, act_ratings_test, on=['UserID', 'MovieID'])
  if verbose:
    print(calculation)
  realVals = calculation['Rating']
  predictedVals = calculation['Pred_Rating']
  # Take the square root ourselves instead of passing squared=False: that
  # keyword was deprecated in scikit-learn 1.4 and is rejected by newer
  # releases, while the plain MSE call works on every version.
  return mean_squared_error(realVals, predictedVals) ** 0.5

In [59]:
# RMSE of the single_cf baseline; evaluation_single() also prints the joined
# frame it scores.
evaluation_single(single_cf)

        UserID  MovieID  Pred_Rating  Rating
0            2     1357     3.650502       5
1            2     3068     3.589013       4
2            2     1537     3.635573       4
3            2      647     3.565670       3
4            2     2194     3.860406       4
...        ...      ...          ...     ...
209641    6038     1387     3.796086       2
209642    6038     2700     3.666036       1
209643    6038     2716     3.774065       3
209644    6038     3396     3.645155       3
209645    6038     1079     3.736976       5

[209646 rows x 4 columns]


1.0827400335072526

In [60]:
# RMSE of the single_bert baseline; evaluation_single() also prints the joined
# frame it scores.
evaluation_single(single_bert)

        UserID  MovieID  Pred_Rating  Rating
0         3949      161     3.602635       4
1         3949      458     3.578604       2
2         3949     2922     3.561503       4
3         3949     1208     3.560698       4
4         3949     1250     3.560206       5
...        ...      ...          ...     ...
185429     926      661     3.442301       5
185430     926     2797     3.416370       3
185431     926      596     3.389020       4
185432     926       73     3.026847       5
185433     926     2987     2.958557       4

[185434 rows x 4 columns]


1.0991593357051375