# User-based collaborative filtering with cosine similarity

Recommend 12 items that returning customers have not yet purchased, using user-based collaborative filtering.

- Use transaction data from the 1st-7th and the 15th-21st day of each month (2018-10 ~ 2020-08) to build user-based cosine similarity
- Use the most similar customers (users) to identify 12 items that were not purchased by the target customers
- The 12 recommended items are validated using data from the 8th-14th (for the 1st-7th data) and the 22nd-28th (for the 15th-21st data) day of each month
- Calculate AOP@12 and MAP@12 to evaluate the results

In [1]:
import pandas as pd
import numpy as np
import collaborative_filter as cf

In [None]:
%%time
#load the raw transaction log; later cells filter it on t_dat and keep
#the customer_id and article_id columns
transaction_raw = pd.read_csv("../data/transactions_train.csv")

In [None]:
#year-month labels ('YYYY-MM') of every evaluated month, in chronological order
#NOTE(review): '2019-08' is not part of the evaluated months although the
#notebook header describes 2018-10 ~ 2020-08 -- confirm the gap is intended
eval_month = [
    label
    for year in (2018, 2019, 2020)
    for label in (f'{year}-{month:02d}' for month in range(1, 13))
    if '2018-10' <= label <= '2020-08' and label != '2019-08'
]

#day-of-month boundaries, laid out as two groups of four:
#(train start, train end, test start, test end) for each half of the month
traintest_day = [f'{day:02d}' for day in (1, 7, 8, 14, 15, 21, 22, 28)]

In [None]:
%%time

#accumulators for the per-period metrics; one entry is appended per
#train/test period, in the order the periods are processed
AOP_results = []
MAP_results = []

#main analysis: two train/test periods for every month in eval_month
for month in eval_month:
    #offset 0 picks the first four entries of traintest_day, offset 4 the last four
    for offset in (0, 4):
        train_start, train_end, test_start, test_end = [
            month + '-' + day for day in traintest_day[offset:offset + 4]
        ]

        #both bounds of each window are inclusive
        mask_train = transaction_raw.t_dat.between(train_start, train_end)
        X_train = transaction_raw.loc[mask_train, ['customer_id', 'article_id']].reset_index(drop=True)
        mask_test = transaction_raw.t_dat.between(test_start, test_end)
        X_test = transaction_raw.loc[mask_test, ['customer_id', 'article_id']].reset_index(drop=True)

        #Get recommended items via user-based collaborative filtering
        #(only the first 100000 training rows are handed to the filter)
        recommend_items = cf.all_user_collaborative_filter(
            X_train.head(100000),
            similar_user_number=250,
            max_recommend_items=12,
        )

        #score the recommendations against the test window: AOP@12 first, then MAP@12
        AOP_results.append(cf.AOP(recommend_items, X_test))
        MAP_results.append(cf.MAP(recommend_items, X_test))

In [None]:
#report how many periods were scored for each metric
for label, results in (("AOP size:", AOP_results), ("MAP size:", MAP_results)):
    print(label, len(results))

In [None]:
#create a timeline list for AOP and MAP: two date labels per evaluated month
#NOTE(review): the labels use day 15 and day 22, while the periods in the
#analysis loop start on day 01 and day 15 -- confirm the intended labelling
timeline = [month + suffix for month in eval_month for suffix in ('-15', '-22')]

In [None]:
#collect the timeline labels and both metrics into one table
result_df = pd.DataFrame(dict(timeline=timeline, AOP=AOP_results, MAP=MAP_results))

#write the table to csv
result_df.to_csv("../data/userCF_results.csv")

#preview the first rows as the cell output
result_df.head()

In [35]:
#sanity check: print one line for every user that received fewer than 12 items
short_lists = [items for _, items in recommend_items.items() if len(items) < 12]
for _ in short_lists:
    print("Items < 12")

In [7]:
#ad-hoc run on a single window (both bounds inclusive)
#NOTE(review): the test window is identical to the training window, so the
#metric below is scored on the same transactions that produced the
#recommendations -- confirm this is intended
train_window = ('2019-05-01', '2019-05-10')
test_window = ('2019-05-01', '2019-05-10')

mask_train = transaction_raw.t_dat.between(*train_window)
X_train = transaction_raw.loc[mask_train, ['customer_id', 'article_id']].reset_index(drop=True)
mask_test = transaction_raw.t_dat.between(*test_window)
X_test = transaction_raw.loc[mask_test, ['customer_id', 'article_id']].reset_index(drop=True)

#recommend from the first 100000 training rows, then score MAP@12 on the test rows
recommend_items = cf.all_user_collaborative_filter(
    X_train.head(100000),
    similar_user_number=100,
    max_recommend_items=12,
)
MAP12_cf = cf.MAP(recommend_items, X_test)
