In [7]:
import pandas as pd
import zipfile
import csv
import random
import time
import numpy as np
from scipy import sparse as sp
from scipy.sparse.linalg import norm
import sklearn.preprocessing as pp
import pickle  

In [2]:
zf = zipfile.ZipFile('h-and-m-personalized-fashion-recommendations.zip') 

In [3]:
#Customers datafile
customers = pd.read_csv(zf.open('customers.csv'))

In [2]:
### Start Here

filehandler = open("transactions_6mth.pkl", "rb")
transactions_6mth = pickle.load(filehandler)
filehandler.close()
transactions_6mth

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
16118327,2019-09-01,02dd164d87472e7311cd12ea8df993b4a084ad528ed4af...,552716001,0.033881,2
16118467,2019-09-01,046c9b461a6f7a92512c73080f9dd85980e2e421a49f11...,562245046,0.033881,2
16118468,2019-09-01,046c9b461a6f7a92512c73080f9dd85980e2e421a49f11...,562245046,0.033881,2
16118668,2019-09-01,06207e084340d4a79aab1bfa0e5b5a63d9b7e89e665112...,160442043,0.013542,1
16119188,2019-09-01,0af4ebb4b23fc037e593c0e9a33cf53fc0ee37dd6ce35c...,711053007,0.013542,1
...,...,...,...,...,...
31788300,2020-09-22,ffc2e7c210e3ea602e6d229116773cc0588c929f8cc70b...,858856002,0.042356,1
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1


In [3]:
test = transactions_6mth.loc[transactions_6mth.t_dat >= '2020-09-16',:]
train = transactions_6mth.loc[transactions_6mth.t_dat < '2020-09-16',:]

In [40]:
print(train.article_id.nunique())
print(test.article_id.nunique())
print(transactions_6mth.article_id.nunique())

6185
6206
6206


# Data Prep

In [5]:
customerIds = train.customer_id.unique()
customerIds.sort()
articleIds = train.article_id.unique()
articleIds.sort()

m = customerIds.size
n = articleIds.size
numTrans = len(train)

customerIds_to_customerIdsIDX = dict(zip(customerIds, range(0, customerIds.size )))
customerIDX_to_customerId = dict(zip(range(0, customerIds.size), customerIds))

itemId_to_itemIDX = dict(zip(articleIds, range(0, articleIds.size)))
itemIDX_to_itemId = dict(zip(range(0, articleIds.size), articleIds))

train['rating'] = list(np.full(len(train), 1))

df = pd.concat([train['customer_id'].map(customerIds_to_customerIdsIDX), train['article_id'].map(itemId_to_itemIDX), train['rating']], axis=1)
df.columns = ['customer', 'item', 'rating']
df.sort_values(by = 'customer', inplace = True)


display(df.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['rating'] = list(np.full(len(train), 1))


Unnamed: 0,customer,item,rating
29794828,0,3817,1
31140487,0,5156,1
30180376,0,4502,1
31412223,0,4103,1
31140486,0,5055,1


In [6]:
#df = df.loc[df.customer < 100,:]

In [7]:
df_trans_pic = pd.concat([train['t_dat'], train['customer_id'].map(customerIds_to_customerIdsIDX), train['article_id'].map(itemId_to_itemIDX)], axis=1)
df_trans_pic.columns = ['date','customer', 'item']
df_trans_pic

Unnamed: 0,customer,item,rating
29794828,0,3817,1
31140487,0,5156,1
30180376,0,4502,1
31412223,0,4103,1
31140486,0,5055,1
...,...,...,...
28223529,11742,2550,1
28516682,11742,1927,1
28516683,11742,4425,1
30916081,11742,2680,1


# Create ratings matrix

In [8]:
R = sp.csr_matrix((df.rating, (df.customer, df.item)))
R_dok = R.todok()

In [9]:
m = R.shape[0]
n = R.shape[1]
numRatings = R.count_nonzero()

print("There are", m, "users,", n, "items and", numRatings, "ratings.")

There are 11743 users, 6185 items and 194599 ratings.


In [10]:
R_copy = R.copy()
user_cnts = (R != 0).sum(axis=1).A1
user_norm = np.sqrt(R_copy.power(2).sum(1)).A1

# set scaling of rows with 0 zero to 0
zero_index = user_norm == 0
user_norm[zero_index] = 1
user_scales = 1 / user_norm
user_scales[zero_index] = 0

R_copy.data = R_copy.data * np.repeat(user_scales, user_cnts)

In [11]:
def compute_pairwise_user_similarity(u_id, v_id):
    u = R[u_id,:].copy()
    v = R[v_id,:].copy()
    
    numerator = u.dot(v.T)
    denominator = norm(u)*norm(v)
    #print(u)
    #print(norm(v))
    #print(v)
    
    if denominator == 0:
        similarity = 0.;
    else:
        similarity = (numerator/denominator).A.item()
    
    return similarity

In [12]:
def compute_user_similarities(u_id):
    uU = np.empty((m,))
        
    uU = (R_copy.dot(R_copy[u_id,:].T)).A.flatten()
    
    ##############################
    ## Mat.dot(Vec.T) is actually same as Mat @ Vec.T
    ##############################
    
    
    # YOUR CODE HERE
    #for i in range(uU.size):
    #    uU[i] = compute_pairwise_user_similarity(u_id, i)
    
    #raise NotImplementedError()
    
    return uU

In [13]:
def create_sim_list(u_id):
    uU = compute_user_similarities(u_id)
    uU_copy = uU.copy() ## so that we can modify it, but also keep the original
    
    # YOUR CODE HERE
    uU_copy = np.delete(uU_copy, u_id)

    # Sort array    
    sort_order = np.argsort(uU_copy)[::-1]
    uU_copy = uU_copy[sort_order]

    users_sorted = list(df.customer.unique())
    users_sorted = np.delete(users_sorted, u_id)
    users_sorted = users_sorted[sort_order]
    
    return users_sorted

In [14]:

def get_prediction(uU, users_sorted, i_id, n_neighbor):
    nh_user = []
    nh_sim = []
    
    i = 0
    while len(nh_sim) < n_neighbor:
        user_i = users_sorted[i]
        #print(nh_sim)
        #print(user_i)
    	#uU[user_i])
        if (user_i, i_id) in R_dok:
            nh_sim.append(uU[i])
            nh_user.append(user_i)
        i += 1
        #print(i)
        if i == len(uU):
            break

    prediction = sum(nh_sim)
    
    return prediction

In [15]:
def get_recommendations(u_id, n_neighbor, n_item_recommend):
    # get user similarities
    uU = compute_user_similarities(u_id)
    uU_copy = uU.copy() ## so that we can modify it, but also keep the original

    # Sort arrays by highest sim
    sort_order = np.argsort(uU_copy)[::-1]
    uU_copy = uU_copy[sort_order]
    
    # delete the user from the sim list
    uU_copy = np.delete(uU_copy, 0)

    users_sorted = np.array(df.customer.unique())
    users_sorted = users_sorted[sort_order]
    users_sorted = np.delete(users_sorted, 0)
    
    item_arr = np.array(df.item.unique())
    pred_arr = np.zeros(len(item_arr))
    k = n_neighbor
    i = 0
    for i_id in item_arr:
        #print(u_id, i_id)
        prediction = get_prediction(uU_copy, users_sorted, i_id, n_neighbor)
        pred_arr[i] = prediction
        i += 1
    
    item_sort_order = np.argsort(pred_arr)[::-1]
    item_arr = item_arr[item_sort_order][0:n_item_recommend]
    
    return item_arr

In [34]:
start = time.perf_counter()

rec_dict = {}

customer_list = df.customer.unique()[500:700]

for u_id in customer_list:
    recs = get_recommendations(u_id, n_neighbor = 5, n_item_recommend = 12)
    rec_dict[u_id] = recs

end = time.perf_counter()
end - start    
    

3191.5678844

In [23]:
#recs = get_recommendations(1, n_neighbor = 5, n_item_recommend = 12)

In [24]:
rec_dict

{0: array([1174, 2412, 4290, 4339, 3694, 3841, 5199, 4296, 5955, 3202, 5971,
        4916], dtype=int64),
 1: array([ 881, 1290, 1366, 1797, 4879, 1371, 1300,  413, 1716, 4189,  766,
         839], dtype=int64),
 2: array([2045, 2047,  308,  148, 3545, 2053, 1047,  967, 1146, 3242, 1291,
        2046], dtype=int64),
 3: array([ 398, 4706, 2804, 1047, 1211,  144, 2445, 5667, 2961, 5785, 2045,
        1003], dtype=int64),
 4: array([ 881, 1290, 1448, 2997, 4189,  413,  766,  839, 1716, 1300, 2182,
        2186], dtype=int64),
 5: array([ 881, 1290, 4919, 2997, 2548, 3482, 2107, 4106, 4641, 1938,  331,
        4097], dtype=int64),
 6: array([ 176, 2515, 2735, 3385, 5462, 3892, 1163, 3612, 4996, 4999,  767,
        4814], dtype=int64),
 7: array([1508, 5604, 4351, 1098, 2250, 4974, 3483, 5665,  454, 5109, 3499,
        2293], dtype=int64),
 8: array([1047, 2045, 1705, 5345, 1532,  148, 4467,  967, 5578, 1922, 4443,
        5604], dtype=int64),
 9: array([1461,  785, 3007, 1047, 1705, 5345,

In [None]:
# Save UU recommendations
import pickle  
filehandler = open("uu_dict.pkl", 'wb') 
pickle.dump(rec_dict, filehandler)
filehandler.close()

In [110]:
# Open UU recommendations
filehandler = open("uu_dict.pkl", "rb")
output = pickle.load(filehandler)
filehandler.close()


dict

# Evaluate Recommendations

### Note: data needed to be evaluated in pieces due to computation

In [29]:
def calc_cust_precision(u_id):
    ground_truth = cust_orders[u_id]
    hits_list = (np.isin(rec_dict[u_id], ground_truth))*1
    ap = 0 ## average precision
    hits = np.sum(hits_list)
    relevant_n = len(ground_truth)

    if (hits != 0):
        for rank in range(1,len(hits_list)+1):
            # update ap on hit items
            hit = hits_list[rank-1]
            if hit:
                rank_hit_list = hits_list[0:rank]
                prec_i = np.mean(rank_hit_list)
                ap += prec_i  
        ap /= (min(relevant_n, 12))
    
    return ap

In [30]:
test_df = pd.concat([test['customer_id'].map(customerIds_to_customerIdsIDX), test['article_id'].map(itemId_to_itemIDX)], axis=1)
test_df.columns = ['customer', 'item']
test_df.sort_values(by = 'customer', inplace = True)

cust_orders = test_df.groupby("customer")["item"].unique()

display(test_df.loc[test_df.customer == 1])

Unnamed: 0,customer,item
31723329,1.0,1795.0
31723330,1.0,593.0


In [35]:
ap_u = 0.

for u_id in customer_list:
    ap_u += calc_cust_precision(u_id)
mean_ap = ap_u / len(customer_list)       

In [38]:
mean_ap

0.05023826058201057

In [37]:
sum_ap_500_700 = mean_ap*len(customer_list)
#total

filehandler = open("sum_ap_500_700", 'wb') 
pickle.dump(sum_ap_500_700, filehandler)
filehandler.close()

In [44]:
#filehandler = open(str(x), "rb")
#sum_ap_0_test = pickle.load(filehandler)
#filehandler.close()
#print(sum_ap_0_test)

1


In [29]:
# get all of the saved files
sum_list = []

file_list = ['sum_ap_100', 'sum_ap_200', 'sum_ap_500_700', 'sum_ap_700_1000', 'sum_ap_1000_1500', 'sum_ap_9000_11743']
for file in file_list:
    
    filehandler = open(file + '.pkl', "rb")
    sum_i = pickle.load(filehandler)
    filehandler.close()
    sum_list.append(sum_i)


In [31]:
# Check if each metric seems reasonable
div_list = [100,300,200,300,500,11743-9000]

res = [i / j for i, j in zip(sum_list, div_list)]
print(sum_list)
print(div_list)
print(res)

[6.056878306878306, 15.61100609267276, 10.047652116402114, 17.820128066378068, 20.8424062049062, 139.6703869047619]
[100, 300, 200, 300, 500, 2743]
[0.060568783068783064, 0.05203668697557587, 0.05023826058201057, 0.05940042688792689, 0.0416848124098124, 0.050918843202610975]


In [32]:
np.sum(sum_list)/np.sum(div_list)

0.05069960359449659