In [1]:
import pandas as pd
import zipfile
import csv
import random
import time
import numpy as np
import pickle

# Data Import

In [2]:
# Open the competition archive read-only; later cells read CSV members out of it via zf.open().
zf = zipfile.ZipFile(file='h-and-m-personalized-fashion-recommendations.zip', mode='r')

In [3]:
# Customers datafile
# Read the archive member inside a context manager so its handle is closed
# once pandas has finished parsing (read_csv does not close handles it is given).
with zf.open('customers.csv') as customers_file:
    customers = pd.read_csv(customers_file)

In [6]:
# Transactions datafile
# Read the archive member inside a context manager so its handle is closed
# once pandas has finished parsing (read_csv does not close handles it is given).
with zf.open('transactions_train.csv') as transactions_file:
    transactions_train = pd.read_csv(transactions_file)

# Subset transactions

### We will only use the most recent transactions, so there is no need to read in the whole file every time

In [7]:
# Parse the transaction date column, then keep only rows dated 2019-03-01 or later.
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train = transactions_train[transactions_train['t_dat'] >= '2019-03-01']

In [58]:
# Cache the date-filtered transactions on disk so the CSV does not have to be re-read.
# The context manager closes the file even if pickle.dump raises.
with open("transactions_train.pkl", 'wb') as filehandler:
    pickle.dump(transactions_train, filehandler)

In [6]:
# Reload the cached transactions from transactions_train.pkl.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
# The context manager closes the file even if unpickling raises.
with open("transactions_train.pkl", "rb") as filehandler:
    transactions_train = pickle.load(filehandler)

In [7]:
# test: every transaction dated 2020-09-16 or later.
test = transactions_train[transactions_train['t_dat'] >= '2020-09-16']
# train: the seven-day window immediately before it, [2020-09-09, 2020-09-16).
train = transactions_train[
    (transactions_train['t_dat'] >= '2020-09-09')
    & (transactions_train['t_dat'] < '2020-09-16')
]

In [8]:
# Number of test-window transaction rows per customer, largest first.
test['customer_id'].value_counts()

3860b5e65d48bb509d89e6b21ec0458e13c75eab2e95f53ea2269fafc01a5567    104
e4ea6ece6706e9c119a3640e09e842f5ae7d62a6b546d4f2448e08def4bd7283     95
54e8ebd39543b5a4d69c3e7d79977558d2a606e6540ba0a50e07001cfff202c2     90
6e38d8f80e5c6d0db8d348a50c3c6ac29b17dfd3ac83d1b6bc33f090f8c03ab1     69
49501893c2f65bf0a0b585e5a1c7022dd5139232d00bc68a57376cb897284102     60
                                                                   ... 
3bcab54ca90cc4c6b8fc46ee961f9f015d6a16250f8cd42c6bc8b6e1c21ee81b      1
3bc81d10babcab13c95474cb171a6f10416fe9a0fcfc5c8f559be616bae37d5c      1
3bbf1754b497073b0976dac95ae32e4b9006d0e0d104e2450573eec2a351c4f0      1
3b9f6e7907566378f605ca45ced5c4d08df27091f223a22794ebe493a71a8f68      1
fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b      1
Name: customer_id, Length: 68984, dtype: int64

In [9]:
# Test split size: (rows, columns), distinct customers, distinct articles - one per line.
print(test.shape, test['customer_id'].nunique(), test['article_id'].nunique(), sep='\n')

(240311, 5)
68984
17986


In [10]:
# Train split size: (rows, columns), distinct customers, distinct articles - one per line.
print(train.shape, train['customer_id'].nunique(), train['article_id'].nunique(), sep='\n')

(255241, 5)
72019
18611


# Subset data

### Only customers and items that appear in both the train and test sets will be used

In [11]:
# Pass 1: keep only train rows whose customer, then whose article, also occurs in test.
# Distinct customer/article counts are printed after each filter.
customer_list = test['customer_id'].unique()
item_list = test['article_id'].unique()

train = train[train['customer_id'].isin(customer_list)]
print(train['customer_id'].nunique(), train['article_id'].nunique(), sep='\n')
train = train[train['article_id'].isin(item_list)]
print(train['customer_id'].nunique(), train['article_id'].nunique(), sep='\n')

# Pass 2: keep only test rows whose customer and article survived in train.
customer_list = train['customer_id'].unique()
item_list = train['article_id'].unique()

test = test[test['customer_id'].isin(customer_list)]
test = test[test['article_id'].isin(item_list)]
# NOTE(review): this is a single pass, not iterated to a fixed point, so train can
# still contain customers/articles that test lost in pass 2 - confirm that is intended.

12670
10043
12534
8794


# Final Counts

In [12]:
# Sizes after filtering: (rows, columns), distinct customers, distinct articles.
# Train first, then test; one value per line.
print(train.shape, train['customer_id'].nunique(), train['article_id'].nunique(), sep='\n')
print(test.shape, test['customer_id'].nunique(), test['article_id'].nunique(), sep='\n')

(49357, 5)
12534
8794
(36160, 5)
11765
6206


In [15]:
# Restrict the cached transaction history to rows whose customer AND article both
# appear in the filtered test set.
# NOTE(review): the name says "6mth", but transactions_train is cut at 2019-03-01
# where it is filtered in this notebook - confirm the intended window.
transactions_6mth = transactions_train[
    transactions_train['customer_id'].isin(test['customer_id'].unique())
    & transactions_train['article_id'].isin(test['article_id'].unique())
]

# Save transactions, list of customers, list of items

In [None]:
# Persist the customer/article-restricted transaction history.
# The context manager closes the file even if pickle.dump raises.
with open("transactions_6mth.pkl", 'wb') as filehandler:
    pickle.dump(transactions_6mth, filehandler)

In [14]:
# Persist the distinct customer ids and article ids of the filtered test set.
# Each file is written inside a context manager so it is closed even if pickle.dump raises.
with open("customer_list.pkl", 'wb') as filehandler:
    pickle.dump(test.customer_id.unique(), filehandler)

with open("article_list.pkl", 'wb') as filehandler:
    pickle.dump(test.article_id.unique(), filehandler)

In [17]:
# Size of the restricted history: (rows, columns), distinct customers, distinct articles.
print(
    transactions_6mth.shape,
    transactions_6mth['customer_id'].nunique(),
    transactions_6mth['article_id'].nunique(),
    sep='\n',
)

(278334, 5)
11765
6206


# Creating a smaller set to test code with. The code below can be ignored.

In [17]:
# Build contiguous integer indices (in sorted-id order) for the customers and
# articles present in train, then re-express train and test as
# (customer, item, rating) rows on those indices.
customerIds = np.sort(train.customer_id.unique())
articleIds = np.sort(train.article_id.unique())

m = customerIds.size   # number of distinct customers in train
n = articleIds.size    # number of distinct articles in train
numTrans = len(train)  # number of train transaction rows

# Forward (id -> index) and reverse (index -> id) lookups.
customerIds_to_customerIdsIDX = {cust_id: idx for idx, cust_id in enumerate(customerIds)}
customerIDX_to_customerId = dict(enumerate(customerIds))

itemId_to_itemIDX = {art_id: idx for idx, art_id in enumerate(articleIds)}
itemIDX_to_itemId = dict(enumerate(articleIds))


def _to_indexed_ratings(frame):
    """Return `frame` as (customer, item, rating) rows on the integer indices, sorted by customer."""
    indexed = pd.concat(
        [
            frame['customer_id'].map(customerIds_to_customerIdsIDX),
            frame['article_id'].map(itemId_to_itemIDX),
            frame['rating'],
        ],
        axis=1,
    )
    indexed.columns = ['customer', 'item', 'rating']
    return indexed.sort_values(by='customer')


# Every observed purchase gets rating 1. `train` and `test` are slices of
# transactions_train, so assigning a column in place would raise
# SettingWithCopyWarning; .assign() builds a new frame instead.
train = train.assign(rating=1)
df_train = _to_indexed_ratings(train)

test = test.assign(rating=1)
df_test = _to_indexed_ratings(test)

display(df_test.head())

Unnamed: 0,customer,item,rating
31755458,0,3134,1
31755460,1,648,1
31755461,1,126,1
31755462,2,1026,1
31755463,2,4459,1


In [21]:
# Create smaller sample for testing code
train_small = df_train.loc[df_train.customer < 1000, :]
train_small.shape

(15524, 3)

In [None]:
# Persist the index-encoded training frame.
# The context manager closes the file even if pickle.dump raises.
with open("train.pkl", 'wb') as filehandler:
    pickle.dump(df_train, filehandler)

In [None]:
# Persist the index-encoded test frame.
# The context manager closes the file even if pickle.dump raises.
with open("test.pkl", 'wb') as filehandler:
    pickle.dump(df_test, filehandler)

In [22]:
# Persist the small development sample of the training frame.
# The context manager closes the file even if pickle.dump raises.
with open("train_small.pkl", 'wb') as filehandler:
    pickle.dump(train_small, filehandler)

In [23]:
# Bare expression: renders the sample frame as this cell's output.
train_small

Unnamed: 0,customer,item,rating
31140485,0,6233,1
29794828,0,4539,1
29794827,0,2173,1
29794829,0,4539,1
30180375,0,6468,1
...,...,...,...
30669401,999,6184,1
26156004,999,817,1
31105332,999,3463,1
26156002,999,3262,1


In [24]:
# Customers datafile
# Read the archive member inside a context manager so its handle is closed
# once pandas has finished parsing (read_csv does not close handles it is given).
with zf.open('customers.csv') as customers_file:
    customers = pd.read_csv(customers_file)

In [25]:
# Keep only the customer records whose id occurs in the train split.
customers = customers[customers['customer_id'].isin(train['customer_id'].unique())]

In [30]:
# Two-column frame: customer integer index (via the id -> index lookup) and age,
# ordered by customer index.
df_customers = (
    pd.concat(
        [customers['customer_id'].map(customerIds_to_customerIdsIDX), customers['age']],
        axis=1,
    )
    .rename(columns={'customer_id': 'customer'})
    .sort_values(by='customer')
)

In [31]:
# Bare expression: renders the customer index/age frame as this cell's output.
df_customers

Unnamed: 0,customer,age
86,0,33.0
349,1,25.0
429,2,30.0
821,3,48.0
1015,4,49.0
...,...,...
1371091,8931,30.0
1371691,8932,32.0
1371721,8933,67.0
1371747,8934,21.0


In [32]:
# Persist the customer index/age frame.
# The context manager closes the file even if pickle.dump raises.
with open("df_customers.pkl", 'wb') as filehandler:
    pickle.dump(df_customers, filehandler)