In [1]:
# cuDF is the GPU DataFrame library used throughout this notebook; print its
# version so the run can be reproduced against the same RAPIDS release.
import cudf

print('RAPIDS version', cudf.__version__)

RAPIDS version 21.10.01


In [2]:
# Load the raw transaction log onto the GPU.
train = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
)

# Shrink the customer key: parse the last 16 characters of customer_id as
# hexadecimal and store the result as int64 instead of keeping the string.
train['customer_id'] = (
    train['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)

# Compact dtypes for the remaining columns.
train['article_id'] = train['article_id'].astype('int32')
train['t_dat'] = cudf.to_datetime(train['t_dat'])

# Keep only the three columns used downstream and cache them as parquet so a
# later cell can reload the full history without re-parsing the CSV.
train = train[['t_dat', 'customer_id', 'article_id']]
train.to_parquet('train.pqt', index=False)

print(train.shape)
train.head()

(31788324, 3)


Unnamed: 0,t_dat,customer_id,article_id
0,2018-09-20,-6846340800584936,663713001
1,2018-09-20,-6846340800584936,541518023
2,2018-09-20,-8334631767138808638,505221004
3,2018-09-20,-8334631767138808638,685687003
4,2018-09-20,-8334631767138808638,685687004


# Find Each Customer's Last Week of Purchases


In [3]:
# Most recent purchase date of every customer.
tmp = train.groupby('customer_id')['t_dat'].max().reset_index()
tmp = tmp.rename(columns={'t_dat': 'max_dat'})

# Attach that date to each transaction and measure how many days before the
# customer's last purchase the transaction happened.
train = train.merge(tmp, how='left', on='customer_id')
train['diff_dat'] = (train['max_dat'] - train['t_dat']).dt.days

# Keep transactions from the 7-day window (0..6 days back) that ends on the
# customer's own last purchase date.
train = train.loc[train['diff_dat'] <= 6]
print('Train shape:', train.shape)

Train shape: (5181535, 5)


In [4]:
# Columns that identify one (customer, article) pair.
pair_keys = ['customer_id', 'article_id']

# ct = number of transaction rows for the pair inside the customer's last week.
tmp = train.groupby(pair_keys)['t_dat'].count().reset_index()
tmp.columns = pair_keys + ['ct']
train = train.merge(tmp, on=pair_keys, how='left')

# Order by count, then date (both descending), and keep one row per pair.
# With the default keep='first' the surviving row is the pair's most recent one.
# The frame is sorted again after de-duplication, presumably because the GPU
# drop_duplicates is not guaranteed to preserve row order -- TODO confirm.
train = (
    train
    .sort_values(['ct', 't_dat'], ascending=False)
    .drop_duplicates(pair_keys)
    .sort_values(['ct', 't_dat'], ascending=False)
)
train.head()

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
1132992,2019-07-16,2729025827381139556,719348003,2019-07-16,0,100
63299,2018-10-04,4485518665254175540,557247001,2018-10-04,0,86
2148934,2020-03-06,-906958334866810496,852521001,2020-03-06,0,81
3392992,2020-07-06,3601599666106972342,685813001,2020-07-06,0,80
853856,2019-05-14,-4601407992705575197,695545001,2019-05-14,0,80


In [5]:
# USE PANDAS TO MAP COLUMN WITH DICTIONARY
import numpy as np
import pandas as pd

# Move the (already ordered) frame from GPU to host memory.
train = train.to_pandas()

# The .npy file stores a single pickled Python object; .item() unwraps it from
# the 0-d array np.load returns.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading, so
# this file must come from a trusted source.
pairs = np.load(
    '../input/hmitempairs/pairs_cudf.npy',
    allow_pickle=True,
).item()

# Look up the paired article for every purchased article; articles that are
# not a key of `pairs` get NaN.
train['article_id2'] = train['article_id'].map(pairs)

In [6]:
# RECOMMENDATION OF PAIRED ITEMS
# Build a second candidate list per customer from the paired articles:
# drop purchases that have no pair, keep each (customer, paired article) once,
# and expose the paired article under the common 'article_id' column name.
train2 = (
    train[['customer_id', 'article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(['customer_id', 'article_id2'])
    .rename(columns={'article_id2': 'article_id'})
)

In [7]:
# CONCATENATE PAIRED ITEM RECOMMENDATION AFTER PREVIOUS PURCHASED RECOMMENDATIONS
# Previously purchased articles come first and paired articles second, so the
# row order encodes recommendation priority. drop_duplicates keeps the first
# occurrence, so a paired article the customer already bought keeps its
# earlier (purchased) position.
train = (
    pd.concat(
        [train[['customer_id', 'article_id']], train2],
        axis=0,
        ignore_index=True,
    )
    # the paired ids arrive as floats (NaN was possible before filtering)
    .astype({'article_id': 'int32'})
    .drop_duplicates(['customer_id', 'article_id'])
)

In [8]:
# CONVERT RECOMMENDATIONS INTO SINGLE STRING
# Each article id becomes the token ' 0<id>' (leading space as separator, '0'
# restoring the leading zero lost when the id was stored as an integer).
# The tokens are built in a separate Series instead of overwriting
# train.article_id: the previous in-place assignment made this cell
# non-idempotent, because re-running it prefixed ' 0' a second time.
article_tokens = ' 0' + train['article_id'].astype('str')

# Summing strings concatenates them; rows keep their order inside each group,
# so the tokens stay in recommendation-priority order. Every prediction
# therefore starts with a space.
preds = cudf.DataFrame(
    article_tokens.groupby(train['customer_id']).sum().reset_index()
)
preds.columns = ['customer_id', 'prediction']
preds.head()

Unnamed: 0,customer_id,prediction
0,-9223352921020755230,0673396002 0812167004 0706016001 0812167002
1,-9223343869995384291,0908292002 0910601003 0865929007 0903926002 0...
2,-9223321797620987725,0580600006 0610776035 0688018003 0610776002
3,-9223319430705797669,0470985003 0504155001 0554477005 0562245001 0...
4,-9223308614576639426,0750423005 0750423001



After recommending previous purchases and items purchased together, we recommend last week's 12 most popular items. Therefore, if the earlier recommendations do not fill all 12 of a customer's recommendation slots, the remaining slots are filled with these popular items.

In [9]:
# Reload the full transaction history from the parquet cache written when the
# CSV was first loaded.
train = cudf.read_parquet('train.pqt')
train['t_dat'] = cudf.to_datetime(train['t_dat'])

# NOTE(review): hard-coded cutoff -- presumably the first day of the final
# week present in the data; confirm if the dataset changes.
train = train.loc[train['t_dat'] >= cudf.to_datetime('2020-09-16')]

# value_counts() sorts by frequency (descending), so the first 12 index
# entries are the most purchased articles of that period.
article_counts = train['article_id'].value_counts().to_pandas()
top_ids = article_counts.index.astype('str')[:12]

# Use the ' 0<id>' token format, including the leading space.
top12 = ' 0' + ' 0'.join(top_ids)
print("Last week's top 12 popular items:")
print(top12)

Last week's top 12 popular items:
 0924243001 0924243002 0918522001 0923758001 0866731001 0909370001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020
