In [1]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

# Never truncate cell text when a frame is displayed (the prediction strings are long).
# The fully qualified option name is used because abbreviated names are resolved by
# pattern matching and can become ambiguous when pandas adds similarly named options.
pd.set_option("display.max_colwidth", None)

## load data

In [2]:
%%time
# Load the purchase history, keeping only the three columns this notebook uses.
# article_id is read as text rather than being parsed as a number.
transactions = pd.read_csv(
    'data/transactions_train.csv',
    dtype={'article_id': str},
    usecols=['t_dat', 'customer_id', 'article_id'],
)
transactions.shape

CPU times: user 15.7 s, sys: 1.58 s, total: 17.3 s
Wall time: 18 s


(31788324, 3)

In [3]:
# Peek at the first five rows to check the loaded columns.
transactions.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,663713001
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,541518023
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,505221004
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687003
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687004


In [4]:
%%time
# The sample submission supplies the customer_id values that predictions are produced for.
submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')
submission.shape

CPU times: user 1.44 s, sys: 105 ms, total: 1.54 s
Wall time: 1.54 s


(1371980, 2)

## preprocess data

In [5]:
# Parse the purchase dates so they can be compared against Timestamp cut-offs below.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

In [6]:
# Latest purchase date in the data; the window cut-offs are chosen relative to it.
transactions['t_dat'].max()

Timestamp('2020-09-22 00:00:00')

In [7]:
# Nested trailing windows of the purchase history. Each filter only has a lower bound,
# so every window runs up to the latest date in the data and contains the shorter ones.
# NOTE(review): the first three cut-offs step back 7 days each, but the last one jumps
# 19 days to 2020-08-13, so `df_4w` spans roughly six weeks rather than four —
# confirm this is intended.
df_1w = transactions.loc[transactions['t_dat'] >= pd.Timestamp('2020-09-15')].copy()
df_2w = transactions.loc[transactions['t_dat'] >= pd.Timestamp('2020-09-08')].copy()
df_3w = transactions.loc[transactions['t_dat'] >= pd.Timestamp('2020-09-01')].copy()
df_4w = transactions.loc[transactions['t_dat'] >= pd.Timestamp('2020-08-13')].copy()

In [8]:
# customer_id -> {article_id: number of rows in df_1w for that customer/article pair}.
# Keys are inserted in row order, which later decides ties when articles are ranked.
purchase_dict_1w = {}

for customer, article in tqdm(zip(df_1w['customer_id'], df_1w['article_id'])):
    article_counts = purchase_dict_1w.setdefault(customer, {})
    article_counts[article] = article_counts.get(article, 0) + 1

266364it [00:00, 1806479.12it/s]


In [9]:
# customer_id -> {article_id: number of rows in df_2w for that customer/article pair}.
# Keys are inserted in row order, which later decides ties when articles are ranked.
purchase_dict_2w = {}

for customer, article in tqdm(zip(df_2w['customer_id'], df_2w['article_id'])):
    article_counts = purchase_dict_2w.setdefault(customer, {})
    article_counts[article] = article_counts.get(article, 0) + 1

531967it [00:00, 1775886.83it/s]


In [10]:
# customer_id -> {article_id: number of rows in df_3w for that customer/article pair}.
# Keys are inserted in row order, which later decides ties when articles are ranked.
purchase_dict_3w = {}

for customer, article in tqdm(zip(df_3w['customer_id'], df_3w['article_id'])):
    article_counts = purchase_dict_3w.setdefault(customer, {})
    article_counts[article] = article_counts.get(article, 0) + 1

798269it [00:00, 1891861.74it/s]


In [11]:
# customer_id -> {article_id: number of rows in df_4w for that customer/article pair}.
# Keys are inserted in row order, which later decides ties when articles are ranked.
purchase_dict_4w = {}

for customer, article in tqdm(zip(df_4w['customer_id'], df_4w['article_id'])):
    article_counts = purchase_dict_4w.setdefault(customer, {})
    article_counts[article] = article_counts.get(article, 0) + 1

1494173it [00:00, 1937276.48it/s]


In [27]:
# The twelve most frequent article_ids of each window, most frequent first
# (value_counts sorts in descending order). They pad a customer's own list
# when it holds fewer than twelve articles.
dummy_list_1w = df_1w['article_id'].value_counts().index[:12].tolist()
dummy_list_2w = df_2w['article_id'].value_counts().index[:12].tolist()
dummy_list_3w = df_3w['article_id'].value_counts().index[:12].tolist()
dummy_list_4w = df_4w['article_id'].value_counts().index[:12].tolist()

In [28]:
# Take an independent copy of the customer_id column: a bare column selection stays
# linked to `submission`, and adding a column to it later makes pandas emit
# SettingWithCopyWarning.
not_so_fancy_but_fast_benchmark = submission[['customer_id']].copy()
prediction_list = []

# Fallback prediction for customers that appear in none of the purchase windows:
# the twelve best-selling articles of 2020-09-08 .. 2020-09-14 as one
# space-separated string.
# NOTE(review): this is the week *before* the most recent one (df_1w starts at
# 2020-09-15) — confirm that this is the intended fallback period.
in_fallback_week = (
    (transactions.t_dat >= pd.to_datetime('2020-09-08'))
    & (transactions.t_dat < pd.to_datetime('2020-09-15'))
)
dummy_pred = ' '.join(
    transactions.loc[in_fallback_week, 'article_id'].value_counts().index[:12]
)

In [29]:
# Start from an empty list so that re-running this cell never appends a second
# round of predictions (which would no longer match the number of customers).
prediction_list = []

# Purchase windows from most recent to oldest, each paired with its popular-article
# filler list. A customer is served from the first window they appear in.
fallback_tiers = [
    (purchase_dict_1w, dummy_list_1w),
    (purchase_dict_2w, dummy_list_2w),
    (purchase_dict_3w, dummy_list_3w),
    (purchase_dict_4w, dummy_list_4w),
]

for cust_id in tqdm(submission['customer_id'].values):
    for purchase_dict, dummy_list in fallback_tiers:
        article_counts = purchase_dict.get(cust_id)
        if article_counts is None:
            continue
        # Most-bought articles first; the sort is stable, so equal counts keep the
        # order in which the articles were inserted into the dict.
        ranked = sorted(article_counts, key=article_counts.get, reverse=True)[:12]
        # Pad up to twelve with popular articles, skipping any the customer already
        # has so the prediction never lists the same article twice.
        already_listed = set(ranked)
        fillers = [art_id for art_id in dummy_list if art_id not in already_listed]
        prediction = ' '.join(ranked + fillers[:12 - len(ranked)])
        break
    else:
        # Customer is in none of the windows: use the global fallback string.
        prediction = dummy_pred
    prediction_list.append(prediction)

100%|████████████████████████████| 1371980/1371980 [00:01<00:00, 1310615.52it/s]


In [32]:
# `assign` returns a new frame with the prediction column added, so nothing is written
# into an object that may still be linked to `submission` (the situation pandas
# reports as SettingWithCopyWarning).
not_so_fancy_but_fast_benchmark = not_so_fancy_but_fast_benchmark.assign(
    prediction=prediction_list
)
print(not_so_fancy_but_fast_benchmark.shape)
not_so_fancy_but_fast_benchmark.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601043 0751471001 0909370001 0918522001 0924243001 0918292001 0915526001 0448509014 0915529003 0751471043 0706016001 0865799006
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0909370001 0865799006 0918522001 0448509014 0751471001 0924243001 0918292001 0762846027 0863646001 0809238001 0715624001 0673677002
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0794321007 0924243001 0924243002 0923758001 0918522001 0909370001 0866731001 0751471001 0915529003 0915529005 0448509014 0762846027
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0909370001 0865799006 0918522001 0448509014 0751471001 0924243001 0918292001 0762846027 0863646001 0809238001 0715624001 0673677002
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0909370001 0865799006 0918522001 0448509014 0751471001 0924243001 0918292001 0762846027 0863646001 0809238001 0715624001 0673677002


In [52]:
# Make sure the output folder exists first: to_csv does not create missing directories.
submission_path = Path('result/not_so_fancy_but_fast_benchmark.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: write only the frame's columns, not the row index.
not_so_fancy_but_fast_benchmark.to_csv(submission_path, index=False)