In [1]:
import re
import fasttext
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import OrderedDict

pd.set_option("max_colwidth", None)

## load data

In [2]:
path = 'data/'
path_model = 'models/'

In [3]:
%%time
articles = pd.read_csv(path+'articles.csv')
print(f'\nArticles shape: {articles.shape}')

submission = pd.read_csv(path + 'sample_submission.csv')
print(f'\nSubmussion shape: {submission.shape}')

customers = pd.read_csv(path + 'customers.csv')
print(f'\nCustomer shape: {customers.shape}')

transactions = pd.read_csv(path+ 'transactions_train.csv')
print(f'\nTransactions shape: {transactions.shape}')


Articles shape: (105542, 25)

Submussion shape: (1371980, 2)

Customer shape: (1371980, 7)

Transactions shape: (31788324, 5)
CPU times: user 18.6 s, sys: 2.69 s, total: 21.3 s
Wall time: 22.4 s


## load model

In [4]:
%%time
model_name = 'ftext_bk_unsupervised_40.bin'
model = fasttext.load_model(path_model + model_name)

CPU times: user 104 ms, sys: 98.5 ms, total: 202 ms
Wall time: 238 ms




## helper functions

In [5]:
def cosine_similarity(vector1, vector2):
    vector1 = np.array(vector1)
    vector2 = np.array(vector2)
    return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))

## Preprocessing data

In [6]:
#
tqdm.pandas()
d_df = transactions.merge(articles[['article_id', 'prod_name']], left_on='article_id', right_on='article_id').progress_apply(lambda x: x)

100%|█████████████████████████████████████████████| 6/6 [00:05<00:00,  1.03it/s]


In [7]:
%%time
d_df['prod_name'] = d_df['prod_name'].str.replace(' ', '')

CPU times: user 5.53 s, sys: 1.62 s, total: 7.16 s
Wall time: 8.16 s


In [8]:
tqdm.pandas()
fasttext_train_text = d_df.groupby(['customer_id'])['prod_name'].progress_apply(' '.join).reset_index()

100%|██████████████████████████████| 1362281/1362281 [01:44<00:00, 13075.42it/s]


In [9]:
%%time
fasttext_train_text_dct = {}
for ind, val in fasttext_train_text.iterrows():
    fasttext_train_text_dct[val['customer_id']] = val['prod_name']

CPU times: user 19.6 s, sys: 623 ms, total: 20.3 s
Wall time: 20.9 s


In [None]:
%%time
y1_train = d_df.sort_values(by=['customer_id', 't_dat']).groupby(
    'customer_id').tail(1)[['customer_id', 'article_id']]

In [None]:
y1_train.shape[0] == d_df.customer_id.unique().shape[0]

In [None]:
%%time
y1_train_dct = {}
for ind, val in y1_train.iterrows():
    y1_train_dct[val['customer_id']] = val['article_id']

---

In [None]:
%%time
users = fasttext_train_text.customer_id.unique()[:10]

In [64]:
def rec_all_users(users):
    recommend = []  
    for ind, user in enumerate(users):
        if ind % 100_000 == 0:
            print(ind)
        recs = {}
        us= fasttext_train_text_dct[user]
        emb1 = model.get_sentence_vector(us)
        cos = OrderedDict()
        cnt = 0
        for ind, val in fasttext_train_text_dct.items():
            if cnt == 12:
                break
            items2 = val
            emb2 = model.get_sentence_vector(items2)
            cos_sim = cosine_similarity(emb1, emb2)
            if cos_sim > 0.6:
                cnt += 1  
                rec = y1_train_dct[ind]
                cos[rec] = cos_sim
        best = ['0' + str(x) for x, _ in sorted(cos.items(), key=lambda x: x[1], reverse=True)[:12]]
        recs['customer_id'] = user
        recs['prediction'] = ' '.join(best)
        recommend.append(recs)
    return recommend

In [65]:
%%time
# 9.33
# 9.38
similariti = rec_all_users(users)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
CPU times: user 1h 17min 50s, sys: 19.1 s, total: 1h 18min 9s
Wall time: 1h 25min 57s


In [66]:
%%time
similariti_df = pd.DataFrame(similariti)

CPU times: user 459 ms, sys: 950 ms, total: 1.41 s
Wall time: 2.35 s


In [67]:
print(similariti_df.shape[0])
similariti_df.head()

1362281


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601043 0914441005 0794321007 0921226007 0896152002 0826211002 0857690004 0808651003 0719530003 0551080020 0850614001 0760084013
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0826211002 0794321007 0817166007 0719530003 0914441005 0896152002 0714824001 0921226007 0568601043 0808651003 0850614001 0740962001
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0794321007 0896152002 0826211002 0817166007 0914441005 0568601043 0921226007 0857690004 0880553001 0808651003 0719530003 0714824001
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0742079001 0740962001 0817166007 0918292001 0914441005 0808840004 0909924004 0719530003 0714824001 0695324011 0841260012 0820671001
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0896152002 0794321007 0817166007 0826211002 0909924004 0568601043 0914441005 0921226007 0857690004 0808651003 0880553001 0719530003


In [68]:
submission.shape[0] - similariti_df.shape[0]

9699

In [69]:
cold_users = set(submission.customer_id.unique()) ^ set(similariti_df.customer_id.unique())

In [70]:
transactions.t_dat.max()

'2020-09-22'

In [71]:
%%time
blw = transactions[transactions.t_dat >= '2020-06-22'].article_id.value_counts()[:12].index.tolist()

CPU times: user 986 ms, sys: 170 ms, total: 1.16 s
Wall time: 1.89 s


In [72]:
cold_users_lst = []
for user in cold_users:
    tmp = {}
    tmp['customer_id'] = user
    tmp['prediction'] = '0' + ' '.join([str(x) for x in blw])
    cold_users_lst.append(tmp)

In [73]:
cold_users_df = pd.DataFrame(cold_users_lst)

In [74]:
result = pd.concat([similariti_df, cold_users_df], ignore_index=True)

In [75]:
if result.shape[0] == submission.shape[0]:
    result.to_csv('result/metric_similarity.csv', index=False)