In [1]:
import re
import fasttext
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer

pd.set_option("max_colwidth", None)

## Articles

In [2]:
%%time
articles = pd.read_csv('data/articles.csv')
print(articles.shape)

(105542, 25)
CPU times: user 325 ms, sys: 39 ms, total: 364 ms
Wall time: 367 ms


In [3]:
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort."


---

In [4]:
%%time
customers = pd.read_csv('data/customers.csv')
print(customers.shape)

(1371980, 7)
CPU times: user 1.91 s, sys: 139 ms, total: 2.04 s
Wall time: 2.05 s


In [5]:
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a91f8ca0d4b6efa8100
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93f4c830291c32bc3057
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6c9090f7dd3e38380dc
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c222539af5973a23ae6d
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd4564743b005a805b1d


In [6]:
%%time
transactions = pd.read_csv('data/transactions_train.csv')
print(transactions.shape)

(31788324, 5)
CPU times: user 20 s, sys: 3.78 s, total: 23.8 s
Wall time: 25.8 s


In [7]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687004,0.016932,2


In [8]:
%%time
submission = pd.read_csv('data/sample_submission.csv')
print(submission.shape)

(1371980, 2)
CPU times: user 1.73 s, sys: 128 ms, total: 1.86 s
Wall time: 1.86 s


In [9]:
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001


---

## Train fasttext

In [10]:
def preprocess_string(s):
    s = s.lower()
    s = ''.join(re.findall('[a-z]', s))
    return s

In [11]:
articles['ppn'] = articles.product_type_name.apply(lambda x: preprocess_string(x))

In [12]:
%%time
embedding_train = transactions[['customer_id', 'article_id']].merge(
    articles[['article_id', 'ppn']], 
    on=['article_id'],
    how='left')[['customer_id', 'ppn']].drop_duplicates().groupby('customer_id')['ppn'].apply(list)

CPU times: user 30.7 s, sys: 4.37 s, total: 35 s
Wall time: 36.2 s


In [13]:
print(embedding_train.tolist()[1])

['dress', 'shirt', 'skirt', 'sweater', 'bra', 'jacket', 'underwearbottom', 'trousers', 'bikinitop', 'swimwearbottom', 'blazer', 'flatshoe', 'necklace', 'tshirt', 'vesttop', 'top', 'boots', 'sandals', 'bag', 'cappeaked', 'cardigan', 'swimsuit', 'garmentset']


In [14]:
file = 'train_cat.txt'
with open(file, 'w') as f:
    for line in embedding_train:
        for i in line:
            f.write("%s " % i)

In [268]:
model = fasttext.train_unsupervised('train.txt', dim=40, verbose=1)

Read 10M words
Number of words:  128
Number of labels: 0
Progress: 100.0% words/sec/thread:  807448 lr:  0.000000 avg.loss:  2.724905 ETA:   0h 0m 0s


### create embedding

In [269]:
%%time
X_train = [model.get_sentence_vector(' '.join(x)) for x in embedding_train]

CPU times: user 12.1 s, sys: 838 ms, total: 13 s
Wall time: 14.8 s


In [270]:
%%time
# find more similar user with his/her history articles
example = embedding_train.tolist()[0]
print(len(example))
print(sorted(example))
print()
for ind, val in enumerate(embedding_train[1:]):
    if len(set(example) & set(val)) > 11 and len(val) < 14:
        print(ind)
        print(sorted(val))
        break

13
['blazer', 'dress', 'gloves', 'hoodie', 'jacket', 'jumpsuitplaysuit', 'shirt', 'skirt', 'sweater', 'top', 'trousers', 'tshirt', 'vesttop']

1337268
['bikinitop', 'blazer', 'dress', 'hoodie', 'jacket', 'jumpsuitplaysuit', 'shirt', 'skirt', 'sweater', 'top', 'trousers', 'tshirt', 'vesttop']
CPU times: user 1.19 s, sys: 7.05 ms, total: 1.2 s
Wall time: 1.21 s


---