In [1]:
import pickle
import itertools
import numpy as np
import pandas as pd
import fasttext

from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from tqdm import tqdm

pd.set_option("max_colwidth", None)
pd.set_option('display.max_rows', 200)

In [2]:
df = pd.read_csv('data/transactions_train.csv')
customers = pd.read_csv('data/customers.csv')
articles = pd.read_csv('data/articles.csv')

In [3]:
articles = articles.drop(['detail_desc'],axis=1)

In [4]:
tqdm.pandas()
d_df = df.merge(articles, left_on='article_id', right_on='article_id').progress_apply(lambda x: x)

100%|███████████████████████████████████████████| 28/28 [00:29<00:00,  1.05s/it]


In [5]:
d_df = d_df.drop(['t_dat', 'article_id', 'price', 'sales_channel_id',
       'product_code', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name'],axis=1)

In [6]:
add_prod_name_by_user = []

In [8]:
d_df['prod_name'] = d_df['prod_name'].str.replace(' ', '')

In [10]:
tqdm.pandas()
fasttext_train_text = d_df.groupby(['customer_id'])['prod_name'].progress_apply(' '.join).reset_index()

100%|██████████████████████████████| 1362281/1362281 [00:20<00:00, 67295.57it/s]


In [11]:
fasttext_train_text[:5]

Unnamed: 0,customer_id,prod_name
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,SerpenteHWslimtrouser MarietteBlazer MarietteBlazer FLORAparka BBChrispuffjktTP MrHarringtonw/hood TheFirm(1) Sophiejumpsuit SkirtMiniStretchEdie Juan Buggblazer Buggblazer SKIGLOVEBASIC Malm Twistteel/stop NottingHill Jentee Siraptee SPEEDPanameratop Arubadenimjkt MarietteBlazer
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,SveaCroppedTank Henrypolo.(1) CORYCORDSKIRT SkirtMini PerrieTrashHWDenimTRS Noeldenimdress Noeldenimdress BasicGiginecklace Sirpamockneck LazerRazerBrief LazerRazerBrief LazerRazerBrief LolaLace-Up LolaLace-Up PortofinoISW28 Janetsweater Janetsweater Dido Thelmapoloneck Bristoljerseyblz LazerRazerAdj.pushtriangle LazerRazerAdj.pushtriangle LazerRazerAdj.pushtriangle Majkenloafer Fionabrazilian(Acacia)4p Bobbycutoutboot TimelessPaddedBra TimelessPaddedBra TimelessPaddedBra TimelessMidriseBrief TimelessMidriseBrief RosemaryCoat PerrieTrashHWDenimTRS KellyPush(Melbourne)ctn2p Tildatank Bellora(1) GirlfriendR.WTrash TimelessPushTriangle TimelessTieTanga GirlfriendR.WTrash Christinamid Montysandalwindow Pamela Pamela Claudine ClassJuditnecklace WRENSINGOALLATANK CROWRIBSTRAPTOP Judylacetop Drogbatenceltunic Pipertop RaileyTop PQ/PUFemininCrossbag Grandatank Beastalloverlacesoftbra EmbraceS.SkinnyH.WTrash EmbraceS.SkinnyH.WTrash Fleur Mossdress MulanBelted Chestnutstraptop Kiwisweatshirt TimelessMidriseBrief TimelessMidriseBrief NewGirlPushTop NewGirlPushTop DAVIDbasicset2PCS PushitPushBra. BorisFancyCap TimelessCheekyBrief TimelessCheekyBrief HULDAtie HULDAtie EpicPaddedSwimsuit TimelessMidriseBrief TimelessMidriseBrief TimelessMidriseBrief TimelessTriangleTop SuperwomanSuperpush SuperwomanSuperpush SuperwomanSuperpush SuperwomanSuperpush KnotBitterTop KnotBitterTop KnotBitterDetBrief KnotBitterDetBrief
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,AtlantaPushBodyHarlow AtlantaPushBodyHarlow RaePush(Melbourne)2p PlutoOTS LazerRazerBrief LazerRazerBrief LazerRazerAdj.pushtriangle R-NeckPisaTVP R-NeckPisaTVP SPEEDELLABELTEDBLOUSE Floridaskirt PEZSWEATER SnowchinoTVPRW SnowchinoTVPRW PEZSWEATER Seahorselinen Seahorselinen SULIMAjkt
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,PANORAMAsportsbra Panoramamidsupportbra
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,Hedwigessential JacketSlim JacketSlim Molly BabyLockMeUpPushTriangle ShakeitinTieBrief Valentinapaddedbody1 ElenapaddedLTbody Lashsoftbody HAVANAHWtights Battleshipdress Speedyconscioustee Amelie


In [12]:
with open('data/unsupervised_fasttext_train.txt', 'w', encoding="utf-8") as f:
    for line in tqdm(fasttext_train_text.prod_name.values):
        f.write(line)
        f.write('\n')

100%|████████████████████████████| 1362281/1362281 [00:00<00:00, 1475462.28it/s]


In [13]:
unsupervised_fliename = 'unsupervised_fasttext_train.txt'

In [14]:
model = fasttext.train_unsupervised(unsupervised_fliename,
                                    dim=10)

Read 33M words
Number of words:  40986
Number of labels: 0
Progress: 100.0% words/sec/thread:   84242 lr:  0.000000 avg.loss:  0.818649 ETA:   0h 0m 0s  7.6% words/sec/thread:   84928 lr:  0.046213 avg.loss:  1.703918 ETA:   0h 4m17s 16.8% words/sec/thread:   84712 lr:  0.041610 avg.loss:  1.660767 ETA:   0h 3m52s 36.5% words/sec/thread:   84867 lr:  0.031726 avg.loss:  1.469090 ETA:   0h 2m57s 39.1% words/sec/thread:   84854 lr:  0.030426 avg.loss:  1.455538 ETA:   0h 2m49s 41.4% words/sec/thread:   84864 lr:  0.029323 avg.loss:  1.445768 ETA:   0h 2m43s 42.9% words/sec/thread:   84875 lr:  0.028539 avg.loss:  1.439208 ETA:   0h 2m39s 52.9% words/sec/thread:   84923 lr:  0.023563 avg.loss:  1.240107 ETA:   0h 2m11s 59.1% words/sec/thread:   84705 lr:  0.020465 avg.loss:  1.145505 ETA:   0h 1m54s 79.4% words/sec/thread:   84546 lr:  0.010281 avg.loss:  0.941481 ETA:   0h 0m57s 96.7% words/sec/thread:   84341 lr:  0.001636 avg.loss:  0.834397 ETA:   0h 0m 9s


In [15]:
model.save_model('models/ftext_bk_unsupervised_10.bin')

---