In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import os
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm

In [2]:
# https://www.kaggle.com/gpreda/h-m-eda-and-prediction
# For this initial submission, we apply the following simplified logic:

# if there are articles for a certain client, pick the most recent buys;
# if there are no articles for a certain client, just pick the most frequently bought articles.

In [3]:
%%time
# Read the four CSV inputs in one pass (same order as before) and report
# the (rows, columns) shape of each resulting frame.
df_articles, df_train, df_cust, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/articles.csv',
        '../data/transactions_train.csv',
        '../data/customers.csv',
        '../data/sample_submission.csv',
    )
)
print(*(frame.shape for frame in (df_articles, df_train, df_cust, df_sub)))

(105542, 25) (31788324, 5) (1371980, 7) (1371980, 2)
CPU times: user 25.6 s, sys: 3.02 s, total: 28.6 s
Wall time: 31.9 s


In [4]:
# Preview the first five rows of df_articles (loaded from articles.csv).
df_articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,3,Light,9,White,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [5]:
# Show a single catalogue row selected by position (not by article_id);
# in the recorded run this row is article_id 742079001.
df_articles.iloc[64525]

article_id                                                              742079001
product_code                                                               742079
prod_name                                                Panorama mid support bra
product_type_no                                                               306
product_type_name                                                             Bra
product_group_name                                                      Underwear
graphical_appearance_no                                                   1010016
graphical_appearance_name                                                   Solid
colour_group_code                                                               9
colour_group_name                                                           Black
perceived_colour_value_id                                                       4
perceived_colour_value_name                                                  Dark
perceived_colour

In [6]:
# Preview the first five rows of df_train (loaded from transactions_train.csv).
df_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [7]:
%%time
# Order the transactions by customer_id and then t_dat, both descending, so
# that within each customer the newest purchase comes first. Later cells rely
# on this order when they take the first 12 article ids per customer.
sort_keys = ['customer_id', 't_dat']
df_train = df_train.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))

CPU times: user 15.2 s, sys: 1.33 s, total: 16.5 s
Wall time: 16.5 s


In [8]:
# Preview df_train after it was re-sorted by customer_id and t_dat (both descending).
df_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
19867243,2019-12-04,ffffd9ac14e89946416d80e791d064701994755c3ab686...,806050001,0.084729,2
27806865,2020-06-22,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,882810001,0.016932,1
25077914,2020-04-25,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,866755002,0.050831,2
24375394,2020-04-09,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,866755002,0.043203,2
24375395,2020-04-09,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,840360003,0.013542,2


In [9]:
#most recently bought articles
# Find the latest transaction date in the training data and report how many
# transaction rows fall on that date.
last_date = df_train['t_dat'].max()
print(last_date)
on_last_date = df_train['t_dat'] == last_date
print(df_train[on_last_date].shape)

2020-09-22
(32866, 5)


In [10]:
# Fallback recommendations: the 12 articles with the most transaction rows on
# the last date present in df_train.
last_day_article_ids = df_train.loc[df_train.t_dat == last_date, 'article_id']
# .head(12) is explicitly positional; value_counts() is indexed by the integer
# article ids, so a bare [0:12] slice is easy to misread as a label slice.
most_frequent_articles = list(last_day_article_ids.value_counts().head(12).index)

# article_id was parsed as an integer, which drops the leading zero.
# zfill(10) restores it for the 9-digit ids seen here (same result as the
# previous "0" + str(art)) and stays correct for ids with fewer digits.
art_list = [str(art).zfill(10) for art in most_frequent_articles]
art_str = " ".join(art_list)
print("Frequent articles bought recently: ", art_str)

Frequent articles bought recently:  0924243002 0751471001 0448509014 0918522001 0866731001 0714790020 0788575004 0915529005 0573085028 0918292001 0850917001 0928206001


In [11]:
%%time
# One row per customer holding their first 12 article ids (in df_train's
# current row order) as a single space-separated string.
# The string is built with " ".join instead of slicing str(ndarray): the array
# repr depends on numpy's print options and inserts line breaks once the text
# exceeds the configured line width, which 12 nine-digit ids do.
agg_df = (
    df_train.groupby(["customer_id"])["article_id"]
    .agg(lambda ids: " ".join(map(str, ids.values[:12])))
    .reset_index()
)

CPU times: user 1min 18s, sys: 510 ms, total: 1min 19s
Wall time: 1min 19s


In [12]:
# Preview the per-customer article-id strings built in the previous cell.
agg_df.head()

Unnamed: 0,customer_id,article_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043 841260003 887593002 890498002 795440...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,826211002 599580055 599580055 811835004 811835...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007 858883002 851400006 750424014 750424...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,742079001 732413001
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,896152002 730683050 927530004 791587015 589440...


In [13]:
# Flag customers whose article list holds fewer than 12 ids: 1 = short list, 0 = otherwise.
agg_df['<12'] = [
    int(len(id_string.split()) < 12)
    for id_string in agg_df['article_id']
]

In [14]:
# Count customers with fewer than 12 article ids (flag 1) versus the rest (flag 0).
agg_df['<12'].value_counts()

1    750836
0    611445
Name: <12, dtype: int64

In [15]:
def padding_articles(x, fallback=None, size=12):
    """Zero-pad a customer's article ids and top the list up to ``size`` entries.

    Parameters
    ----------
    x : str
        Whitespace-separated article ids, e.g. ``"742079001 732413001"``.
    fallback : sequence of str, optional
        Already formatted article ids used to fill the remaining slots, taken
        from the front. Defaults to the notebook-level ``art_list``.
    size : int, optional
        Target number of ids in the result (default 12).

    Returns
    -------
    str or None
        Space-separated ids, each left-padded with zeros to 10 characters.
        ``None`` is returned for an empty ``x``, exactly as before; it is now
        an explicit return rather than a fall-through.
    """
    if not x:
        return None

    # zfill(10) gives the same result as the former "0" + id for 9-digit ids,
    # but leaves already padded ids alone, so applying the function twice no
    # longer produces ids with a doubled prefix.
    padded = [token.zfill(10) for token in x.split()]

    missing = size - len(padded)
    if missing > 0:
        if fallback is None:
            # Resolved lazily so full lists never touch the global.
            fallback = art_list
        padded.extend(fallback[:missing])
    return " ".join(padded)

In [16]:
# Replace every customer's raw id string with its padded form, element by
# element. This overwrites the column in place.
agg_df["article_id"] = agg_df["article_id"].map(padding_articles)

In [17]:
# Preview agg_df after padding_articles was applied to the article_id column.
agg_df.head()

Unnamed: 0,customer_id,article_id,<12
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0841260003 0887593002 0890498002 07...,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0599580055 0599580055 0811835004 08...,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0858883002 0851400006 0750424014 07...,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001 0924243002 0751471001 04...,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0927530004 0791587015 05...,0


In [18]:
# submission_df = agg_df.merge(df_sub[["customer_id"]], how="right")
# submission_df.columns = ["customer_id", "prediction"]
# print(submission_df.shape)
# submission_df.head()

In [19]:
# print("Rows with missing data in submission: ", submission_df.loc[submission_df.prediction.isna()].shape[0])

In [20]:
# submission_df.loc[submission_df.prediction.isna(), ["prediction"]] = art_str

In [21]:
# print("Rows with missing data in submission: ", submission_df.loc[submission_df.prediction.isna()].shape[0])

In [22]:
# submission_df.to_csv("../submissions/sub1.csv", index=False)

In [23]:
# !kaggle competitions submit -c 'h-and-m-personalized-fashion-recommendations' -f '../submissions/sub1.csv' -m 'baseline'

In [24]:
# !kaggle competitions submissions -c h-and-m-personalized-fashion-recommendations

In [25]:
%%time
# Rebuild the per-customer strings from df_train (first 12 article ids per
# customer, space separated); this replaces the agg_df whose article_id column
# was overwritten by the padding step.
# The string is built with " ".join instead of slicing str(ndarray): the array
# repr depends on numpy's print options and inserts line breaks once the text
# exceeds the configured line width, which 12 nine-digit ids do.
agg_df = (
    df_train.groupby(["customer_id"])["article_id"]
    .agg(lambda ids: " ".join(map(str, ids.values[:12])))
    .reset_index()
)
# 1 when the customer has fewer than 12 article ids, else 0.
agg_df['<12'] = agg_df.article_id.map(lambda x: 1 if len(x.split()) < 12 else 0)

CPU times: user 1min 20s, sys: 519 ms, total: 1min 21s
Wall time: 1min 21s


In [26]:
# Preview the rebuilt agg_df (unpadded ids plus the '<12' flag).
agg_df.head()

Unnamed: 0,customer_id,article_id,<12
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043 841260003 887593002 890498002 795440...,0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,826211002 599580055 599580055 811835004 811835...,0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007 858883002 851400006 750424014 750424...,0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,742079001 732413001,1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,896152002 730683050 927530004 791587015 589440...,0


In [27]:
# Count customers with fewer than 12 article ids (flag 1) versus the rest (flag 0).
agg_df['<12'].value_counts()

1    750836
0    611445
Name: <12, dtype: int64

In [28]:
# Number of article ids per customer when that number is below 12; lists that
# are not short are recorded as 0.
ids_per_customer = (len(id_string.split()) for id_string in agg_df['article_id'])
agg_df['<12_l'] = [count if count < 12 else 0 for count in ids_per_customer]

In [29]:
# Distribution of list lengths for short lists; the 0 bucket holds every
# customer whose list was not shorter than 12.
agg_df['<12_l'].value_counts()

0     611445
1     131514
2     127441
3      95686
4      82082
5      64635
6      56820
7      47385
8      43047
9      37627
10     34262
11     30337
Name: <12_l, dtype: int64

In [30]:
# Show the catalogue columns again; the cells that follow look up article
# attributes in df_articles.
df_articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,3,Light,9,White,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [31]:
# Keep only the customers with a short list (non-zero '<12_l').
agg_df_l12_1 = agg_df[agg_df['<12_l'].ne(0)]

In [32]:
# (rows, columns) of the short-list subset.
agg_df_l12_1.shape

(750836, 4)

In [33]:
# Narrow the short-list customers down to those with exactly two article ids.
agg_df_l12_1_2 = agg_df_l12_1[agg_df_l12_1['<12_l'].eq(2)]

In [34]:
# (rows, columns) of the subset of customers with exactly two article ids.
agg_df_l12_1_2.shape

(127441, 4)

In [35]:
# Preview the short-list customers (fewer than 12 article ids).
agg_df_l12_1.head()

Unnamed: 0,customer_id,article_id,<12,<12_l
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,742079001 732413001,1,2
5,000064249685c11552da43ef22a5030f35a147f723d5b0...,738133005 680265002 740962001,1,3
6,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,719530003 448509014 715624008 783388001 735843...,1,6
8,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,819423001 850614001,1,2
9,00008469a21b50b3d147c97135e25b4201a8c58997f787...,673677001 551080020 648414023 673677004,1,4


In [36]:
# Preview the customers with exactly two article ids.
agg_df_l12_1_2.head()

Unnamed: 0,customer_id,article_id,<12,<12_l
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,742079001 732413001,1,2
8,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,819423001 850614001,1,2
14,0000ae1bbb25e04bdc7e35f718e852adfb3fbb72ef38b3...,573085038 573085033,1,2
16,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,745477001 651300005,1,2
33,000197360fe727d2cc0887073db9062bf37a8949a95c4a...,815026002 815026002,1,2


In [37]:
# df_articles['article_id'] = df_articles['article_id'].astype(str)

In [38]:
# Inspect the catalogue rows behind the first short-list customer.
# Only the first row was ever processed, so it is taken directly with iloc[0]
# instead of walking every row with iterrows() and skipping all but index 0.
if not agg_df_l12_1.empty:
    row = agg_df_l12_1.iloc[0]
    cust_id = row['customer_id']
    # The ids are stored as one space-separated string; df_articles holds
    # article_id as integers, so convert before matching.
    articles = [int(s) for s in row['article_id'].split()]
    df_articles_cust = df_articles.loc[df_articles['article_id'].isin(articles)]
    print(df_articles_cust.shape)
    print(df_articles_cust)
    

(2, 25)
       article_id  product_code                 prod_name  product_type_no  \
61175   732413001        732413       PANORAMA sports bra              306   
64525   742079001        742079  Panorama mid support bra              306   

      product_type_name product_group_name  graphical_appearance_no  \
61175               Bra          Underwear                  1010016   
64525               Bra          Underwear                  1010016   

      graphical_appearance_name  colour_group_code colour_group_name  \
61175                     Solid                  9             Black   
64525                     Solid                  9             Black   

       perceived_colour_value_id perceived_colour_value_name  \
61175                          4                        Dark   
64525                          4                        Dark   

       perceived_colour_master_id perceived_colour_master_name  department_no  \
61175                           5                   

In [39]:
# Catalogue rows with product type 306 / 'Bra', product group 'Underwear' and
# department 'Ladies Sport Bras'.
sport_bra_rows = (
    df_articles['product_type_no'].eq(306)
    & df_articles['product_type_name'].eq('Bra')
    & df_articles['product_group_name'].eq('Underwear')
    & df_articles['department_name'].isin(['Ladies Sport Bras'])
)
df_articles[sport_bra_rows]

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
3182,449744019,449744,Karin Medium Support Bra,306,Bra,Underwear,1010016,Solid,52,Pink,1,Dusty Light,1,Mole,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...
3183,449744020,449744,Karin Medium Support Bra,306,Bra,Underwear,1010016,Solid,73,Dark Blue,4,Dark,2,Blue,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...
3787,466381010,466381,Greenville bra (1),306,Bra,Underwear,1010016,Solid,61,Light Purple,3,Light,4,Pink,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...
3788,466381011,466381,Greenville bra (1),306,Bra,Underwear,1010016,Solid,83,Dark Turquoise,4,Dark,7,Turquoise,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...
3789,466381012,466381,Greenville bra (1),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99186,882902001,882902,Orange Push Bralette,306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bralette in fast-drying functional fabr...
99187,882902003,882902,Orange Push Bralette,306,Bra,Underwear,1010016,Solid,92,Green,2,Medium Dusty,12,Grey,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bralette in fast-drying functional fabr...
101107,894355001,894355,lucy Zipped Bra Conscious,306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Fully lined sports bra in fast-drying function...
103330,910949002,910949,sidney bonded highsupport bra,306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,8316,Ladies Sport Bras,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports bra in fast-drying functional fabric wi...


In [40]:
def get_articles_similar_to_article(article, articles_df=None):
    """Summarise the catalogue attributes of one or more articles.

    Parameters
    ----------
    article : int or iterable of int
        A single article id or a collection of ids. A bare id is wrapped in a
        list because ``Series.isin`` only accepts list-like values.
    articles_df : pandas.DataFrame, optional
        Catalogue to search. Defaults to the notebook-level ``df_articles``.

    Returns
    -------
    dict
        ``'prod_name'`` maps to the set of lower-cased words in the first
        matching row's product name; every attribute column listed below maps
        to a ``{value: count}`` dict over the matching rows. An empty dict is
        returned when no catalogue row matches.
    """
    if articles_df is None:
        articles_df = df_articles

    wanted = [article] if np.isscalar(article) else list(article)
    df_articles_cust = articles_df.loc[
        articles_df['article_id'].isin(wanted)
    ].reset_index(drop=True)
    print(df_articles_cust.shape)

    vc_dict = {}
    if df_articles_cust.empty:
        # Nothing to summarise; .loc[0, ...] below would raise a KeyError.
        return vc_dict

    vc_dict['prod_name'] = set(df_articles_cust.loc[0, 'prod_name'].lower().split())

    attribute_columns = (
        'product_type_no',
        'product_group_name',
        'graphical_appearance_no',
        'colour_group_code',
        'perceived_colour_value_id',
        'perceived_colour_master_id',
        'department_no',
        'index_code',
        'index_group_no',
        'section_no',
        'garment_group_no',
    )
    for column in attribute_columns:
        vc_dict[column] = df_articles_cust[column].value_counts().to_dict()

    # The summary used to be built and then discarded; return it to the caller.
    return vc_dict

In [None]:
# Build a 12-item recommendation string for every customer in agg_df_l12_1_2
# (customers with exactly two article ids) and write it back to that frame's
# 'article_id' column.
#
# Strategy per customer:
#   1. Both ids identical      -> repeat that article until the list is full.
#   2. Otherwise find catalogue articles whose product name contains every word
#      the two product names have in common.
#   3. If more than SMALL_MATCH articles match, narrow them further to those
#      sharing every attribute value the two purchased articles share.
#   4. Append the candidates (skipping ids already in the list) and fill any
#      remaining slots from art_list, the popular-article fallback.
agg_df_l12_1_2_copy = agg_df_l12_1_2.copy()

N_RECS = 12        # length every recommendation list is filled up to
SMALL_MATCH = 10   # name matches at or below this count are used without narrowing
SHARED_ATTR_COLS = [
    'product_type_no',
    'product_group_name',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]


def _article_code(article_id):
    """Format an article id as a zero-padded 10-character string."""
    return str(article_id).zfill(10)


def _top_up(picks, extra, size=N_RECS):
    """Append items of ``extra`` not yet in ``picks`` until ``picks`` holds ``size`` items."""
    seen = set(picks)
    for item in extra:
        if len(picks) >= size:
            break
        if item not in seen:
            picks.append(item)
            seen.add(item)
    return picks


# Loop-invariant work, hoisted: previously the whole catalogue was copied and
# its product names lower-cased again for every customer and every word.
prod_names_lower = df_articles['prod_name'].str.lower()
catalog_codes = df_articles['article_id'].map(_article_code).to_numpy()

# Results are collected here and written back once after the loop instead of
# one .loc assignment per customer.
new_article_ids = {}

for index, raw_ids in tqdm(agg_df_l12_1_2_copy['article_id'].items(),
                           total=len(agg_df_l12_1_2_copy)):
    articles = [int(s) for s in raw_ids.split()]
    picks = [_article_code(a) for a in articles]

    if articles[0] == articles[1]:
        # The same article bought twice: fill the list with that article.
        # (The former list comprehension around list.append() produced a list
        # of None values, i.e. "0None" ids in the output.)
        picks.extend([picks[0]] * (N_RECS - len(picks)))
        new_article_ids[index] = ' '.join(picks)
        continue

    df_articles_cust = df_articles.loc[
        df_articles['article_id'].isin(articles)
    ].reset_index(drop=True)
    if len(df_articles_cust) < 2:
        # At least one id is missing from the catalogue, so there is nothing
        # to compare; fall back to the popular articles instead of raising.
        new_article_ids[index] = ' '.join(_top_up(picks, art_list))
        continue
    first, second = df_articles_cust.iloc[0], df_articles_cust.iloc[1]

    # Words the two product names have in common (case-insensitive).
    shared_words = (
        set(first['prod_name'].lower().split())
        & set(second['prod_name'].lower().split())
    )

    # Catalogue rows whose product name contains every shared word.
    match = np.ones(len(df_articles), dtype=bool)
    for word in shared_words:
        word = word.replace('(', '').replace(')', '')
        if word == '+':
            continue
        # regex=False: name fragments are literal text, so characters such as
        # '*', '?' or '[' must not be interpreted as a pattern.
        match &= prod_names_lower.str.contains(
            word, regex=False, na=False
        ).to_numpy(dtype=bool)

    if match.sum() > SMALL_MATCH:
        # Too many name matches: keep only articles that agree with the
        # purchased pair on every attribute the pair itself agrees on.
        for col in SHARED_ATTR_COLS:
            if first[col] == second[col]:
                match &= (df_articles[col] == first[col]).to_numpy()

    # Candidates in catalogue order, then popular articles for any gap.
    # _top_up skips ids already present, so the customer's own two articles
    # (which always match their own name words) are not added a second time,
    # and it stops at N_RECS, so every customer gets a full list written back;
    # lists that reached exactly 12 items used to be dropped silently.
    _top_up(picks, catalog_codes[match])
    _top_up(picks, art_list)
    new_article_ids[index] = ' '.join(picks)

if new_article_ids:
    agg_df_l12_1_2.loc[list(new_article_ids), 'article_id'] = list(new_article_ids.values())

0it [00:00, ?it/s]

In [None]:
# (rows, columns) of the article catalogue.
df_articles.shape

In [None]:
# Look up the catalogue row(s) whose article_id equals 742079001.
df_articles.loc[df_articles['article_id'] == 742079001]

In [None]:
# Print the column names, dtypes and non-null counts of df_articles.
df_articles.info()

In [None]:
# Show the catalogue row at position 64525 (positional lookup, not an article_id).
df_articles.iloc[64525]