This file is the popular base recall. In this recall, we use 3 popular algorithms:
1.the most popular items
2.the most popular items in the most popular product type
3.the most popular items in the most popular product group type

In [8]:
import pandas as pd
import numpy as np

#read data into memory
transactions_train=pd.read_feather("./data/h-and-m-personalized-fashion-recommendations/transactions_train.feature")[['customer_id', 'article_id']].drop_duplicates().reset_index(drop=True)
transactions_train
cust=transactions_train.customer_id.unique()

Unnamed: 0,customer_id,article_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0541518023
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,0505221004
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,0685687003
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,0685687004
...,...,...
27306434,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,0856440002
27306435,fff2282977442e327b45d8c89afde25617d00124d0f999...,0929511001
27306436,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,0918325001
27306437,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,0833459002


In [4]:
def candidate_score(candidate_df, valid_df):
    """
    This function is used to calculate the validation recall scores,including:
    Recall, Precision, MAP12, Full Covered Customer, Not Covered Customer, Candidate multiplier
    :param candidate_df: The DataFrame validated
    :param valid_df: The validation DataFrame
    :return: all the recall scores in a tuple
    """
    both_df = candidate_df.merge(valid_df, on = ['customer_id', 'article_id'], how = 'inner')#inner connection
    # if the customer have >=12 just count 12 
    trgt_cnt_df = valid_df.groupby('customer_id', as_index = False).agg({'article_id' :'count'}).\
        rename(columns = {'article_id':'trgt_cnt'})
    trgt_cnt_df.loc[trgt_cnt_df['trgt_cnt']>= 12, 'trgt_cnt'] = 12 
    both_cnt_df = both_df.groupby('customer_id', as_index = False).agg({'article_id' :'count'}).\
        rename(columns = {'article_id':'both_cnt'})
    both_cnt_df.loc[both_cnt_df['both_cnt']>= 12, 'both_cnt'] = 12

    trgt_cnt_df = trgt_cnt_df.merge(both_cnt_df, on = 'customer_id', how = 'left') 
    trgt_cnt_df.fillna(0, inplace = True)
    can_df=candidate_df.groupby('customer_id', as_index = False).agg({'article_id' :'count'}).\
        rename(columns = {'article_id':'can_cnt'})
    # assume it is optimally sorted 
    trgt_cnt_df['AP12'] = trgt_cnt_df['both_cnt'] / trgt_cnt_df['trgt_cnt']
    max_map12 = trgt_cnt_df['AP12'].mean()
    recall=trgt_cnt_df['both_cnt'].sum()/trgt_cnt_df['trgt_cnt'].sum()
    precision=trgt_cnt_df['both_cnt'].sum()/can_df['can_cnt'].sum()
    f1=(2*recall*precision)/(recall+precision)
    full_covered_cust = len(trgt_cnt_df.loc[trgt_cnt_df['AP12'] == 1])
    not_covered_cust = len(trgt_cnt_df.loc[trgt_cnt_df['AP12'] == 0])
    num_target_cust = len(trgt_cnt_df)
    num_candidate = len(candidate_df)
    num_target = len(valid_df)
    num_unq_artc = len(candidate_df['article_id'].unique())
    
    print("Recall:{0},Precision:{1},F1:{2}".format(recall,precision,f1))
    print(f"MAX MAP12: {round(max_map12, 4)}")
    print(f"Full Covered Customer: {round(full_covered_cust / num_target_cust, 4)} ({full_covered_cust} / {num_target_cust}) ")
    print(f"Not Covered Customer: {round(not_covered_cust / num_target_cust, 4)} ({not_covered_cust} / {num_target_cust}) ")
    print(f"Candidate multiplier: {round(num_candidate / num_target, 4)} ({num_candidate} / {num_target})")
    print(f"Unique Article Id: {num_unq_artc}")
    
    return max_map12, full_covered_cust / num_target_cust, not_covered_cust / num_target_cust, num_candidate / num_target ,num_unq_artc

In [3]:
#12 most popular articles in 4 weeks
train1 = transactions_train.loc[transactions_train.t_dat >= transactions_train.t_dat.max()- pd.Timedelta(weeks=4)]
top121 = train1.article_id.value_counts().astype('str')[:12]#top121 means top12 in the first popular recall
top121=' '.join(top121.index.tolist())
top121_list=top121.split(' ')
candidate_df = pd.DataFrame({'customer_id': np.repeat(cust, len(top121_list)),
                              'article_id': top121_list * len(cust),
                              })
candidate_score(candidate_df, transactions_train)

'0751471001 0915526001 0915529003 0918292001 0751471043 0706016001 0898694001 0448509014 0909370001 0863595006 0924243001 0918522001'

In [None]:
#the most popular 4 product types
articles=pd.read_csv("./drive/MyDrive/Colab Notebooks/articles.csv",dtype={'article_id': str})
transactions_train_withType=pd.merge(transactions_train,articles[["article_id","product_type_name"]],on="article_id",how="left")
top4=transactions_train_withType.product_type_name.value_counts().index
top4=list(top4[:4])
top4

['Trousers', 'Dress', 'Sweater', 'T-shirt']

In [None]:
#Select the most popular 12 items in the the most popular 4 product types
top12=[]
for top in top4:
  articles_top4=articles[articles["product_type_name"]==top]
  transactions_train_withType=pd.merge(transactions_train,articles_top4,on="article_id",how="inner")
  top1 = transactions_train_withType.article_id.value_counts().index
  top12+=list(top1[:12])
top12=' '.join(top12)
top12

'0706016001 0706016002 0399223001 0706016003 0562245046 0562245001 0399256001 0448509014 0751471001 0573716012 0706016015 0554450001 0716348001 0401044004 0714824001 0612935009 0721298001 0762063001 0883033002 0841434001 0745475001 0880839001 0817353008 0637255001 0673677002 0537116001 0685813001 0591334003 0677930023 0673677004 0591334019 0574109011 0673638001 0679853001 0516903005 0673638007 0610776002 0610776001 0717490008 0711053003 0778064003 0685816002 0778064001 0554598001 0685816001 0806388002 0806388001 0624486001'

In [None]:
#the most popular 4 product group types
transactions_train_withType=pd.merge(transactions_train,articles[["article_id","product_group_name"]],on="article_id",how="left")
top4=transactions_train_withType.product_group_name.value_counts().index
top4=list(top4[:4])
top4

['Garment Upper body', 'Garment Lower body', 'Garment Full body', 'Swimwear']

In [None]:
#Select the most popular 12 items in the the most popular 4 product group types
top122=[]
for top in top4:
  articles_top4=articles[articles["product_group_name"]==top]
  transactions_train_withType=pd.merge(transactions_train,articles_top4,on="article_id",how="inner")
  top1 = transactions_train_withType.article_id.value_counts().index
  top122+=list(top1[:12])
top122=' '.join(top122)
top122

'0610776002 0759871002 0610776001 0568601006 0673677002 0579541001 0507909001 0572797001 0565379001 0717490008 0749699002 0678942001 0706016001 0706016002 0399223001 0706016003 0720125001 0562245046 0562245001 0399256001 0448509014 0751471001 0573716012 0158340001 0294008002 0716348001 0401044004 0714824001 0612935009 0721298001 0762063001 0883033002 0841434001 0745475001 0880839001 0817353008 0351484002 0688537004 0590928001 0599580017 0684209004 0688537011 0600886001 0684209013 0699080001 0689109001 0599580038 0723529001'

In [27]:
#merge all the results above
from tqdm import tqdm
customers=pd.read_csv("./data/h-and-m-personalized-fashion-recommendations/customers.csv")
ALL_USERS=customers.customer_id
def submit():
    preds = []
    for user in tqdm(ALL_USERS):
            preds.append((user, top12+' '+top121+' '+top122))
    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
    df_preds.to_feather("./data/Candidate_result/popular_base_total.feather")
    return df_preds
df_preds = submit()
display(df_preds)

100%|██████████| 1371980/1371980 [00:01<00:00, 1204579.94it/s]


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0399223001 0706016003 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0399223001 0706016003 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0399223001 0706016003 05...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0399223001 0706016003 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0399223001 0706016003 05...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0399223001 0706016003 05...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0399223001 0706016003 05...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0399223001 0706016003 05...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0399223001 0706016003 05...


Now we consider every user case, in this time, we use Google's Colab to accomplish the calculation

In [None]:
import pandas as pd
transactions_train=pd.read_feather("./drive/MyDrive/Colab Notebooks/transactions_train.feature")
# find the most frequent-purchased 4 product types for every user
transactions_train_withType=pd.merge(transactions_train,articles[["article_id","product_type_name"]],on="article_id",how="left")
transactions_train_customer_mostpop=transactions_train_withType.groupby("customer_id")["product_type_name"].value_counts()
transactions_train_customer_mostpop=pd.DataFrame(transactions_train_customer_mostpop)
transactions_train_customer_mostpop.rename(columns={"product_type_name":"count"},inplace=True)
transactions_train_customer_mostpop.reset_index(inplace=True)
transactions_train_customer_mostpop=transactions_train_customer_mostpop.groupby("customer_id").head(4)
transactions_train_customer_mostpop["most_pop"]=transactions_train_customer_mostpop.groupby("customer_id")["product_type_name"].transform(lambda x:",".join(x))
transactions_train_customer_mostpop.to_feather("./drive/MyDrive/Colab Notebooks/transactions_train_customer_mostpop.feather")

In [57]:
import pandas as pd
transactions_train_customer_mostpop=pd.read_feather("./drive/MyDrive/Colab Notebooks/transactions_train_customer_mostpop.feather")
articles=pd.read_csv("./drive/MyDrive/Colab Notebooks/articles.csv",dtype={'article_id': str})
poptables=pd.read_feather("./drive/MyDrive/Colab Notebooks/poptables.feather")
poptables.set_index("product_type_name",inplace=True)
transactions_train_customer_mostpop.drop_duplicates(subset=["customer_id"],inplace=True)
transactions_train_customer_mostpop

In [58]:
#Convert the dataFrame to a list
mostpop_list=transactions_train_customer_mostpop.most_pop.str.split(',', expand=True).values.tolist()
mostpop_list = [list(filter(None, subset)) for subset in mostpop_list]
from tqdm import tqdm
ALL_USERS=transactions_train_customer_mostpop.customer_id

In [None]:
#Now we make a most popular items for every product type
def get_top12(x):
 """
 find the most popular 12items whose product_type is x
 :param x: product type name(string)
 :return: a item list
 """
  print(x)
  articles_top=articles[articles["product_type_name"]==x]
  transactions_train_withType=pd.merge(transactions_train,articles_top[["article_id","product_type_name"]],on="article_id",how="inner")
  top1 = transactions_train_withType.article_id.value_counts().index
  print(top1[:12])
  return top1[:12]

articles=pd.read_csv("./drive/MyDrive/Colab Notebooks/articles.csv",dtype={'article_id': str})
poptables=pd.DataFrame(transactions_train_customer_mostpop.product_type_name.unique(),columns=["product_type_name"])
poptables["most_pop_items"]=poptables["product_type_name"].transform(lambda x:' '.join(get_top12(x).tolist()))
poptables.to_feather("./drive/MyDrive/Colab Notebooks/poptables.feather")

In [59]:
#use the popular table to generate the final result
def submit():
    preds = []
    for i in tqdm(range(len(ALL_USERS))):
      user=ALL_USERS.iloc[i]
      article_list=[]
      pop_type=mostpop_list[i]
      article_2_list=poptables.loc[pop_type].values.tolist()
      article_list=" ".join([article for articles in article_2_list for article in articles])
      preds.append((user, article_list))
    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
    df_preds.to_feather("./drive/MyDrive/Colab Notebooks/popular_base_every.feather")
    return df_preds
df_preds = submit()
display(df_preds)

100%|██████████| 1362281/1362281 [08:44<00:00, 2598.58it/s]


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0568601007 0781613006 0636455003 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0590928001 0684209004 0684209013 0699080001 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0673677002 0537116001 0685813001 0591334003 06...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0723469001 0564786001 0579302001 0736530007 02...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0487722001 0455832001 0793949002 0487722008 05...
...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0464297007 0719655001 0611415001 0507883009 06...
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0610776002 0610776001 0717490008 0711053003 07...
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0399223001 0706016003 05...
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0716348001 0401044004 0714824001 0612935009 07...


Now we consider product group_name for every user, and the process is the same as the above

In [79]:
import pandas as pd
transactions_train=pd.read_feather("./drive/MyDrive/Colab Notebooks/transactions_train.feature")
transactions_train_withType=pd.merge(transactions_train,articles[["article_id","product_group_name"]],on="article_id",how="left")
transactions_train_customer_mostpop=transactions_train_withType.groupby("customer_id")["product_group_name"].value_counts()
transactions_train_customer_mostpop=pd.DataFrame(transactions_train_customer_mostpop)
transactions_train_customer_mostpop.rename(columns={"product_group_name":"count"},inplace=True)
transactions_train_customer_mostpop.reset_index(inplace=True)
transactions_train_customer_mostpop=transactions_train_customer_mostpop.groupby("customer_id").head(4)
transactions_train_customer_mostpop["most_pop"]=transactions_train_customer_mostpop.groupby("customer_id")["product_group_name"].transform(lambda x:",".join(x))
transactions_train_customer_mostpop.reset_index(inplace=True)
transactions_train_customer_mostpop.to_feather("./drive/MyDrive/Colab Notebooks/transactions_train_customer_mostpop_group.feather")
# transactions_train_customer_mostpop

In [87]:
transactions_train_customer_mostpop

Unnamed: 0,index,customer_id,product_group_name,count,most_pop
0,0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Garment Upper body,16,"Garment Upper body,Garment Full body,Garment L..."
1,1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Garment Full body,2,"Garment Upper body,Garment Full body,Garment L..."
2,2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Garment Lower body,2,"Garment Upper body,Garment Full body,Garment L..."
3,3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Accessories,1,"Garment Upper body,Garment Full body,Garment L..."
4,4,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,Swimwear,33,"Swimwear,Garment Upper body,Garment Full body,..."
...,...,...,...,...,...
3825893,4884144,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,Garment Full body,2,"Garment Full body,Underwear,Garment Lower body..."
3825894,4884145,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,Underwear,2,"Garment Full body,Underwear,Garment Lower body..."
3825895,4884146,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,Garment Lower body,1,"Garment Full body,Underwear,Garment Lower body..."
3825896,4884147,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,Garment Upper body,1,"Garment Full body,Underwear,Garment Lower body..."


In [88]:

def get_top12(x):
  print(x)
  articles_top=articles[articles["product_group_name"]==x]
  transactions_train_withType=pd.merge(transactions_train,articles_top[["article_id","product_group_name"]],on="article_id",how="inner")
  top1 = transactions_train_withType.article_id.value_counts().index
  print(top1[:12])
  return top1[:12]

articles=pd.read_csv("./drive/MyDrive/Colab Notebooks/articles.csv",dtype={'article_id': str})
poptables=pd.DataFrame(transactions_train_customer_mostpop.product_group_name.unique(),columns=["product_group_name"])
poptables["most_pop_items"]=poptables["product_group_name"].transform(lambda x:' '.join(get_top12(x).tolist()))
poptables.to_feather("./drive/MyDrive/Colab Notebooks/poptables_group.feather")

Garment Upper body
Index(['0610776002', '0759871002', '0610776001', '0568601006', '0673677002',
       '0579541001', '0507909001', '0572797001', '0565379001', '0717490008',
       '0749699002', '0678942001'],
      dtype='object')
Garment Full body
Index(['0294008002', '0716348001', '0401044004', '0714824001', '0612935009',
       '0721298001', '0762063001', '0883033002', '0841434001', '0745475001',
       '0880839001', '0817353008'],
      dtype='object')
Garment Lower body
Index(['0706016001', '0706016002', '0399223001', '0706016003', '0720125001',
       '0562245046', '0562245001', '0399256001', '0448509014', '0751471001',
       '0573716012', '0158340001'],
      dtype='object')
Accessories
Index(['0673396002', '0759465001', '0179950001', '0664405002', '0556539003',
       '0516859008', '0759469001', '0556539001', '0589748001', '0743098001',
       '0224606019', '0759482001'],
      dtype='object')
Swimwear
Index(['0351484002', '0688537004', '0590928001', '0599580017', '0684209004'

In [1]:
import pandas as pd
transactions_train_customer_mostpop=pd.read_feather("./drive/MyDrive/Colab Notebooks/transactions_train_customer_mostpop_group.feather")
transactions_train_customer_mostpop.drop_duplicates(subset=["customer_id"],inplace=True)
transactions_train_customer_mostpop.reset_index(inplace=True)
articles=pd.read_csv("./drive/MyDrive/Colab Notebooks/articles.csv",dtype={'article_id': str})
poptables=pd.read_feather("./drive/MyDrive/Colab Notebooks/poptables_group.feather")
poptables.set_index("product_group_name",inplace=True)
transactions_train_customer_mostpop.drop_duplicates(subset=["customer_id"],inplace=True)
mostpop_list=transactions_train_customer_mostpop.most_pop.str.split(',', expand=True).values.tolist()
mostpop_list = [list(filter(None, subset)) for subset in mostpop_list]
from tqdm import tqdm
ALL_USERS=transactions_train_customer_mostpop.customer_id
def submit():
    preds = []
    for i in tqdm(range(len(ALL_USERS))):
      user=ALL_USERS.iloc[i]
      article_list=[]
      pop_type=mostpop_list[i]
      article_2_list=poptables.loc[pop_type].values.tolist()
      article_list=" ".join([article for articles in article_2_list for article in articles])
      preds.append((user, article_list))
    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
    df_preds.to_feather("./drive/MyDrive/Colab Notebooks/popular_base_every_group.feather")
    return df_preds
df_preds = submit()
display(df_preds)

100%|██████████| 1362281/1362281 [08:39<00:00, 2620.06it/s]


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0610776002 0759871002 0610776001 0568601006 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0351484002 0688537004 0590928001 0599580017 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0610776002 0759871002 0610776001 0568601006 06...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0464297007 0719655001 0723469001 0611415001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0610776002 0759871002 0610776001 0568601006 06...
...,...,...
1362276,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0351484002 0688537004 0590928001 0599580017 06...
1362277,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0610776002 0759871002 0610776001 0568601006 06...
1362278,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0610776002 0759871002 0610776001 0568601006 06...
1362279,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0294008002 0716348001 0401044004 0714824001 06...
