In [None]:
import ast
import re
import time
from collections import Counter

import numpy as np
import pandas as pd
from IPython.display import display, HTML
from tqdm.auto import tqdm
from unidecode import unidecode
display(HTML("<style>.container { width:100% !important; }</style>")) # stretch the Jupyter notebook container to the full page width
pd.set_option('display.max_colwidth', None) # do not truncate cell contents when a DataFrame is displayed

# 0- Fetch and clean Data

In [None]:
# Set up File names


data_niq_path = r'data/pm_result_cosmetics.csv' #'breadcrumb_fr', product_name
regex_model_path = r'data/regex_model_cosmetics_full.csv' 

data_extract_path = r'data/training_dataset_cosmetics.csv' # 'product_concat','breadcrumb_aggregate'
#df_volume_checks_path = r'data/volume_checks_cosmetics_full.csv'





# Thresholds read by the (currently disabled) regex-model building cell.
# perc_max_prod_catch_from_other_bc: a regex is dropped when
#   (products it matches in OTHER breadcrumbs) / (its count in its own breadcrumb) exceeds this value.
perc_max_prod_catch_from_other_bc = 0.01 # previous value: 0.05
# regex_count_threshold: a regex is kept only when its count inside a breadcrumb is >= this value.
regex_count_threshold = 10 # >= 2



#### list breadcrumbs inside our Fox Categ - in French and with 4 levels ####

category_breadcrumb = [
"beaute et parfum_maquillage_yeux_kits",
"beaute et parfum_maquillage_teint_kits",
"beaute et parfum_maquillage_levres_kits",
"beaute et parfum_maquillage_teint_blush",
"beaute et parfum_maquillage_levres_gloss",
"beaute et parfum_maquillage_yeux_mascara",
"beaute et parfum_maquillage_corps_bronzer",
"beaute et parfum_maquillage_teint_bronzer",
"beaute et parfum_maquillage_yeux_sourcils",
"beaute et parfum_maquillage_yeux_eye-liner",
"beaute et parfum_maquillage_yeux_faux cils",
"beaute et parfum_maquillage_teint_correcteurs",
"beaute et parfum_maquillage_corps_illuminateur",
"beaute et parfum_maquillage_levres_crayon a levre",
"beaute et parfum_maquillage_levres_rouge a levres",
"beaute et parfum_maquillage_yeux_fard a paupieres",
"beaute et parfum_maquillage_coffrets de maquillage",
"beaute et parfum_maquillage_palettes de maquillage",
"beaute et parfum_maquillage_teint_poudre de visage",
"beaute et parfum_maquillage_corps_poudre de finition",
"beaute et parfum_maquillage_corps_decoration de la peau",
"beaute et parfum_maquillage_levres_volumateur de levres",
"beaute et parfum_maquillage_teint_fond de teint et base",
"beaute et parfum_maquillage_corps_fond de teint - poudre",
"beaute et parfum_maquillage_corps_fond de teint - liquide",
"beaute et parfum_accessoires et outils_accessoires de maquillage et outils",
"beaute et parfum_vernis a ongles et manucure_soins pour les ongles",
"beaute et parfum_vernis a ongles et manucure_decorations et accessoires",
"beaute et parfum_vernis a ongles et manucure_faux ongles et accessoires",
"beaute et parfum_vernis a ongles et manucure_kits de soins pour les ongles",
"beaute et parfum_vernis a ongles et manucure_decorations et accessoires_dissolvant",
"beaute et parfum_vernis a ongles et manucure_faux ongles et accessoires_faux ongles",
"beaute et parfum_vernis a ongles et manucure_decorations et accessoires_vernis a ongles",
"beaute et parfum_vernis a ongles et manucure_decorations et accessoires_blanchisseur d'ongles",
"beaute et parfum_vernis a ongles et manucure_machines electriques pour manucures et pedicures",
"beaute et parfum_vernis a ongles et manucure_decorations et accessoires_decorations et accessoires"]


In [None]:
# add _uncategorized until level 3 for df_bc_regex
def add_uncategorized(bc):
    """Pad a breadcrumb with "_uncategorized" levels until it contains three "_" separators.

    A breadcrumb that already has three or more separators is returned unchanged.
    """
    # Number of "_uncategorized" suffixes needed to reach three separators.
    missing_levels = 3 - len(re.findall("_", bc))
    if 1 <= missing_levels <= 3:
        return bc + "_uncategorized" * missing_levels
    return bc

##### ONLY APPLIES WHEN SPLITTING BC IN OUT SCOPE IN REGEX MODEL GENERATION ######
# update breadcrumbs for in scope = keep full breadcrumb ; out scope = categ0_uncat_uncat_uncat
def split_bc_in_out_scope(bc, category_breadcrumb=category_breadcrumb):
    """Keep in-scope breadcrumbs as they are and collapse out-of-scope ones to their first level.

    A breadcrumb is in scope when its first level (text before the first "_") is the first
    level of at least one entry of ``category_breadcrumb``. Out-of-scope breadcrumbs become
    "<first level>_uncategorized_uncategorized_uncategorized".
    """
    first_level = bc.split("_")[0]
    in_scope_first_levels = {scoped_bc.split("_")[0] for scoped_bc in category_breadcrumb}

    if first_level in in_scope_first_levels:
        return bc
    return f'{first_level}_uncategorized_uncategorized_uncategorized'

In [None]:
# Load the file sent by NIQ (product_name + breadcrumb_fr) and normalise it:
# rename the breadcrumb column, drop duplicated / incomplete rows, lowercase every value.
data_niq = (
    pd.read_csv(data_niq_path)
    .rename(columns={'breadcrumb_fr': 'breadcrumb'})
    .drop_duplicates()
    .dropna()                                          # rows with any missing value
    .dropna(subset=['product_name'])                   # explicit guard on product_name
    .apply(lambda column: column.astype(str).str.lower())
)

# Pad breadcrumbs with "_uncategorized", then collapse the out-of-scope ones
##### THE SECOND STEP ONLY APPLIES WHEN SPLITTING BC IN OUT SCOPE IN REGEX MODEL GENERATION ######
data_niq['breadcrumb'] = (
    data_niq['breadcrumb']
    .apply(add_uncategorized)
    .apply(split_bc_in_out_scope)
)

# 1- Build the regex model

In [None]:
# function to clean product_name
def product_cleaner(product_name):
    """Return a cleaned version of ``product_name`` made of lowercase ASCII words.

    The name is lowercased and transliterated to ASCII with ``unidecode``; then every
    pattern below is replaced, in order, by a single space.
    """
    # Order matters: each pattern runs on the output of the previous one.
    blanked_patterns = (
        r"_|[^\w\s]+",                                          # underscores and special chars
        r"\d",                                                  # digits
        r"\b.\b",                                               # isolated single characters
        r"\b(cm|kg|cl|ml)\b",                                   # stopwords: units
        r"\b(and|for|with|per|or|to|by|not|in|all|the)\b",      # stopwords: EN
        r"\b(par|de|sous|ou|et|en|la|le|au)\b",                 # stopwords: FR
        r"\b(und|mit|der|das|die)\b",                           # stopwords: DE
        r"\b(sin|con|di|da|el|del|para|por|el)\b",              # stopwords: ES & IT
        r"\s+",                                                 # runs of whitespace -> one space
    )

    cleaned_name = unidecode(product_name.lower())  # e.g. é -> e , ù -> u
    for pattern in blanked_patterns:
        cleaned_name = re.sub(pattern, " ", cleaned_name)
    return cleaned_name

########################################################
#product_name = "hello Alex and Tom 12 ml and,,,,, +334 at $1 + yolo   in-the_place!!!!candies and french fries \\ / |in the %&pastes"
product_name = "hello,,,,, yos l de "
product_clean = product_cleaner(product_name)
print(product_clean)
########################################################

In [None]:
#function to extract all possible regex by product_name
def regex_constructor(product_name):
    """Build the candidate regexes for a cleaned product name.

    For "how are you" the candidates are:
      * contiguous word runs joined by \\s : how, how\\sare, how\\sare\\syou, are, are\\syou, you
      * ordered word pairs joined by .*    : how.*are, how.*you, are.*you
      * first word anchored with ^ and paired with each later word : ^how.*are, ^how.*you

    Returns a list of distinct regex strings in no particular order, or None when
    ``product_name`` contains no word.
    """
    words = product_name.split()
    if not words:
        return None

    word_count = len(words)
    candidates = []

    # contiguous runs: words[start:stop] joined by \s
    for start in range(word_count):
        for stop in range(start + 1, word_count + 1):
            candidates.append('\\s'.join(words[start:stop]))

    # every ordered pair (earlier word, later word) joined by .*
    for first_pos in range(word_count):
        for second_pos in range(first_pos + 1, word_count):
            candidates.append(words[first_pos] + ".*" + words[second_pos])

    # anchored first word followed, anywhere later, by each other word
    anchored_first_word = "^" + words[0]
    for later_word in words[1:]:
        candidates.append('.*'.join([anchored_first_word, later_word]))

    # duplicates appear when a word is repeated in the product name
    return list(set(candidates))

########################################################
regex_list = regex_constructor(product_clean)
regex_list
########################################################

In [None]:
"""
%%time
# EXTRACT {bc1 : [(regex1,10), (regex2, 5)], bc10 : [(regex11,10), (regex12, 5)]} from NIQ files 
regex_count_threshold = regex_count_threshold # >= 2


df = data_niq.copy()

# create a list of all regex possibilities per bc
bc_regex = {k: [] for k in df['breadcrumb'].unique()}
for index, row in tqdm(df.iterrows(), desc='Regex List Creation'):
    bc = row['breadcrumb']
    product_name = product_cleaner(row['product_name'])  # clean the product_name
    regex_list = regex_constructor(product_name)         # build the list of possible regex for current product_name
    if regex_list is not None :
        bc_regex[bc].append(regex_list)

# from a list with all possible regex, change to a list of distinct regex with count, and remove regex with lower count than regex_count_threshold
bc_regex_count = {k: [] for k in df['breadcrumb'].unique()}
for bc, regex in tqdm(bc_regex.items(), desc='Regex List Count Creation') :
    regex_list = sum(regex, []) # replace mutliple list of values into one list {bc : [[regex1, regex2][regex3,regex4]]} to {bc : [regex1, regex2, regex3,regex4]}
    regex_count = Counter(regex_list)   # count how many distinct regex per bc
    regex_count = sorted(regex_count.items(), key=lambda x:x[1], reverse=True) #sort in descending order the values
    regex_count_with_threshold = []

    for rx in regex_count :     # remove all regex_count with minimum of n 
        if rx[1] >= regex_count_threshold:
            regex_count_with_threshold.append(rx)
  
    bc_regex_count[bc] = regex_count_with_threshold

# check if regex doesn't match product_names from other bc, and output a dict with key= bc and value= list of valid regex
data_bc_regex_valid = []
bc_regex_valid = {}
for bc, regex_list in tqdm(bc_regex_count.items(), desc='Check if no match other BC'):
    regex_valid = dict(regex_list)                # we convert from list of list of tuples to dictionary of key : value because of the pop function to use later
    df_diff_bc = df.loc[df['breadcrumb'] != bc]   # build df with different bc

    for regex in tqdm(list(regex_valid), desc=f'--Loop over regex_valid for {bc}'): # we convert regex_valid to list because the length is moving when iterating the loop
        try :
            r = re.compile(f'.*{regex}.*')
            r_list = list(filter(r.match, list(df_diff_bc['product_name'])))    # we use this regex construction to catch all matches inside a list, rather than just one string
            # previous method. very restrictiv
            #if len(r_list) > 0: # if we catch at least one string, then remove the regex from the regex_valid list
            # new method less restrictive
            #check the perc of products catch from other bc. If higher than x% then pop.
            if len(r_list) / regex_valid[regex] > perc_max_prod_catch_from_other_bc : # number of product catch in other bc / number of products catch in the bc.
                regex_valid.pop(regex)

        except re.error:    # if one regex is problematci (for example *) then remove from regex_valid list
            regex_valid.pop(regex) 
            #print(f'{regex[0]} not valid')
            continue


    regex_valid_list = [(k, v) for k, v in regex_valid.items()] # convert list of dict to list of list of tuples 
    data_bc_regex_valid.append({'breadcrumb': bc, 'regex_list':regex_valid_list})

    df_clean = pd.DataFrame.from_dict(data_bc_regex_valid)
    df_clean.to_csv(regex_model_path)

df_clean
"""

# 2- Generate the BR with Categ Extract and Regex Model


In [None]:
############################################################################################################

bc_origin = "beaute et parfum_maquillage_coffrets de maquillage_uncategorized"
bc_dest = "beaute et parfum_maquillage_yeux_mascara"


############################################################################################################

In [None]:
# Fetch the data

# Regex model: one row per breadcrumb, breadcrumbs padded with "_uncategorized"
data_bc_regex = pd.read_csv(regex_model_path).assign(
    breadcrumb=lambda frame: frame['breadcrumb'].apply(add_uncategorized)
)

# Category extract: keep the two useful columns, drop incomplete rows, rename,
# pad the breadcrumbs, lowercase everything, then drop EVERY product_name that appears more than once
data_extract = (
    pd.read_csv(data_extract_path)
    .loc[:, ['product_concat', 'breadcrumb_aggregate']]
    .dropna()
    .rename(columns={'product_concat': 'product_name', 'breadcrumb_aggregate': 'breadcrumb'})
    .assign(breadcrumb=lambda frame: frame['breadcrumb'].apply(add_uncategorized))
    .apply(lambda column: column.astype(str).str.lower())
    .drop_duplicates(subset="product_name", keep=False)
)
#data_extract

In [None]:
# BC ORIGIN (pred_breadcrumb): rows of the extract whose breadcrumb equals bc_origin
df_extract = data_extract[data_extract['breadcrumb'] == bc_origin]
df_extract

In [None]:
# BC DESTINATION (real_breadcrumb): regex-model rows for BOTH bc_origin and bc_dest
target_values = [bc_origin, bc_dest]
df_bc_regex = data_bc_regex[data_bc_regex['breadcrumb'].isin(target_values)]  
df_bc_regex

In [None]:
%%time
# Build, for every product of df_extract, the regexes of the model that match it:
# product_name | bc_origin | bc_regex_valid = {breadcrumb : [(regex, count), ...], breadcrumb : []}
df_product_regex_valid = df_extract.copy()

product_names = df_product_regex_valid['product_name'].tolist()
model_breadcrumbs = list(df_bc_regex['breadcrumb'].unique())

# One independent {breadcrumb: []} dict per product name (the lists must not be shared).
product_bc_regex = {
    name: {breadcrumb: [] for breadcrumb in model_breadcrumbs}
    for name in product_names
}

# loop over the model rows (breadcrumb : serialized list of valid regex)
for _, model_row in tqdm(df_bc_regex.iterrows()):
    serialized_regex_list = model_row['regex_list']
    # skip missing / empty cells instead of failing on them
    if not isinstance(serialized_regex_list, str) or not serialized_regex_list:
        continue

    # The cell holds the repr of a list of (regex, count) tuples read back from the CSV.
    # ast.literal_eval only accepts Python literals, so file content can never be executed
    # (which eval would allow).
    for regex in ast.literal_eval(serialized_regex_list):
        # same matching rule as before: the regex may appear anywhere in the product name
        pattern = re.compile(f'.*{regex[0]}.*')
        for name in product_names:
            if pattern.match(name):
                product_bc_regex[name][model_row['breadcrumb']].append(regex)

# attach to each product its dict {breadcrumb : list of valid regex}
df_product_regex_valid['bc_regex_valid'] = [product_bc_regex[name] for name in product_names]

df_product_regex_valid = df_product_regex_valid.rename(columns= {'breadcrumb':'bc_origin'})
df_product_regex_valid.to_csv('data/df_product_regex_valid.csv')
df_product_regex_valid

In [None]:
# Dispatch every product according to the number of breadcrumbs that have at least one matching regex:
#   0 breadcrumb  -> df_product_not_catch
#   1 breadcrumb  -> df_product_catch_sent_unique_bc
#   >1 breadcrumb -> df_product_catch_sent_several_bc
dispatch_columns = ['product_name', 'bc_origin', 'bc_regex_valid']

# Rows are collected in plain lists and turned into DataFrames once at the end:
# concatenating inside the loop copies the whole frame at every iteration.
not_catch_records = []
unique_bc_records = []
several_bc_records = []

for index, row_valid in tqdm(df_product_regex_valid.iterrows()):
    # keep only the breadcrumbs with a non-empty regex list
    caught_bc_regex = {
        caught_bc: caught_regex_list
        for caught_bc, caught_regex_list in row_valid['bc_regex_valid'].items()
        if caught_regex_list
    }
    record = {'product_name': row_valid['product_name'], 'bc_origin': row_valid['bc_origin']}

    if len(caught_bc_regex) == 0 :      # no breadcrumb caught the product
        not_catch_records.append({**record, 'bc_regex_valid': np.nan})
    elif len(caught_bc_regex) == 1 :    # exactly one breadcrumb caught the product
        unique_bc_records.append({**record, 'bc_regex_valid': caught_bc_regex})
    else :                              # several breadcrumbs caught the product
        several_bc_records.append({**record, 'bc_regex_valid': caught_bc_regex})

df_product_not_catch = pd.DataFrame(not_catch_records, columns=dispatch_columns)
df_product_catch_sent_several_bc = pd.DataFrame(several_bc_records, columns=dispatch_columns)
df_product_catch_sent_unique_bc = pd.DataFrame(unique_bc_records, columns=dispatch_columns)
# empty frame kept so the name stays defined as before; it is not filled in this cell
df_bc_regex_final = pd.DataFrame(columns=dispatch_columns)

df_product_catch_sent_unique_bc


In [None]:
### FILTER ON BC ORIGIN ###
# Clean df_product_catch_sent_unique_bc to keep only products covered by the BR :
#   - bc_origin in scope                       -> keep (Precision)
#   - bc_origin out scope & bc_dest in scope   -> keep (Recall)
#   - bc_origin out scope & bc_dest out scope  -> throw away

# Breadcrumbs of the data are always padded with "_uncategorized" (add_uncategorized), so the
# scope list must be padded the same way, otherwise its 3-level entries can never be matched.
scope_breadcrumbs = {add_uncategorized(scope_bc) for scope_bc in category_breadcrumb}

in_scope_records = []
for index, row in tqdm(df_product_catch_sent_unique_bc.iterrows()):
    # bc_regex_valid holds a single {bc_dest : regex list} entry for these products;
    # the destination is only looked at when the origin is out of scope
    if (row['bc_origin'] in scope_breadcrumbs
            or list(row['bc_regex_valid'].keys())[0] in scope_breadcrumbs) :
        in_scope_records.append({
            'product_name': row['product_name'],
            'bc_origin': row['bc_origin'],
            'bc_regex_valid': row['bc_regex_valid'],
        })

# built once from the collected rows instead of one concat per product
df_product_in_scope = pd.DataFrame(in_scope_records, columns=['product_name', 'bc_origin', 'bc_regex_valid'])
df_product_in_scope


In [None]:
### RE-FORMAT DF ###
# Re-format a new df from df_product_in_scope with product_name , bc_origin , bc_dest and regex_valid_list
df_product_bc_regex = df_product_in_scope.copy()

# Each bc_regex_valid dict holds a single {bc_dest : regex list} entry.
# The columns are built directly from it: no NaN (float) placeholder columns to overwrite with
# strings, and the configured global `bc_dest` is no longer overwritten by a loop variable.
single_entry_dicts = df_product_bc_regex['bc_regex_valid'].tolist()
df_product_bc_regex['bc_dest'] = [list(entry.keys())[0] for entry in single_entry_dicts]
# the regex list is stored as its str() representation (parsed back in the next cell)
df_product_bc_regex['regex_valid_list'] = [str(list(entry.values())[0]) for entry in single_entry_dicts]

df_product_bc_regex = df_product_bc_regex[['product_name', 'bc_origin', 'bc_dest', 'regex_valid_list']]

### KEEP ONLY MOVEMENTS ###
# We remove all rows where bc_origin and bc_dest are the same
df_product_bc_regex = df_product_bc_regex[df_product_bc_regex['bc_origin'] != df_product_bc_regex['bc_dest']]
df_product_bc_regex



In [None]:
# Transform df_product_bc_regex to bc_regex_final. We select the most common regex_valid among all products per bc.
# Build a dictionary bc_regex_valid as {bc_dest1 : [(regex,count), (regex,count)], bc_dest2 : [(regex,count), (regex,count)]}
bc_regex_valid = {}
for index, row in tqdm(df_product_bc_regex.iterrows()):
    # regex_valid_list is the str() of a list of tuples: literal_eval parses it back without
    # executing arbitrary code (unlike eval)
    product_regex_list = ast.literal_eval(row['regex_valid_list'])
    # accumulate the regex of every product under its destination breadcrumb
    # (the destination is read from the row, so the configured global `bc_dest` stays untouched)
    bc_regex_valid.setdefault(row['bc_dest'], []).extend(product_regex_list)


In [None]:
# count each iteration of regex per bc_dest to get bc_regex_valid_count = {bc_dest1 : [((regex,count),5), ((regex,count),3)], bc_dest2 : [((regex,count),12), ((regex,count),8)]}
# Counter.most_common() returns the (item, occurrences) pairs sorted by descending occurrences.
# A comprehension is used so that no loop variable overwrites the configured global `bc_dest`.
bc_regex_valid_count = {
    dest_breadcrumb: Counter(dest_regex_list).most_common()
    for dest_breadcrumb, dest_regex_list in tqdm(bc_regex_valid.items())
}


In [None]:
# For each moved product, keep the first regex (regex are sorted by descending count) that matches it.
bc_regex_final = {k: [] for k in df_product_bc_regex['bc_dest'].unique()}
for index, row in tqdm(df_product_bc_regex.iterrows()):
    selected_regex = bc_regex_final[row['bc_dest']]

    # each entry is ((regex, model_count), occurrences)
    for counted_regex in bc_regex_valid_count[row['bc_dest']] :
        pattern = counted_regex[0][0]
        if re.search(pattern, row['product_name']):
            # insertion-ordered de-duplication: the BR keeps a stable, reproducible regex order
            if pattern not in selected_regex :
                selected_regex.append(pattern)
            break

# FORMAT BUSINESS RULE

# first levels (categ_0) of the breadcrumbs in scope, without duplicates
category_0_scope = list({breadcrumb.split("_")[0] for breadcrumb in category_breadcrumb})

# Destination breadcrumbs are padded with "_uncategorized" (add_uncategorized); the scope list is
# padded the same way so that its 3-level entries can be recognised as in scope.
scope_breadcrumbs = {add_uncategorized(scope_bc) for scope_bc in category_breadcrumb}

# populate dict for 3 options : bc with regex for diff categ0 , same categ0 but diff scope , same categ0 same scope
bc_regex_diff_categ0 = {}
bc_regex_same_categ0_diff_scope = {}
bc_regex_same_categ0_same_scope = {}
for breadcrumb_destination, regex_list in tqdm(bc_regex_final.items()):
    categ0 = breadcrumb_destination.split("_")[0]

    if categ0 not in category_0_scope :
        # all destinations of an out-of-scope categ0 are merged; extending a fresh list
        # avoids mutating the lists stored in bc_regex_final
        bc_regex_diff_categ0.setdefault(categ0, []).extend(regex_list)

    elif breadcrumb_destination in scope_breadcrumbs :
        bc_regex_same_categ0_same_scope[breadcrumb_destination] = regex_list

    else :
        bc_regex_same_categ0_diff_scope[breadcrumb_destination] = regex_list

In [None]:
### REFORMAT THE REGEX LIST WITH from "x.*y|x.*z" to "x.*(y|z)""###
def format_regex_list(regex_list) :
    """Compact a list of regexes by factorising the ones that share a side of ".*".

    Regexes containing ".*" are split on their first ".*" into (prefix, suffix):
      * a prefix used with at least two suffixes becomes  prefix.*(suffix1|suffix2)
      * otherwise a suffix used with at least two prefixes becomes  (prefix1|prefix2).*suffix
      * otherwise the regex is kept unchanged
    Regexes without ".*" are kept unchanged and returned after the factorised ones.

    The input is de-duplicated first (order preserved), so a repeated regex can no longer
    produce a degenerate alternation such as "a.*(x|x)".

    Returns a new list; ``regex_list`` is not modified.
    """
    unique_regex_list = list(dict.fromkeys(regex_list))

    # (prefix, suffix) pairs of the regexes with ".*", and the regexes without it
    split_regex_list = [regex.split('.*', 1) for regex in unique_regex_list if '.*' in regex]
    regex_list_no_ast = [regex for regex in unique_regex_list if '.*' not in regex]

    # prefix -> suffixes seen with it, suffix -> prefixes seen with it
    suffixes_by_prefix = {}
    prefixes_by_suffix = {}
    for prefix, suffix in split_regex_list :
        suffixes_by_prefix.setdefault(prefix, []).append(suffix)
        prefixes_by_suffix.setdefault(suffix, []).append(prefix)

    formated_regex_list = []
    for prefix, suffix in split_regex_list :
        if len(suffixes_by_prefix[prefix]) >= 2 :
            formated_regex_list.append(prefix + ".*(" + "|".join(suffixes_by_prefix[prefix]) + ")")
        elif len(prefixes_by_suffix[suffix]) >= 2 :
            formated_regex_list.append("(" + "|".join(prefixes_by_suffix[suffix]) + ").*" + suffix)
        else :
            formated_regex_list.append(prefix + ".*" + suffix)

    # several regexes of a group produce the same factorised form: keep it once
    formated_regex_list = list(dict.fromkeys(formated_regex_list))
    return(formated_regex_list + regex_list_no_ast)
    
# Compact the regex lists of the three BR groups; each dict is rebuilt and its name re-bound.
diff_categ0 = {
    bc: format_regex_list(regex_list)
    for bc, regex_list in bc_regex_diff_categ0.items()
}
bc_regex_diff_categ0 = diff_categ0

same_categ0_diff_scope = {
    bc: format_regex_list(regex_list)
    for bc, regex_list in bc_regex_same_categ0_diff_scope.items()
}
bc_regex_same_categ0_diff_scope = same_categ0_diff_scope

same_categ0_same_scope = {
    bc: format_regex_list(regex_list)
    for bc, regex_list in bc_regex_same_categ0_same_scope.items()
}
bc_regex_same_categ0_same_scope = same_categ0_same_scope

In [None]:
########### INFO ###########
print('########### BREADCRUMBS IN OR OUT OUR REGEX MODEL ###########\n')
# dispatch the breadcrumbs of the extract : in the regex model, in the model without regex, or out of the model
bc_in = {}
bc_in_but_no_regex = {}
bc_no = []
model_breadcrumbs = set(df_bc_regex['breadcrumb'].unique())
for breadcrumb in df_extract['breadcrumb'].unique() :
    if breadcrumb not in model_breadcrumbs :
        bc_no.append(breadcrumb)
        continue

    # first model row of this breadcrumb; its regex_list is the repr of a Python list, parsed once
    # with literal_eval (literals only, nothing from the CSV can be executed)
    serialized_regex_list = df_bc_regex['regex_list'].loc[df_bc_regex['breadcrumb']==breadcrumb].values[0]
    regex_count = len(ast.literal_eval(serialized_regex_list))
    if regex_count > 0 :
        bc_in[breadcrumb] = regex_count
    else :
        bc_in_but_no_regex[breadcrumb] = regex_count

print('Breadcrumbs IN our regex model')
for breadcrumb, regex_count in bc_in.items():
    print(f"-- {breadcrumb} -- is in the regex model with {regex_count} regex combination")
print('\nBreadcrumbs in our regex model but has no regex :')
for breadcrumb, regex_count in bc_in_but_no_regex.items():
    print(f"-- {breadcrumb} -- is in the regex model but with {regex_count} regex combination")
print('\nBreadcrumbs OUT our regex model :')
for breadcrumb in bc_no:
    print(f"-- {breadcrumb} -- is not in the regex model")


print('\n########### PRODUCTS SUMMARY PER BREADCRUMB ###########')
# count number of products per breadcrumb
# df_product_catch_sent_unique_bc # df_product_catch_sent_several_bc # df_product_not_catch 
def _count_products_from(products, bc_origin):
    """Number of rows of ``products`` whose bc_origin equals ``bc_origin``."""
    return len(products.loc[products["bc_origin"] == bc_origin])

total_products_sent_one_bc = 0
total_products_sent_several_bc = 0
total_products_not_catch = 0

for breadcrumb in category_breadcrumb:
    # bc_origin values are padded with "_uncategorized": pad the scope breadcrumb the same way
    # before comparing, otherwise 3-level scope entries always count 0 products
    padded_breadcrumb = add_uncategorized(breadcrumb)
    products_sent_one_bc = _count_products_from(df_product_catch_sent_unique_bc, padded_breadcrumb)
    products_sent_several_bc = _count_products_from(df_product_catch_sent_several_bc, padded_breadcrumb)
    products_not_catch = _count_products_from(df_product_not_catch, padded_breadcrumb)

    total_products_sent_one_bc += products_sent_one_bc
    total_products_sent_several_bc += products_sent_several_bc
    total_products_not_catch += products_not_catch

    print(f'\n{breadcrumb} -->')
    print(f'Number of products catch and sent to one breadcrumb - Handled in the BR  : {products_sent_one_bc}')
    print(f'Number of products catch and sent to several breadcrumb - Not handled in the BR : {products_sent_several_bc}')
    print(f'Number of products not catch - Not handled in the BR : {products_not_catch}')

print('\nTOTAL movements -->')
print(f'TOTAL of products catch and sent to one breadcrumb - Handled in the BR  : {total_products_sent_one_bc}')
print(f'TOTAL of products catch and sent to several breadcrumb - Not handled in the BR : {total_products_sent_several_bc}')
print(f'TOTAL of products not catch - Not handled in the BR : {total_products_not_catch}')


########### BR FORMAT ###########
def _print_br_case(target_breadcrumb, regex_list):
    """Print one WHEN/THEN clause sending the products matching any regex to ``target_breadcrumb``."""
    if regex_list :
        regex = '|'.join(regex_list)
        print(f'''-- {target_breadcrumb}
WHEN REGEXP_CONTAINS(product_name_description, r"{regex}")
  THEN "{target_breadcrumb}" \n''')

print('\n########### BR FORMAT ###########\n')
print('CASE\n')
# loop over different bc_regex and build template for business rule
print('-- PROJECTIONS FOR DIFFERENT CATEG0')
for categ0, regex_list in bc_regex_diff_categ0.items():
    _print_br_case(f'{categ0}_uncategorized_uncategorized_uncategorized', regex_list)

print('\n-- PROJECTIONS FOR SAME CATEG0 BUT DIFFERENT SCOPE')
for breadcrumb, regex_list in bc_regex_same_categ0_diff_scope.items():
    _print_br_case(breadcrumb, regex_list)

print('\n-- PROJECTIONS FOR SAME CATEG0 AND SAME SCOPE')
for breadcrumb, regex_list in bc_regex_same_categ0_same_scope.items():
    _print_br_case(breadcrumb, regex_list)

print('''\n\nELSE NULL
END''')

        

In [None]:
"""
## INIT THE LOOP 
#df_volume_checks = pd.DataFrame(columns=['perc_products_sent_one_bc', 'perc_products_sent_several_bc', 'perc_products_not_catch', 'perc_max_prod_catch_from_other_bc'])


# CHECK VOLUMES IN LOOP
total_products = total_products_sent_one_bc + total_products_sent_several_bc + total_products_not_catch

perc_products_sent_one_bc = total_products_sent_one_bc / total_products
perc_products_sent_several_bc = total_products_sent_several_bc / total_products
perc_products_not_catch = total_products_not_catch / total_products

df_newrow = pd.DataFrame({'perc_products_sent_one_bc': [perc_products_sent_one_bc],
                    'perc_products_sent_several_bc': [perc_products_sent_several_bc],
                         'perc_products_not_catch': [perc_products_not_catch],
                         'perc_max_prod_catch_from_other_bc' : [perc_max_prod_catch_from_other_bc]})


df_volume_checks=pd.concat([df_volume_checks,df_newrow], axis=0)

df_volume_checks.to_csv(df_volume_checks_path)

df_volume_checks
"""

In [None]:
len(df_extract)

In [None]:
import matplotlib.pyplot as plt

# Volume-check curves: one colour per run, '.' = products sent to one bc, 'x' = sent to several bc
df1 = pd.read_csv('data/df_volume_checks_top100_r1.csv')
df2 = pd.read_csv('data/df_volume_checks_top100_r2.csv')
df3 = pd.read_csv('data/df_volume_checks_full_loop_r3.csv')

plt.rcParams["figure.figsize"] = (10,5)

# NOTE(review): the third run is read from "..._full_loop_r3.csv" but its legend says "r2 " — confirm the label
volume_check_runs = (
    (df1, 'green', "r1 Top 100"),
    (df2, 'orange', "r2 Top 100"),
    (df3, 'blue', "r2 "),
)
for run_frame, run_color, run_label in volume_check_runs:
    wrong_catch_pct = run_frame['perc_max_prod_catch_from_other_bc']*100
    sent_one_bc_pct = run_frame['perc_products_sent_one_bc']/len(df_extract)*100
    sent_several_bc_pct = run_frame['perc_products_sent_several_bc']/len(df_extract)*100
    plt.plot(wrong_catch_pct, sent_one_bc_pct, color=run_color, marker='.', label=f"Products sent to one bc {run_label}")
    plt.plot(wrong_catch_pct, sent_several_bc_pct, color=run_color, marker='x', label=f"Products sent to several bc {run_label}")

plt.title('Regex model precision VS Volume moved by the BR', fontsize=20)
plt.xlabel('Percentage of wrong products caught by a regex (%)', fontsize=15)
plt.ylabel('Volume of products moved by the BR (%)', fontsize=15)

#plt.ylim(2000, 3000)

plt.legend()
plt.show()



In [None]:
# NOTE(review): `df` is only assigned inside the disabled (string-quoted) regex-model cell, so on a
# fresh kernel this line presumably raises NameError — confirm, then remove it or display another frame.
df