In [70]:
import pandas as pd

In [71]:
df = pd.read_csv('inputs/data_storeproduct_202208111119.csv', usecols = ['id', 'name'])

In [72]:
df.shape

(820512, 2)

In [73]:
df.drop_duplicates(subset = 'name', inplace = True)

In [74]:
df.shape

(306052, 2)

In [75]:
df['name'] = df['name'].str.upper()

In [76]:
df['clean'] = df['name']

In [78]:
df['clean'] = \
    df['clean'].apply(
        lambda words: 
            ' '.join(
            [word for word in words.split() if not (word.startswith('(') and word.endswith(')'))]
        )
    )    

In [79]:
def remove_pairs(name, pattern):
    words = name.split()

    for k, v in enumerate(words):
        if v == pattern:
            try:
                words.remove(words[k + 1])
                words.remove(v)
            except:
                pass
                
    return ' '.join(words)

In [80]:
patterns = ['REF', 'FPS', 'C/', 'S/', 'CP']

In [81]:
for i in patterns:
    df['clean'] = df['clean'].apply(lambda x: remove_pairs(x, i))

In [82]:
def remove_metrics(name):
    to_remove = []
    
    for word in name.split():
        if word[0].isdigit() and 'X' in word and word[-1] == 'M' :
            to_remove.append(word)

    if len(to_remove) > 0:
        return ''. join([name.replace(x, '') for x in to_remove])
    
    return name

In [83]:
df['clean'] = df['clean'].apply(lambda x: remove_metrics(x))

In [84]:
def remove_startswith(pattern):
    return lambda words: ' '.join([word for word in words.split() if not word.startswith(pattern)])

In [85]:
patterns = ['C/', 'S/', 'FPS']

In [86]:
for i in patterns:
    df['clean'] = df['clean'].apply(remove_startswith(i))

In [99]:
def remove_endswith(pattern):
    return lambda words: words.replace(''.join([x for x in words.split() if x.endswith(pattern) and x[:-len(pattern)].isdigit()]), '')

In [100]:
endswith_ = ['G', 'L', 'ML', 'MG' , 'UN', 'GR', 'CP', 'CPS', 'VOL', 'CAP', 'UNID', 'UNID.', 'KG']

In [101]:
for i in endswith_:
    df['clean'] = df['clean'].apply(remove_endswith(i))

In [103]:
df['clean'] = df['clean'].str.strip()

In [104]:
df['had_changes'] = df['clean'] != df['name']

In [105]:
df['num_words'] = df['clean'].apply(lambda x: len(str(x).split()))

In [93]:
df.to_csv('outputs/clean.csv', index = False)

In [107]:
print(f"Total items: {df.shape[0]}")
print(f"Items with changes: {len(df[df['had_changes'] == True])}")
print(f"Items without changes: {len(df[df['had_changes'] == False])}")

Total items: 306052
Items with changes: 193969
Items without changes: 112083


In [108]:
df['clean'].nunique()

281252

In [106]:
df.loc[[
    10, 1371, 1526, 177283, 2252, # remove_pairs()
    3542, 210239, # remove_metrics() 
    12, 790, 2018, 208178, # remove_startswith()
    8, 65, 3810, 4262, 29601, 54775, 210436, 211906, 213106, 816883, 34, # remove_endswith()
    820265, # lambda x: x.startswith('(') and x.endswith('(') 
]]

Unnamed: 0,id,name,clean,had_changes,num_words
10,458515,BITUFO - PASSA FIO WAY C/ 30,BITUFO - PASSA FIO WAY,True,5
1371,355721,NOPLAK S/ ALCOOL 250ML,NOPLAK,True,1
1526,356920,KIT MAMADEIRA COLORS BICO RED REF 926 (CEFISA),KIT MAMADEIRA COLORS BICO RED,True,5
177283,56645,AVENE EMULSAO SOLAR TS FPS 70 40G,AVENE EMULSAO SOLAR TS,True,4
2252,376083,COD PAR 12CP,COD PAR,True,2
3542,460190,"ESP MIC CIEX 2,5CMX4,5M",ESP MIC CIEX,True,3
210239,268953,"ESPARADRAPO 25MMX4,5MM NEXCARE",ESPARADRAPO NEXCARE,True,2
12,458516,ISABABY - HAST FLEX BABY C/50,ISABABY - HAST FLEX BABY,True,5
790,356086,ABS INTIMUS NOT SUAVE C/ABAS C/16 UN,ABS INTIMUS NOT SUAVE UN,True,5
2018,357133,TENYS PE BARUEL AEROSOL S/PERFUME 110G,TENYS PE BARUEL AEROSOL,True,4


---

In [97]:
df[df['id'] == 459312]

Unnamed: 0,id,name,clean,had_changes,num_words
34,459312,INTEGRALMEDICA - SINISTER MASS POUCH 3KG VITAM...,INTEGRALMEDICA - SINISTER MASS POUCH 3KG VITAM...,False,9


In [None]:
# df[
#     df['name'].str.endswith('CP') 
#     # & df['name'].str.endswith(')')
# ]

In [None]:
# Tratar:
# METRONIDAZOL  GEL VAG BG 50G+10APLIC (NOVA QUIMICA)

In [96]:
# Unique
# unique_ = ['COMP', 'CAPS', 'CAPS.', 'CX', 'CX.', 'INJ', 'UN', 'ML', 'PET', 'C', 'GFA']

In [None]:
# dict_ = {
#     'TX': 'TAXA',
#     'ABS': 'ABSORVENTE',
#     'ESP': 'ESPARADRAPO',
#     'SAB': 'SABONETE',
#     'DES': 'DESODORANTE',
#     'ESC': 'ESCOVA',
#     'LOC': 'LOCAO',
#     'TRAD': 'TRADICIONAL',
#     'CURAT': 'CURATIVO',
# }