#### Initial configurations

In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('inputs/data_storeproduct_202208111119.csv', usecols = ['id', 'name'])

In [3]:
df.shape

(820512, 2)

In [4]:
df.drop_duplicates(subset = 'name', inplace = True)

In [5]:
df.shape

(306052, 2)

In [6]:
df['name'] = df['name'].str.upper()

In [7]:
df['clean'] = df['name']

---

### Removes initial characters

In [8]:
def clean_initial_characters(name):
    chars = ['!', '%', '*', '+', ',', '-', '.', '/', '<', '=']

    try:
        for word in name.split()[0]:
            for char_ in word:
                if char_ in chars:
                    name = name.replace(char_, '')
    except:
        name = name
    
    return name    

In [9]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_characters(x))

In [10]:
# # Tested Items
# df.loc[[481395, 295715, 755477, 178663, 484751, 239320, 28118, 280929, 170862, 106470]]

---

### Removes initial digits

In [11]:
def clean_initial_digits(name):
    try:
        first_word = name.split()[0]

        if first_word[0].isdigit():
            for char_ in first_word:
                if char_.isdigit():
                    name = name.replace(char_, '')
    except:
        name = ''

    return name

In [12]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_digits(x))

In [13]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_characters(x))

In [14]:
# # Tested Items
# df.loc[[743024,638551, 682945, 545029, 337465, 743128, 752302, 337317, 743028]]

---

### Removes parentheses

In [15]:
df['clean'] = df['clean'].apply(lambda x: re.sub(r'\([^()]*\)', '', x))

In [16]:
# # Tested Items
# df.loc[[743024, 270107, 210205, 295409, 295406, 764575, 208980, 205859, 764574, 796994, 744286]]

---

### Removes units

In [34]:
def clean_units(name):
    units = [
        'U', 'G', 'V', 'P', 'L', 'M', 
        'GR', 'KG', 'ML', 'UN', 'CM', 'MG', 'CP', 'MM', 'CM', 'PC', 
        'CPS', 'CPR', 'VOL', 'CAP', 'UND', 'GRS', 
        'CAPS', 'UNID', 'UNDS', 'CPRS', 'COMP', 
        'UNID.', 'SACHES' 
    ]
    to_remove = []
    words = name.split()

    for word in words:
        for unit in units:
            if word == unit:
                to_remove.append(word)
                
                try:
                    previous = words[words.index(word) - 1] 
                
                    # VERIFICAR SE A MAIORIA DO ANTERIOR É DIGITO
                    # CASO SEJA, REMOVER
                    if previous.isdigit():
                        to_remove.append(previous)
                except:
                    pass
                
                break
            if word[-len(unit):] == unit and word[:-len(unit)].isdigit():
                to_remove.append(word)
                break

    return ' '.join([word for word in words if word not in to_remove])

In [18]:
df['clean'] = df['clean'].apply(lambda x: clean_units(x))

In [35]:
# Tested Items
df.loc[[
    363199, 635298, 40689,  740467, 213141, 679374, 288780, 
    810939, 510231, 648258, 704156, 178001, 413533, 785982, 
    340359, 211457, 565330, 739959, 491207, 598294, 212840, 
    585916, 575283, 786004, 545491, 646295, 734862, 353142, 51607,
    624800, 256973, 713142, 536849, 165346, 214062, 685358, 
    520570, 120444, 685071, 712342, 484879, 685358, 765239, 
    658096, 729729, 545046, 507365, 444408, 29568,  168168, 
    220269, 238971, 487450, 726061, 696019, 122934, 649598, 350363
]]

Unnamed: 0,id,name,clean,had_changes,num_words
363199,297978,ALBOCRESIL GEL C/ APL 50G,ALBOCRESIL GEL,True,2
635298,414472,ABS INT DAYS SU ANTIBAC 40U,ABS INT DAYS SU ANTIBAC,True,5
40689,25820,ABAJUR MINI BARBIE 240V,ABAJUR MINI BARBIE,True,3
740467,329789,UNITRAM 15 MG 30P,UNITRAM,True,1
213141,63541,"CHOPP HEINEKEN 5,00L","CHOPP HEINEKEN 5,00L",False,3
679374,302413,ACETONA DONNA 100M,ACETONA DONNA,True,2
288780,399255,ACICLOVIR CR 10GR,ACICLOVIR CR,True,2
810939,451453,ACUCAR CRISTAL FORTUNA 5KG,ACUCAR CRISTAL FORTUNA,True,3
510231,468584,A CURITYBINA 5ML,A CURITYBINA,True,2
648258,367041,ABS ALWAYS BASICO SECA C/A 16UN,ABS ALWAYS BASICO SECA,True,4


---

### Removes abbreviations

In [21]:
def clean_abbrs(name):
    abbrs = ['REF', 'REF.', 'C/', 'S/', 'P/', 'FPS']
    to_remove = []
    words = name.split()

    for word in words:
        if word in abbrs:
            to_remove.append(word)
            try:
                to_remove.append(words[words.index(word) + 1])
            except:
                pass
        elif word.startswith(tuple(abbrs)):
            for abbr in abbrs:
                if abbr == 'REF' == word[:3] and word[3] != '.' and not word[len(abbr):][0].isdigit():
                    break
                else:
                    to_remove.append(word)

    return ' '.join([word for word in words if word not in to_remove])

In [22]:
df['clean'] = df['clean'].apply(lambda x: clean_abbrs(x))

In [23]:
# Tested Items
df.loc[[
    507889, 308443, 814522, 708421, 800889, 773057, 671045, 583425, 582548, 700178, 
    318441, 318442, 214566, 632491, 480583, 733038, 407674, 658417, 310619, 648318
]]

Unnamed: 0,id,name,clean
507889,217577,"AGULHA 1,20X40 MM REF 07455","AGULHA 1,20X40"
308443,3802,BUCHA VEGETAL OVAL NINA REF3020,BUCHA VEGETAL OVAL NINA
814522,1019445,CAMINHAO INVICTUS BASCULANTE REF. 1042,CAMINHAO INVICTUS BASCULANTE
708421,244806,BUCHA VEGETAL ZALIKE JUNIOR REF.325,BUCHA VEGETAL ZALIKE JUNIOR
800889,425125,CAMOMINE C BABY C/ 20 CP,CAMOMINE C BABY
773057,421983,CAMOMINE C BABY C/20 CAPS,CAMOMINE C BABY
671045,52374,CERAVE GEL DE LIMPEZA S/ PERF 454ML,CERAVE GEL DE LIMPEZA
583425,439330,CENTRO ACTIVE SENIOR S/SABOR,CENTRO ACTIVE SENIOR
582548,481998,CENOURA & BRONZE FPS 15,CENOURA & BRONZE
700178,349386,CAPITAL SOLEIL FPS30 T SEC 40G,CAPITAL SOLEIL T SEC


---

### Creates a CSV file with cleaned names

In [24]:
df['clean'] = df['clean'].str.strip()

In [25]:
df['had_changes'] = df['clean'] != df['name']

In [26]:
df['num_words'] = df['clean'].apply(lambda x: len(str(x).split()))

In [27]:
df.sort_values(by = 'clean', inplace = True)

In [28]:
df.to_csv('outputs/clean.csv', index = False)

---

### Metrics

In [29]:
print(f"Items\n{'-' * len('Items')}")
print(f"Total: {df.shape[0]}")
print(f"Changed: {len(df[df['had_changes'] == True])}")
print(f"Not Changed: {len(df[df['had_changes'] == False])}")

# TOTAL     CHANGED     NOT CHANGED
# 306.052   36.829      269.223
# 306.052   91.447      214.605
# 306.052   232.885     73.167

Items
-----
Total: 306052
Changed: 228912
Not Changed: 77140


In [30]:
print(f"Unique Items\n{'-' * len('Unique Items')}")
print(f"Original dataset: {df['name'].nunique()}")
print(f"Cleaned dataset: {df['clean'].nunique()}")
print(f"Unified items: {df['name'].nunique() - df['clean'].nunique()}")

# ORIGINAL      CLEANED     UNIFIED
# 306.051       303.710     2.341
# 306.051       297.114     8.937
# 306.051       259.696     46.355

Unique Items
------------
Original dataset: 306051
Cleaned dataset: 261375
Unified items: 44676


---

In [31]:
# df[df['id'].isin([69967, 295119, 897800, 383978, 485193, 315322])]

---

In [32]:
# Unique
# unique_ = ['COMP', 'CAPS', 'CAPS.', 'CX', 'CX.', 'INJ', 'UN', 'ML', 'PET', 'C', 'GFA']

In [33]:
# dict_ = {
#     'TX': 'TAXA',
#     'ABS': 'ABSORVENTE',
#     'ESP': 'ESPARADRAPO',
#     'SAB': 'SABONETE',
#     'DES': 'DESODORANTE',
#     'ESC': 'ESCOVA',
#     'LOC': 'LOCAO',
#     'PERF': 'PERFUME',
#     'TRAD': 'TRADICIONAL',
#     'CURAT': 'CURATIVO',
# }