In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('inputs/data_storeproduct_202208111119.csv', usecols = ['id', 'name'])

In [3]:
df.shape

(820512, 2)

In [4]:
df.drop_duplicates(subset = 'name', inplace = True)

In [5]:
df.shape

(306052, 2)

In [6]:
df['name'] = df['name'].str.upper()

In [7]:
df['clean'] = df['name']

---

### Removes specific characters at the beginning of product names

In [8]:
def clean_initial_characters(name):
    chars = ['!', '%', '*', '+', ',', '-', '.', '/', '<', '=']

    try:
        for word in name.split()[0]:
            for char_ in word:
                if char_ in chars:
                    name = name.replace(char_, '')
    except:
        name = name
    
    return name    

In [9]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_characters(x))

In [10]:
# # Tested Items
# df.loc[[481395, 295715, 755477, 178663, 484751, 239320, 28118, 280929, 170862, 106470]]

---

### Removes parentheses

In [11]:
df['clean'] = df['clean'].apply(lambda x: re.sub(r'\([^()]*\)', '', x))

In [12]:
# # Tested Items
# df.loc[[743024, 270107, 210205, 295409, 295406, 764575, 208980, 205859, 764574, 796994, 744286]]

---

### Removes digits at the beginning of product names

In [13]:
def clean_initial_digits(name):
    try:
        first_word = name.split()[0]

        if first_word[0].isdigit():
            for char_ in first_word:
                if char_.isdigit():
                    name = name.replace(char_, '')
    except:
        name = ''

    return name

In [14]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_digits(x))

In [15]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_characters(x))

In [16]:
# # Tested Items
# df.loc[[743024,638551, 682945, 545029, 337465, 743128, 752302, 337317, 743028]]

---

### Removes abbreviations

In [17]:
def clean_abbrs(name):
    abbrs = ['REF', 'REF.', 'C/', 'S/', 'P/', 'FPS']
    to_remove = []
    words = name.split()

    for word in words:
        if word in abbrs:
            to_remove.append(word)
            try:
                to_remove.append(words[words.index(word) + 1])
            except:
                pass
        elif word.startswith(tuple(abbrs)):
            for abbr in abbrs:
                if word.startswith(abbr) and word[len(abbr):][0].isdigit():
                    to_remove.append(word)


    return ' '.join([word for word in words if word not in to_remove])

In [18]:
df['clean'] = df['clean'].apply(lambda x: clean_abbrs(x))

In [19]:
# Tested Items
df.loc[[
    507889, 308443, 814522, 708421, 800889, 773057, 671045, 583425, 582548, 
    700178, 318441, 318442, 214566, 632491, 480583, 733038, 407674, 658417
]]

Unnamed: 0,id,name,clean
507889,217577,"AGULHA 1,20X40 MM REF 07455","AGULHA 1,20X40 MM"
308443,3802,BUCHA VEGETAL OVAL NINA REF3020,BUCHA VEGETAL OVAL NINA
814522,1019445,CAMINHAO INVICTUS BASCULANTE REF. 1042,CAMINHAO INVICTUS BASCULANTE
708421,244806,BUCHA VEGETAL ZALIKE JUNIOR REF.325,BUCHA VEGETAL ZALIKE JUNIOR
800889,425125,CAMOMINE C BABY C/ 20 CP,CAMOMINE C BABY CP
773057,421983,CAMOMINE C BABY C/20 CAPS,CAMOMINE C BABY CAPS
671045,52374,CERAVE GEL DE LIMPEZA S/ PERF 454ML,CERAVE GEL DE LIMPEZA 454ML
583425,439330,CENTRO ACTIVE SENIOR S/SABOR,CENTRO ACTIVE SENIOR S/SABOR
582548,481998,CENOURA & BRONZE FPS 15,CENOURA & BRONZE
700178,349386,CAPITAL SOLEIL FPS30 T SEC 40G,CAPITAL SOLEIL T SEC 40G


---

### Creates a CSV file with cleaned names

In [20]:
df['clean'] = df['clean'].str.strip()

In [21]:
df['had_changes'] = df['clean'] != df['name']

In [22]:
df['num_words'] = df['clean'].apply(lambda x: len(str(x).split()))

In [23]:
df.sort_values(by = 'clean', inplace = True)

In [24]:
df.to_csv('outputs/clean.csv', index = False)

---

### Metrics

In [25]:
print(f"Items\n{'-' * len('Items')}")
print(f"Total: {df.shape[0]}")
print(f"Changed: {len(df[df['had_changes'] == True])}")
print(f"Not Changed: {len(df[df['had_changes'] == False])}")

# TOTAL     CHANGED     NOT CHANGED
# 306.052   36.829      269.223
# 306.052   87.037      219.015

Items
-----
Total: 306052
Changed: 87037
Not Changed: 219015


In [26]:
print(f"Unique Items\n{'-' * len('Unique Items')}")
print(f"Original dataset: {df['name'].nunique()}")
print(f"Cleaned dataset: {df['clean'].nunique()}")
print(f"Unified items: {df['name'].nunique() - df['clean'].nunique()}")

# ORIGINAL      CLEANED     UNIFIED
# 306.051       303.710     2.341
# 306.051       297.616     8.435

Unique Items
------------
Original dataset: 306051
Cleaned dataset: 297616
Unified items: 8435


---

In [27]:
# df[df['id'].isin([256044, 470718, 392836, 303631, 956111, 333145])]

Unnamed: 0,id,name,clean,had_changes,num_words
632491,303631,REFIL 250ML DIFUSOR BREEZE,REFIL 250ML DIFUSOR BREEZE,False,4
480583,470718,REFORGAN 20CP,REFORGAN 20CP,False,2
733038,392836,REFORMTABS 30 CP A ZAMBON .,REFORMTABS 30 CP A ZAMBON .,False,6
407674,956111,REFRESH 30 FLACONETES,REFRESH 30 FLACONETES,False,3
658417,333145,REFRIGERANTE GUARANA 350ML,REFRIGERANTE GUARANA 350ML,False,3
42054,256044,REFRIG.PEPSI 200ML,REFRIGPEPSI 200ML,True,2


In [28]:
# def remove_metrics(name):
#     to_remove = []
    
#     for word in name.split():
#         if word[0].isdigit() and 'X' in word and word[-1] == 'M':
#             to_remove.append(word)

#     if len(to_remove) > 0:
#         return ''. join([name.replace(x, '') for x in to_remove])
    
#     return name

In [29]:
# df['clean'] = df['clean'].apply(lambda x: remove_metrics(x))

In [30]:
# def remove_endswith(pattern):
#     return lambda words: words.replace(''.join([x for x in words.split() if x.endswith(pattern) and x[:-len(pattern)].isdigit()]), '')

In [31]:
# endswith_ = ['G', 'L', 'ML', 'MG' , 'UN', 'GR', 'CP', 'CPS', 'VOL', 'CAP', 'UNID', 'UNID.', 'KG']

In [32]:
# for i in endswith_:
#     df['clean'] = df['clean'].apply(remove_endswith(i))

---

In [33]:
# Unique
# unique_ = ['COMP', 'CAPS', 'CAPS.', 'CX', 'CX.', 'INJ', 'UN', 'ML', 'PET', 'C', 'GFA']

In [34]:
# dict_ = {
#     'TX': 'TAXA',
#     'ABS': 'ABSORVENTE',
#     'ESP': 'ESPARADRAPO',
#     'SAB': 'SABONETE',
#     'DES': 'DESODORANTE',
#     'ESC': 'ESCOVA',
#     'LOC': 'LOCAO',
#     'PERF': 'PERFUME',
#     'TRAD': 'TRADICIONAL',
#     'CURAT': 'CURATIVO',
# }