In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('inputs/data_storeproduct_202208111119.csv', usecols = ['id', 'name'])

In [3]:
df.shape

(820512, 2)

In [4]:
df.drop_duplicates(subset = 'name', inplace = True)

In [5]:
df.shape

(306052, 2)

In [6]:
df['name'] = df['name'].str.upper()

In [7]:
df['clean'] = df['name']

---

### Removes specific characters at the beginning of product names

In [8]:
chars = ['!', '%', '*', '+', ',', '-', '.']

In [9]:
def clean(name):
    for word in name.split()[0]:
        for letter in word:
            if letter in chars:
                name = name.replace(letter, '')

    return name

In [10]:
df['clean'] = df['clean'].apply(lambda x: clean(x))

In [39]:
# # Tested Items
# df.loc[[481395, 295715, 755477, 178663, 484751, 239320, 28118, 280929, 170862]]

---

### Removes product names starting with parentheses

In [12]:
df['clean'] = df['clean'].apply(lambda x: re.sub(r'\([^()]*\)', '', x))

In [40]:
# # Tested Items
# df.loc[[743024, 270107, 210205, 295409, 295406, 764575, 208980, 205859, 764574, 796994, 744286]]

---

### Creates a CSV file with cleaned names

In [14]:
df['clean'] = df['clean'].str.strip()

In [15]:
df['had_changes'] = df['clean'] != df['name']

In [16]:
df['num_words'] = df['clean'].apply(lambda x: len(str(x).split()))

In [17]:
df.sort_values(by = 'clean', inplace = True)

In [18]:
df.to_csv('outputs/clean.csv', index = False)

---

### Metrics

In [19]:
print(f"Items\n{'-' * len('Items')}")
print(f"Total: {df.shape[0]}")
print(f"With changes: {len(df[df['had_changes'] == True])}")
print(f"Without changes: {len(df[df['had_changes'] == False])}")

Items
-----
Total: 306052
With changes: 36132
Without changes: 269920


In [20]:
print(f"Unique Items\n{'-' * len('Unique Items')}")
print(f"Original dataset: {df['name'].nunique()}")
print(f"Cleaned dataset: {df['clean'].nunique()}")
print(f"Unified items: {df['name'].nunique() - df['clean'].nunique()}")

Unique Items
------------
Original dataset: 306051
Cleaned dataset: 303880
Unified items: 2171


---

In [21]:
# df[df['id'].isin([408771, 269043, 358945, 444940, 194221, 305712, 194611, 194209, 319551, 193344, 194230])]

In [22]:
# df['clean'] = \
#     df['clean'].apply(
#         lambda words: 
#             ' '.join(
#             [word for word in words.split() if not (word.startswith('(') or word.endswith(')'))]
#         )
#     )

In [23]:
# def remove_pairs(name, pattern):
#     words = name.split()

#     for k, v in enumerate(words):
#         if v == pattern:
#             try:
#                 words.remove(words[k + 1])
#                 words.remove(v)
#             except:
#                 pass
                
#     return ' '.join(words)

In [24]:
# patterns = ['REF', 'FPS', 'C/', 'S/', 'CP']

In [25]:
# for i in patterns:
#     df['clean'] = df['clean'].apply(lambda x: remove_pairs(x, i))

In [26]:
# def remove_metrics(name):
#     to_remove = []
    
#     for word in name.split():
#         if word[0].isdigit() and 'X' in word and word[-1] == 'M' :
#             to_remove.append(word)

#     if len(to_remove) > 0:
#         return ''. join([name.replace(x, '') for x in to_remove])
    
#     return name

In [27]:
# df['clean'] = df['clean'].apply(lambda x: remove_metrics(x))

In [28]:
# def remove_startswith(pattern):
#     return lambda words: ' '.join([word for word in words.split() if not word.startswith(pattern)])

In [29]:
# patterns = ['C/', 'S/', 'FPS']

In [30]:
# for i in patterns:
#     df['clean'] = df['clean'].apply(remove_startswith(i))

In [31]:
# def remove_endswith(pattern):
#     return lambda words: words.replace(''.join([x for x in words.split() if x.endswith(pattern) and x[:-len(pattern)].isdigit()]), '')

In [32]:
# endswith_ = ['G', 'L', 'ML', 'MG' , 'UN', 'GR', 'CP', 'CPS', 'VOL', 'CAP', 'UNID', 'UNID.', 'KG']

In [33]:
# for i in endswith_:
#     df['clean'] = df['clean'].apply(remove_endswith(i))

In [34]:
# df.loc[[
#     10, 1371, 1526, 177283, 2252, # remove_pairs()
#     3542, 210239, # remove_metrics() 
#     12, 790, 2018, 208178, # remove_startswith()
#     8, 65, 3810, 4262, 29601, 54775, 210436, 211906, 213106, 816883, 34, # remove_endswith()
#     820265, 295406 # lambda x: x.startswith('(') and x.endswith('(') 
# ]]

---

In [35]:
# df[
#     df['name'].str.endswith('CP') 
#     # & df['name'].str.endswith(')')
# ]

In [36]:
# Tratar:
# METRONIDAZOL  GEL VAG BG 50G+10APLIC (NOVA QUIMICA)
# 38-952 KIT PROMO MILENA SH+CON+CR GRATIS (PADRÃO DE [9-9], AVALIAR CASOS)
# DESALEX 0,5MG XP

In [37]:
# Unique
# unique_ = ['COMP', 'CAPS', 'CAPS.', 'CX', 'CX.', 'INJ', 'UN', 'ML', 'PET', 'C', 'GFA']

In [38]:
# dict_ = {
#     'TX': 'TAXA',
#     'ABS': 'ABSORVENTE',
#     'ESP': 'ESPARADRAPO',
#     'SAB': 'SABONETE',
#     'DES': 'DESODORANTE',
#     'ESC': 'ESCOVA',
#     'LOC': 'LOCAO',
#     'TRAD': 'TRADICIONAL',
#     'CURAT': 'CURATIVO',
# }