#### Initial configurations

In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('inputs/data_storeproduct_202208111119.csv', usecols = ['id', 'name'])

In [3]:
df.shape

(820512, 2)

In [4]:
df.drop_duplicates(subset = 'name', inplace = True)

In [5]:
df.shape

(306052, 2)

In [6]:
df['name'] = df['name'].str.upper()

In [7]:
df['clean'] = df['name']

---

### Removes parentheses

In [8]:
df['clean'] = df['clean'].apply(lambda x: re.sub(r'\([^()]*\)', '', x) if x else x)

In [9]:
# # Tested Items
# df.loc[[743024, 270107, 210205, 295409, 295406, 764575, 208980, 205859, 764574, 796994, 744286]]

---

### Removes initial digits

In [10]:
def clean_initial_digits(name):
    try:
        word = name.split()[0]
        digits = sum(l.isdigit() for l in word)
        
        if digits >= len(word) - digits:
            return name.replace(word, '')
        else:
            return name
    except:
        return name

In [11]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_digits(x))

In [12]:
# # Tested Items
# df.loc[[638551, 337465, 337317, 32411, 557546]]

---

### Removes initial characters

In [13]:
def clean_initial_characters(name):
    chars = ['!', '%', '*', '+', ',', '-', '.', '/', '<', '=']
    to_remove = []

    try:
        word = name.split()[0]
        for letter in word:
            if letter in chars:
                to_remove.append(letter)
        
        if len(to_remove) > 0:
            cleaned = ''.join([w for w in word if w not in to_remove])
            name = name.split()[1:]
            name.insert(0, cleaned)
            
            return ' '.join(name)
        else:
            return name
    except:
        return name

In [14]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_characters(x))

In [15]:
# # Tested Items
# df.loc[[481395, 295715, 755477, 178663, 484751, 239320, 28118, 280929, 170862, 106470]]

---

### Removes units

In [16]:
def clean_units(name):
    units = [
        'U', 'G', 'V', 'P', 'L', 'M', 
        'GR', 'KG', 'ML', 'UN', 'CM', 'MG', 'CP', 'MM', 'CM', 'PC', 
        'CPS', 'CPR', 'VOL', 'CAP', 'UND', 'GRS', 
        'CAPS', 'UNID', 'UNDS', 'CPRS', 'COMP', 
        'UNID.', 'SACHES' 
    ]
    to_remove = []
    words = name.split()

    for word in words:
        for unit in units:
            if word == unit:
                to_remove.append(word)
                
                try:
                    previous_word = words[words.index(word) - 1]
                    digits = sum(c.isdigit() for c in previous_word)

                    if digits >= len(previous_word) - digits:
                        to_remove.append(previous_word)
                except:
                    pass

            if word[-len(unit):] == unit:
                qty = word[:-len(unit)]
                digits = sum(c.isdigit() for c in qty)
                
                if digits >= len(qty) - digits:
                    to_remove.append(word)

    return ' '.join([word for word in words if word not in to_remove])

In [17]:
df['clean'] = df['clean'].apply(lambda x: clean_units(x))

In [18]:
# # Tested Items
# df.loc[[
#     363199, 635298, 40689,  740467, 213141, 679374, 288780, 810939, 510231, 648258, 
#     704156, 178001, 413533, 785982, 340359, 211457, 565330, 739959, 491207, 598294, 
#     212840, 585916, 575283, 786004, 545491, 646295, 734862, 353142, 51607,  624800, 
#     256973, 713142, 536849, 165346, 214062, 685358, 520570, 120444, 685071, 712342, 
#     484879, 685358, 765239, 658096, 729729, 545046, 507365, 444408, 29568,  168168, 
#     220269, 238971, 487450, 726061, 696019, 122934, 649598, 350363
# ]]

---

### Removes abbreviations

In [19]:
def clean_abbrs(name):
    abbrs = ['REF', 'REF.', 'C/', 'S/', 'P/', 'FPS']
    to_remove = []
    words = name.split()

    for word in words:
        if word in abbrs:
            to_remove.append(word)

            try:
                to_remove.append(words[words.index(word) + 1])
            except:
                pass
            
        elif word.startswith(tuple(abbrs)):
            for abbr in abbrs:
                if abbr == 'REF' == word[:3] and word[3] != '.' and not word[len(abbr):][0].isdigit():
                    break
                else:
                    to_remove.append(word)

    return ' '.join([word for word in words if word not in to_remove])

In [20]:
df['clean'] = df['clean'].apply(lambda x: clean_abbrs(x))

In [21]:
# # Tested Items
# df.loc[[
#     507889, 308443, 814522, 708421, 800889, 773057, 671045, 583425, 582548, 700178, 
#     318441, 318442, 214566, 632491, 480583, 733038, 407674, 658417, 310619, 648318
# ]]

---

### Removes initial digits and characters

In [22]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_digits(x))

In [23]:
df['clean'] = df['clean'].apply(lambda x: clean_initial_characters(x))

---

### Removes isolated words

In [24]:
def clean_isolated(name):
    isolated = [
        'CR', 'COMP.', 'COMPR', 'COMPRIMIDOS', 
        'CAP.', 'CAPS.', 'CAPSULA', 'CAPSULAS',
        'CX.', 'CX', 'CAIXA',
        '*', '-'
    ]
    to_remove = []
    words = name.split()

    for word in words:
        if word in isolated:
            to_remove.append(word)

    return ' '.join([word for word in words if word not in to_remove])

In [25]:
df['clean'] = df['clean'].apply(lambda x: clean_isolated(x))

In [26]:
# # Tested Items
# df.loc[[413099, 208413, 291647, 239129, 416768, 504305, 173425, 804124, 239195, 175181, 634522, 211018]]

---

### Sets abbreviations

In [27]:
def set_abbr(name):
    abbrs = {
        'SH.': 'SHAMPOO', 'SH': 'SHAMPOO',
        'ABS.': 'ABSORVENTE', 'ABS': 'ABSORVENTE',
        'SAB.': 'SABONETE', 'SAB': 'SABONETE',
        'LIQ.': 'LIQUIDO', 'LIQ': 'LIQUIDO',
        'DES.': 'DESODORANTE', 'DES': 'DESODORANTE',
        'ESC.': 'ESCOVA', 'ESC': 'ESCOVA',
        'LOC.': 'LOCAO', 'LOC': 'LOCAO',
        'HID.': 'HIDRATANTE', 'HID': 'HIDRATANTE', 
        'ESM.': 'ESMALTE', 'ESM': 'ESMALTE',
        'PERF.': 'PERFUME', 'PERF': 'PERFUME',
        'TRAD.': 'TRADICIONAL', 'TRAD': 'TRADICIONAL',
        'COND.': 'CONDICIONADOR',
        'COND': 'CONDICIONADOR',
        'HIDRAT.': 'HIDRATANTE', 'HIDRAT': 'HIDRATANTE',
    }
    words = name.split()

    for word in words:
        if word in abbrs:
            words[words.index(word)] = words[words.index(word)].replace(word, abbrs[word])

    return ' '.join([word for word in words])

In [28]:
df['clean'] = df['clean'].apply(lambda x: set_abbr(x))

In [37]:
# # Tested Items
# df.loc[[
#     648322, 684625, 416792, 66392,  685360, 370427, 607499, 548937, 
#     295447, 31803,  638638, 698653, 60724,  550665, 654800, 680490, 
#     652319, 703964, 525718, 31137,  675719, 639881, 667164, 312965, 666810
# ]]

---

### Creates a CSV file with cleaned names

In [30]:
df['clean'] = df['clean'].str.strip()

In [31]:
df['had_changes'] = df['clean'] != df['name']

In [32]:
df['num_words'] = df['clean'].apply(lambda x: len(str(x).split()))

In [33]:
df.sort_values(by = 'clean', inplace = True)

In [34]:
df.to_csv('outputs/clean.csv', index = False)

---

### Metrics

In [35]:
print(f"Items\n{'-' * len('Items')}")
print(f"Total: {df.shape[0]}")
print(f"Changed: {len(df[df['had_changes'] == True])}")
print(f"Not Changed: {len(df[df['had_changes'] == False])}")

# TOTAL     CHANGED     NOT CHANGED
# 306.052   36.829      269.223
# 306.052   91.447      214.605
# 306.052   232.885     73.167
# 306.052   234.920     71.132
# 306.052   243.443     62.609
# 306.052   251.527     54.525

Items
-----
Total: 306052
Changed: 251527
Not Changed: 54525


In [36]:
print(f"Unique Items\n{'-' * len('Unique Items')}")
print(f"Original dataset: {df['name'].nunique()}")
print(f"Cleaned dataset: {df['clean'].nunique()}")
print(f"Unified items: {df['name'].nunique() - df['clean'].nunique()}")

# ORIGINAL      CLEANED     UNIFIED
# 306.051       303.710     2.341
# 306.051       297.114     8.937
# 306.051       259.696     46.355
# 306.051       256.832     49.219
# 306.051       249.716     56.335
# 306.051       247.646     58.405

Unique Items
------------
Original dataset: 306051
Cleaned dataset: 247646
Unified items: 58405
