In [1]:
import pandas as pd
import numpy as np
import toolkit
import nltk
import os

from textblob import Word
from IPython.core.display import HTML

display(HTML("<style>pre { white-space: pre !important; }</style>"))
toolkit.set_default_pandas_options()

In [2]:
dtypes = {'Name': str, 'Brand': str, 'Description': str, 'Notes': str, 'Image URL': str}
df = pd.read_csv('final_perfume_data.csv', dtype=dtypes,encoding="ISO-8859-1")
df = df.drop(['Image URL'], axis=1)
df

Unnamed: 0,Name,Brand,Description,Notes
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks"
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu..."
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor..."
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat..."
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s..."
...,...,...,...,...
2186,Perfect Veil Eau de Parfum,Sarah Horowitz Parfums,"This was created to smell like clean, naked s...","top: lemon, bergamot base: musk, vanilla, san..."
2187,Scent Eau de Parfum,Costume National,Scent is Costume Nationals debut fragrance. W...,"amber, jasmine tea, mother of pearl hibiscus ..."
2188,Bronze Eau de Parfum,Nanadebary,"Bronze is a naughty little vixen of a scent, ...","bergamot, mandarine, cinnamon, jasmine, iris,..."
2189,Monyette Paris Fragrance Oil,Monyette Paris,"This isn't just sexy, it's big night out se...","Tahitian gardenia, French muguet du bois, hin..."


In [3]:
print(toolkit.data_quality_report(df))

                                 Name                   Brand                                        Description                                              Notes
count                            2191                    2191                                               2191                                               2111
unique                           2184                     249                                               2167                                               2053
top             Black Eau de Toilette  TOM FORD Private Blend   Every once in a while we encounter a fragranc...   Bergamot, lemon, neroli, african marigold, bu...
freq                                2                      39                                                  2                                                  3
Data Type                      object                  object                                             object                                             object
Missing Values  

In [4]:
#there are 80 rows with missing notes
df = df.replace(r'^(\s?)+$', np.nan, regex=True)
df = df.dropna().reset_index(drop=True)
print(toolkit.data_quality_report(df))

                                          Name                   Brand                                        Description                                              Notes
count                                     2111                    2111                                               2111                                               2111
unique                                    2104                     244                                               2087                                               2053
top             New York Intense Eau de Parfum  TOM FORD Private Blend   Everyone knows someone like Miss Marisa. Pret...   Bergamot, lemon, neroli, african marigold, bu...
freq                                         2                      36                                                  2                                                  3
Data Type                               object                  object                                             object              

In [5]:
#lowercase perfume name
df['Name'] = df['Name'].str.lower()

#remove those non-perfume products like oil and hair lotion
df = df[~df['Name'].str.contains('oil')]
df = df[~df['Name'].str.contains('hair')]

print(toolkit.data_quality_report(df))

                                Name         Brand                                        Description                                              Notes
count                           1836          1836                                               1836                                               1836
unique                          1830           223                                               1833                                               1826
top             london eau de parfum  Serge Lutens   Dedicated to the cradle of the great civiliza...   Air note, orange absolute, bergamot, juniper,...
freq                               2            36                                                  2                                                  2
Data Type                     object        object                                             object                                             object
Missing Values                     0             0                                

In [6]:
# Select duplicate rows using Name except first occurrence
duplicateRowsDF = df[df.duplicated(['Name'])]
duplicateRowsDF

Unnamed: 0,Name,Brand,Description,Notes
888,hindu kush eau de parfum,Mancera,"Rugged, imposing, and central to military his...","Saffron, cumin, cloves, black pepper, labdanu..."
1154,london eau de parfum,Gallivant,London- a massive city that sometimes feels l...,"Cucumber, violet leaves, rose de mai, rose oi..."
1518,new york intense eau de parfum,PARFUMS DE NICOLAI,A classic from Nicolai now available in INTEN...,"Bergamot, Sicilian lemon, cloves, thyme, cinn..."
1576,rose oud eau de parfum,PARFUMS DE NICOLAI,Rose Oud opens with a lush and spicy raspberr...,"Raspberry, Davana, Osmanthus, Rose, Lily of t..."
1775,dark eau de parfum,Andrea Maack Parfums,Dark is heavily oxidized rose mixed with viol...,"Yellow mandarin, pink berries, petitgrain, le..."
2009,gardenia eau de parfum,Isabey,There are moments in a woman's life when the ...,"tangerine bark, ylang-ylang, orange flowers, ..."


In [7]:
#remove duplicated content based on the same product names
df = df.drop([888,1154,1518,1576,1775,2009], axis=0).reset_index(drop=True)
print(toolkit.data_quality_report(df))

                                  Name         Brand                                        Description                                              Notes
count                             1830          1830                                               1830                                               1830
unique                            1830           223                                               1827                                               1820
top             miksado parfum extrait  Serge Lutens   Dedicated to the cradle of the great civiliza...   Grapefruit, Citrus, Tangerine, Eucalyptus, Nu...
freq                                 1            36                                                  2                                                  2
Data Type                       object        object                                             object                                             object
Missing Values                       0             0                  

In [8]:
#exclude "no" and "not" and processing the normalization
stopword_exclusion_list = ['no', 'not']
toolkit.exclude_stopwords(stopword_exclusion_list)

cleaned_df = toolkit.normalize_corpus(df, 'Description', 'Cleaned_Description', html_stripping=True,
                        accented_char_removal=True, contraction_expansion=True,
                        text_lower_case=True, extra_newlines_removal=True, extra_whitespace_removal=True,
                        special_char_removal=True, remove_digits=False, repeating_char_removal=False,
                        spelling_correction=False, lemmatize=True, stop_word_removal=True)


In [9]:
print(toolkit.data_quality_report(cleaned_df))

                                  Name         Brand                                        Description                                              Notes                                Cleaned_Description
count                             1830          1830                                               1830                                               1830                                               1830
unique                            1830           223                                               1827                                               1820                                               1827
top             miksado parfum extrait  Serge Lutens   Dedicated to the cradle of the great civiliza...   Grapefruit, Citrus, Tangerine, Eucalyptus, Nu...  capri perfume par excellence mediterranean not...
freq                                 1            36                                                  2                                                  2                      

In [10]:
#removing empty cleaned descriptions, if any
cleaned_df['Cleaned_Description'].replace(' ', np.nan, inplace=True)
cleaned_df.dropna(subset=['Cleaned_Description'], inplace=True)
print(toolkit.data_quality_report(cleaned_df))

                                  Name         Brand                                        Description                                              Notes                                Cleaned_Description
count                             1830          1830                                               1830                                               1830                                               1830
unique                            1830           223                                               1827                                               1820                                               1827
top             miksado parfum extrait  Serge Lutens   Dedicated to the cradle of the great civiliza...   Grapefruit, Citrus, Tangerine, Eucalyptus, Nu...  capri perfume par excellence mediterranean not...
freq                                 1            36                                                  2                                                  2                      

In [11]:
cleaned_df.to_csv('cleaned_perfume_data.csv',index=False)