# Part 1: Importing & Cleaning

In [6]:
import pandas as pd
import numpy as np

### Read Excel File Into DataFrame

In [15]:
# Source workbook and the only columns this notebook uses.
# NOTE(review): absolute, machine-specific path -- confirm it exists before re-running elsewhere.
PERFUME_WORKBOOK = '/Users/aribina/Documents/Projects/Project 5/perfume_database.xlsx'
WANTED_COLUMNS = ['brand', 'perfume', 'notes']

# Load just the wanted columns from the workbook.
df = pd.read_excel(PERFUME_WORKBOOK, usecols=WANTED_COLUMNS)

### Preview The DataFrame

In [10]:
# Bare expression on the cell's last line: the notebook renders the frame
# (pandas elides the middle rows of a long frame).
df

Unnamed: 0,brand,perfume,notes
0,18 21 Man Made,Sweet Tobacco Spirits,"[""Citruses"", ""Saffron"", ""Tonka Bean"", ""Vanilla..."
1,40 Notes Perfume,Cashmere Musk,"[""Sandalwood"", ""Cedar"", ""White Musk"", ""Cashmer..."
2,40 Notes Perfume,Exotic Ylang Ylang,"[""Ylang-Ylang"", ""Gardenia"", ""Musk""]"
3,40 Notes Perfume,Exquisite Amber,"[""Labdanum"", ""Styrax"", ""Benzoin"", ""Vanilla"", ""..."
4,40 Notes Perfume,Oudwood Veil,"[""Kephalis"", ""Agarwood (Oud)""]"
...,...,...,...
37921,Urban Rituelle,Lemongrass Blend,"[""Lemongrass"", ""Myrtle"", ""Grapefruit"", ""Eucaly..."
37922,Urban Rituelle,Peach Blossom,"[""Peach"", ""Honey"", ""Sweet Pea"", ""Mimosa""]"
37923,Urban Rituelle,Pomegranate,"[""Pomegranate"", ""Citruses"", ""Red Berries""]"
37924,Urban Rituelle,Vanilla,"[""Vanilla"", ""Caramel"", ""Milk""]"


### Cleaning The Data

In [16]:
# Pull the notes column out into its own one-column DataFrame for text cleaning.
corpus = df['notes'].to_frame()

In [46]:
# Some perfumes list their notes under top/middle/base headings and some do not,
# so the headings are stripped. Every note is still kept for its perfume; it is
# simply no longer tagged as top, middle, or base.

# Substrings deleted from each notes string, in this order.
items_to_remove = [
    '[',
    ']',
    '"',
    '{',
    '}',
    'middle: ',
    'top: ',
    'base: ',
    'null',
]


def remove_items(text):
    """Return ``text`` with every entry of ``items_to_remove`` deleted.

    Entries are removed one after another, in list order, using
    ``str.replace``; matching is therefore exact and case-sensitive
    (the headings in the list are lowercase).

    Parameters
    ----------
    text : str
        The notes string to clean.

    Returns
    -------
    str
        The input with all listed substrings removed. Separators such as
        ``", "`` are left untouched.
    """
    cleaned = text
    for unwanted in items_to_remove:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [18]:
# Normalise the notes text in one pass:
#   1. cast to str  (missing values become the literal string 'nan'),
#   2. lowercase    (remove_items matches lowercase headings only),
#   3. strip brackets, quotes and top/middle/base headings.
corpus['notes'] = (
    corpus['notes']
    .astype(str)
    .str.lower()
    .apply(remove_items)
)

In [19]:
# Replace the raw notes column with the cleaned text (rows align on the index),
# then preview the first five rows.
df = df.assign(notes=corpus['notes'])
df.head(5)

Unnamed: 0,brand,perfume,notes
0,18 21 Man Made,Sweet Tobacco Spirits,"citruses, saffron, tonka bean, vanilla, exotic..."
1,40 Notes Perfume,Cashmere Musk,"sandalwood, cedar, white musk, cashmere wood"
2,40 Notes Perfume,Exotic Ylang Ylang,"ylang-ylang, gardenia, musk"
3,40 Notes Perfume,Exquisite Amber,"labdanum, styrax, benzoin, vanilla, musk"
4,40 Notes Perfume,Oudwood Veil,"kephalis, agarwood (oud)"


### Checking For Null Values

In [23]:
# Count missing values per column.
# The notes column was cast to str during cleaning, so a missing note is stored
# as the text 'nan' and is NOT counted here.
df.isnull().sum()

brand      0
perfume    3
notes      0
dtype: int64

### Dropping 3 Rows with Nulls in The Perfume Column

In [27]:
# Keep only the rows that have a perfume name.
required_columns = ['perfume']
df = df.dropna(subset=required_columns)

In [29]:
# Summary of the text columns: row count, number of distinct values,
# the most frequent value and how often it occurs.
df.describe()

Unnamed: 0,brand,perfume,notes
count,37923,37923,37923.0
unique,2571,33962,35331.0
top,Avon,Patchouli,
freq,644,39,957.0


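### It seems like we have some suspicious "nan" values in the notes column. Let's investigate further. (Missing notes were turned into the text "nan" when the column was cast to strings during cleaning, which is why the null check above reported 0 for notes.)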

In [34]:
# Select the cleaned notes column.
notes_series = df.loc[:, 'notes']

# Tally how many rows share each exact notes string, most frequent first.
notes_value_counts = notes_series.value_counts(sort=True, ascending=False)

# Show the tally.
print(notes_value_counts)

notes
nan                                                                                   957
floral notes                                                                           66
rose                                                                                   59
agarwood (oud)                                                                         58
lavender                                                                               55
                                                                                     ... 
green notes, floral notes, spicy notes                                                  1
oriental flower notes, spicy notes, green accord                                        1
bergamot, woodsy notes, oak moss, jasmine                                               1
lavender, sage, basil, vetiver, sandalwood, tonka bean, orange, lemon, green notes      1
vanilla, lavender, geranium                                                             1
Name

### Dropping 957 Rows Containing "nan" In The Notes Column, because "nan" is not a note!

In [35]:
# Rows whose notes are the literal text 'nan' carry no note information.
has_real_notes = df['notes'].ne('nan')

# Keep the remaining rows and renumber them 0..n-1, discarding the old index.
df = df.loc[has_real_notes].reset_index(drop=True)

### Let's drop any duplicates of the same perfume

In [42]:
# Keep the first row for each perfume name and discard later repeats.
# NOTE(review): uniqueness is judged on the perfume name alone, so perfumes that
# share a name (presumably across different brands -- confirm) collapse to one row.
df = df.drop_duplicates(subset='perfume', keep='first')

#### Let's check out our clean dataframe

In [43]:
# Display the cleaned frame; the index keeps gaps where duplicate rows were removed.
df

Unnamed: 0,brand,perfume,notes
0,18 21 Man Made,Sweet Tobacco Spirits,"citruses, saffron, tonka bean, vanilla, exotic..."
1,40 Notes Perfume,Cashmere Musk,"sandalwood, cedar, white musk, cashmere wood"
2,40 Notes Perfume,Exotic Ylang Ylang,"ylang-ylang, gardenia, musk"
3,40 Notes Perfume,Exquisite Amber,"labdanum, styrax, benzoin, vanilla, musk"
4,40 Notes Perfume,Oudwood Veil,"kephalis, agarwood (oud)"
...,...,...,...
36957,Urban Rituelle,Caramella,"caramel, coconut milk, cotton candy"
36958,Urban Rituelle,Coconut Lotus,"lotus, musk, coconut"
36960,Urban Rituelle,Island Blossom,"frangipani, wild berries, jasmine"
36961,Urban Rituelle,Lemongrass Blend,"lemongrass, myrtle, grapefruit, eucalyptus"


#### We started out with 37,926 rows and now we have 33,157 rows after cleaning. Thus, a total of 4,769 rows were dropped. 

In [45]:
# Write the cleaned data next to the notebook (relative path), without the index column.
CLEAN_CSV_NAME = 'clean_perfume_data.csv'
df.to_csv(CLEAN_CSV_NAME, index=False)