In [116]:
# Dependencies
import pandas as pd

In [117]:
# Load csv file
perfume_dataset = pd.read_csv("perfume.csv")

In [118]:
# Let us look at the dataset
perfume_dataset.head(4)

Unnamed: 0,brand,title,date,accords,rating_score,votes,longevity_poor,longevity_weak,longevity_moderate,longevity_long,...,notes_12,notes_13,notes_14,notes_15,notes_16,notes_17,notes_18,notes_19,notes_20,gender
0,The-Spirit-of-Dubai,Aamal The Spirit of Dubai for women and men,2017,"woody,earthy,animalic,amber,musky,balsamic",5.0,3,0,0,0,0,...,Base3Moss,Base4Agarwood (Oud),Base5Indian Oud,,,,,,,women
1,Ajmal,Aatifa Ajmal for women and men,2014,"fresh spicy,woody,musky,rose,amber",4.2,10,1,0,0,0,...,,,,,,,,,,women
2,Al-Jazeera-Perfumes,AA Al-Jazeera Perfumes for women and men,0,"rose,woody,musky,oud,fruity",0.0,0,0,0,0,0,...,,,,,,,,,,women
3,Art-of-Scent-Swiss-Perfumes,aarewasser Art of Scent - Swiss Perfumes for w...,2010,"white floral,green,ozonic,fresh,animalic",0.0,1,0,0,0,0,...,,,,,,,,,,women


In [119]:
# We want to only keep the rows that are of interest in a smaller dataframe. In this case they are
# brand, title, date, accords, rating_score, votes, clswinter, clsspring, clssummer, clsautumn, clsday, clsnight, and gender
frag_reduced = perfume_dataset[["brand","title", "date", "accords", "rating_score", "votes", "clswinter",
                                "clsspring", "clssummer", "clsautumn", "clsday", "clsnight", "gender"]]
frag_reduced.head()

Unnamed: 0,brand,title,date,accords,rating_score,votes,clswinter,clsspring,clssummer,clsautumn,clsday,clsnight,gender
0,The-Spirit-of-Dubai,Aamal The Spirit of Dubai for women and men,2017,"woody,earthy,animalic,amber,musky,balsamic",5.0,3,50,50,50,50,50,50,women
1,Ajmal,Aatifa Ajmal for women and men,2014,"fresh spicy,woody,musky,rose,amber",4.2,10,100,60,60,60,80,80,women
2,Al-Jazeera-Perfumes,AA Al-Jazeera Perfumes for women and men,0,"rose,woody,musky,oud,fruity",0.0,0,1,1,1,1,1,1,women
3,Art-of-Scent-Swiss-Perfumes,aarewasser Art of Scent - Swiss Perfumes for w...,2010,"white floral,green,ozonic,fresh,animalic",0.0,1,1,100,100,100,100,100,women
4,Hamidi-Oud-Perfumes,Aaliyah Hamidi Oud & Perfumes for women and men,0,"woody,warm spicy,amber,balsamic,musky",0.0,2,100,100,100,100,100,100,women


In [120]:
# First we would like to see how many fragrances there are for each gender listed
fragrance_count = frag_reduced["gender"].value_counts()
fragrance_count

women     39761
man         850
unisex      122
Name: gender, dtype: int64

In [121]:
# let us create a dataframe for these three gender counts
women_count = fragrance_count["women"]
man_count = fragrance_count["man"]
unisex = fragrance_count["unisex"]
Fragrance_Gender_Counts = pd.DataFrame({"Total Women Fragrances": [women_count],
              "Total Men Fragrances": [man_count],
             "Total Unisex Fragrances": [unisex]
             })
Fragrance_Gender_Counts

Unnamed: 0,Total Women Fragrances,Total Men Fragrances,Total Unisex Fragrances
0,39761,850,122


In [122]:
# Now, we will do score analysis, and see what are the averages given for each category
total_rating_avg = frag_reduced["rating_score"].mean()
total_votes_avg = frag_reduced["votes"].mean()
total_winter_avg = frag_reduced["clswinter"].mean()
total_spring_avg = frag_reduced["clsspring"].mean()
total_summer_avg = frag_reduced["clssummer"].mean()
total_fall_avg = frag_reduced["clsautumn"].mean()
total_day_avg = frag_reduced["clsday"].mean()
total_night_avg = frag_reduced["clsnight"].mean()

In [139]:
# we will put the average results in a dataframe
Ratings_Averages = pd.DataFrame({"Total Votes Average": [total_votes_avg],
              "Total Rating Average (0-5)": [total_rating_avg],
              "Total Rating Winter Average (0-100)": [total_winter_avg],
              "Total Rating Spring Average (0-100)": [total_spring_avg],
              "Total Rating Summer Average (0-100)": [total_summer_avg],\
              "Total Rating Fall Average (0-100)": [total_fall_avg],
              "Total Rating Day Average (0-100)": [total_day_avg],
              "Total Rating Night Average (0-100)": [total_night_avg]
}).round(2)
Ratings_Averages

Unnamed: 0,Total Votes Average,Total Rating Average (0-5),Total Rating Winter Average (0-100),Total Rating Spring Average (0-100),Total Rating Summer Average (0-100),Total Rating Fall Average (0-100),Total Rating Day Average (0-100),Total Rating Night Average (0-100)
0,86.44,3.19,25.36,35.09,27.63,33.19,47.21,31.7


In [157]:
# First we want to see the score distribution overall for fragrances from 0 to 5
# Establish the bins 
score_bins_5 = [0.00,0.90, 1.90, 2.90, 3.90, 999]
group_names_5 = ["<1", "1-1.9", "2-2.9", "3-3.9", "4-5"]

In [158]:
# Now we will apply the segmentation to the distribution
Total_Rating_Distribution = pd.cut(frag_reduced["rating_score"], score_bins_5, labels=group_names_5).value_counts()
Total_Rating_Distribution_DF = pd.DataFrame({"Overall Fragrances With Rating": Total_Rating_Distribution}).sort_index()
Total_Rating_Distribution_DF

Unnamed: 0,Overall Fragrances With Rating
<1,0
1-1.9,871
2-2.9,1742
3-3.9,14878
4-5,24414


In [159]:
frag_reduced["rating_score"].sum()

163488.03

In [156]:
Total_Rating_Distribution_DF["Overall Fragrances With Rating"].sum()

41905

In [126]:
# Now we want to see the score distribution of fragrances by spring, summer, fall, day, and night
# Establish the bins 
score_bins = [0,9.9, 19.9, 29.9, 39.9, 49.9, 59.9, 69.9, 79.9, 89.9, 100]
group_names = ["<10", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89","90-100"]

In [127]:
# Now we will apply the segmentation to the distribution for spring
Total__Spring_Distribution = pd.cut(frag_reduced["clsspring"], score_bins, labels=group_names).value_counts()
Total_Spring_Distribution_DF = pd.DataFrame({"Spring Fragrances With Rating":Total__Spring_Distribution}).sort_index()
Total_Spring_Distribution_DF

Unnamed: 0,Spring Fragrances With Rating
<10,16582
10-19,3628
20-29,5047
30-39,5052
40-49,3802
50-59,5811
60-69,3153
70-79,1384
80-89,891
90-100,5862


In [128]:
# Now we will apply the segmentation to the distribution for summer
Total_Distribution_Summer = pd.cut(frag_reduced["clssummer"], score_bins, labels=group_names).value_counts()
Total_Summer_Distribution_DF = pd.DataFrame({"Summer Fragrances With Rating":Total_Distribution_Summer}).sort_index()
Total_Summer_Distribution_DF

Unnamed: 0,Summer Fragrances With Rating
<10,21913
10-19,4787
20-29,4806
30-39,4083
40-49,2602
50-59,4480
60-69,2507
70-79,1234
80-89,860
90-100,3940


In [129]:
# Now we will apply the segmentation to the distribution for fall
Total_Distribution_Fall = pd.cut(frag_reduced["clsautumn"], score_bins, labels=group_names).value_counts()
Total_Fall_Distribution_DF = pd.DataFrame({"Fall Fragrances With Rating":Total_Distribution_Fall}).sort_index()
Total_Fall_Distribution_DF

Unnamed: 0,Fall Fragrances With Rating
<10,17246
10-19,4486
20-29,5368
30-39,5224
40-49,3523
50-59,5355
60-69,2459
70-79,1031
80-89,642
90-100,5878


In [130]:
# Now we will apply the segmentation to the distribution for winter
Total_Distribution_Winter = pd.cut(frag_reduced["clswinter"], score_bins, labels=group_names).value_counts()
Total_Winter_Distribution_DF = pd.DataFrame({"Winter Fragrances With Rating":Total_Distribution_Winter}).sort_index()
Total_Winter_Distribution_DF

Unnamed: 0,Winter Fragrances With Rating
<10,23967
10-19,4842
20-29,4768
30-39,4074
40-49,2456
50-59,3925
60-69,1753
70-79,715
80-89,438
90-100,4274


In [131]:
# Now we will apply the segmentation to the distribution for Day
Total_Distribution_Day = pd.cut(frag_reduced["clsday"], score_bins, labels=group_names).value_counts()
Total_Day_Distribution_DF = pd.DataFrame({"Day Fragrances With Rating":Total_Distribution_Day}).sort_index()
Total_Day_Distribution_DF

Unnamed: 0,Day Fragrances With Rating
<10,13044
10-19,1223
20-29,3133
30-39,3887
40-49,3575
50-59,6750
60-69,4940
70-79,2870
80-89,2052
90-100,9738


In [135]:
# Now we will apply the segmentation to the distribution for Night
Total_Distribution_Night = pd.cut(frag_reduced["clsday"], score_bins, labels=group_names).value_counts()
Total_Night_Distribution_DF = pd.DataFrame({"Night Fragrances With Rating":Total_Distribution_Night}).sort_index()
Total_Night_Distribution_DF

Unnamed: 0,Night Fragrances With Rating
<10,13044
10-19,1223
20-29,3133
30-39,3887
40-49,3575
50-59,6750
60-69,4940
70-79,2870
80-89,2052
90-100,9738


In [136]:
# Now we will merge all the dataframes for the rating distributions into one with inner join by index
Merge_Day_Night = pd.merge(Total_Night_Distribution_DF, Total_Day_Distribution_DF, left_index=True, right_index=True)
Merge_Winter_Fall = pd.merge(Total_Winter_Distribution_DF, Total_Fall_Distribution_DF, left_index=True, right_index=True)
Merge_Summer_spring = pd.merge(Total_Summer_Distribution_DF, Total_Spring_Distribution_DF, left_index=True, right_index=True)
Merge_Day_Night_Winter_Fall = pd.merge(Merge_Day_Night, Merge_Winter_Fall, left_index=True, right_index=True)
All_Ratings_Merged = pd.merge(Merge_Day_Night_Winter_Fall, Merge_Summer_spring, left_index=True, right_index=True)

In [137]:
All_Ratings_Merged

Unnamed: 0,Night Fragrances With Rating,Day Fragrances With Rating,Winter Fragrances With Rating,Fall Fragrances With Rating,Summer Fragrances With Rating,Spring Fragrances With Rating
<10,13044,13044,23967,17246,21913,16582
10-19,1223,1223,4842,4486,4787,3628
20-29,3133,3133,4768,5368,4806,5047
30-39,3887,3887,4074,5224,4083,5052
40-49,3575,3575,2456,3523,2602,3802
50-59,6750,6750,3925,5355,4480,5811
60-69,4940,4940,1753,2459,2507,3153
70-79,2870,2870,715,1031,1234,1384
80-89,2052,2052,438,642,860,891
90-100,9738,9738,4274,5878,3940,5862
