In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from collections import Counter
from pprint import pprint
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import re

from datetime import datetime, timedelta
import dill
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import mplcyberpunk
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from typing import List, Dict, Callable

# Plot theme selection. The ggplot line is kept as a commented-out alternative.
# plt.style.use('ggplot')
# NOTE(review): the "cyberpunk" style name is presumably made available by the
# mplcyberpunk import above — confirm before removing that import.
plt.style.use("cyberpunk")

# Echo the working directory: the relative paths used later in this notebook
# ('dill/...', 'data/...') resolve against it.
import os
os.getcwd()

'/home/jovyan/work/Capstone_3/Skincare-Recommender'

In [3]:
# Import Custom Modules
from src.helpers import *
from src.scrape_functions import *

In [4]:
# Load the variables saved from the time-consuming scraping session so they can
# be translated into DataFrames below.
# NOTE(review): dill.load_session unpickles arbitrary objects into this
# namespace — only load session files that you produced yourself.
dill.load_session('dill/notebook_env.db')

# The names item_N / item_info_N / user_info_N are not defined in this notebook;
# presumably they are restored by the session load above (TODO confirm).
# Each product's triple is re-bound to url_N / items_N / users_N.

# 1: Drunk Elephant - Protini Polypeptide Cream
url_1, items_1, users_1 = item_1, item_info_1, user_info_1

# 2: TATCHA - The Water Cream
url_2, items_2, users_2 = item_2, item_info_2, user_info_2

# 3: BELIF - The True Cream Aqua Bomb
url_3, items_3, users_3 = item_3, item_info_3, user_info_3

# 4: BOSCIA - Cactus Water Moisturizer
url_4, items_4, users_4 = item_4, item_info_4, user_info_4

# 5: OLEHENRIKSEN - C-Rush™ Vitamin C Gel Moisturizer
url_5, items_5, users_5 = item_5, item_info_5, user_info_5

# 6: CLINIQUE - Dramatically Different Moisturizing Gel
url_6, items_6, users_6 = item_6, item_info_6, user_info_6

# 7: ORIGINS - Ginzing Energy Boosting Gel Moisturizer
url_7, items_7, users_7 = item_7, item_info_7, user_info_7

# 8: JOSIE MARAN - 100 Percent Pure Argan Oil
url_8, items_8, users_8 = item_8, item_info_8, user_info_8

# 9: FIRST AID BEAUTY - Ultra Repair® Cream Intense Hydration
url_9, items_9, users_9 = item_9, item_info_9, user_info_9

# 10: Dr. Jart - Cicapair™ Tiger Grass Color Correcting Treatment
url_10, items_10, users_10 = item_10, item_info_10, user_info_10

# Positional collections: index i of each list refers to product i + 1.
item_urls = [url_1, url_2, url_3, url_4, url_5, url_6, url_7, url_8, url_9, url_10]
item_collection = [items_1, items_2, items_3, items_4, items_5, items_6, items_7, items_8, items_9, items_10]
user_collection = [users_1, users_2, users_3, users_4, users_5, users_6, users_7, users_8, users_9, users_10]

In [5]:
# Listed prices, entered by hand because they were not collected earlier.
# One entry per product, in the same order as item_urls / item_collection.
prices = [
    '$68.00', '$68.00', '$38.00', '$38.00', '$46.00',
    '$28.00', '$30.00', '$49.00', '$42.00', '$52.00',
]

## Create Item DataFrame

In [6]:
# Item Structure: item_url, item_brand, item_name, n_loves, avg_stars, ingredients, skin_target, n_reviews, five_stars, four_stars, three_stars, two_stars, one_stars, review_corpus

In [7]:
# Build the item table: one row per product. Every entry of item_collection is
# a positional record, so each column is pulled out by its index in that record.
item_df = pd.DataFrame({
    'item_url': item_urls,
    **{
        column: [record[position] for record in item_collection]
        for position, column in enumerate((
            'item_brand',
            'item_name',
            'n_loves',
            'avg_stars',
            'ingredient',
            'skin_target',
            'n_reviews',
            'star_dist',
            'review_corp',
        ))
    },
})

# Lookup tables that replace product names / brands with short codes
# ('item_1', 'brand_1', ...), numbered in order of first appearance.
item_names = item_df['item_name'].unique().tolist()
name_map = {name: f'item_{rank}' for rank, name in enumerate(item_names, start=1)}
item_brands = item_df['item_brand'].unique().tolist()
brand_map = {brand: f'brand_{rank}' for rank, brand in enumerate(item_brands, start=1)}

In [8]:
# Attach the coded identifiers: product name -> item_N, brand -> brand_N.
for id_column, label_column, lookup in (
    ('item_id', 'item_name', name_map),
    ('item_brand_id', 'item_brand', brand_map),
):
    item_df[id_column] = item_df[label_column].map(lookup)

In [9]:
# Break the star distribution into one integer column per rating level.
# Entry 0 of star_dist feeds five_stars, entry 4 feeds one_stars; the count is
# the text that follows the first ':' in each entry.
for slot, star_column in enumerate(
    ('five_stars', 'four_stars', 'three_stars', 'two_stars', 'one_stars')
):
    item_df[star_column] = item_df['star_dist'].apply(
        lambda dist, slot=slot: int(dist[slot].partition(':')[2])
    )

# The raw distribution is no longer needed once it has been split out.
item_df.drop(columns=['star_dist'], inplace=True)

In [10]:
# Convert the scraped text counters to numbers: n_loves and n_reviews go through
# the project helper re_max with int; avg_stars becomes a float taken from the
# first whitespace-separated token.
for count_column in ('n_loves', 'n_reviews'):
    item_df[count_column] = item_df[count_column].apply(lambda raw: re_max(raw, int))

item_df['avg_stars'] = item_df['avg_stars'].apply(
    lambda raw: float(raw.split()[0])
)

In [11]:
# Add the price column: keep what follows the '$' sign and store it as a float.
# Values are matched to rows by position.
item_df['price'] = [
    float(label.partition('$')[2])
    for label in prices
]

In [12]:
# Parse and clean skin targets: split the scraped text on spaces, pass each word
# through the project helper re_parse_str, lower-case it, and keep only the
# meaningful tokens.
# Fix: the filter used to read `a or a != 'and'`, which is true for every token,
# so empty strings and the conjunction "and" were never dropped. The check is now
# `and`, applied to the lower-cased token, and the helper runs once per word.
item_df['skin_target'] = item_df['skin_target'].apply(
    lambda raw_targets: [
        token
        for token in (re_parse_str(word).lower() for word in raw_targets.split(' '))
        if token and token != 'and'
    ]
)

In [13]:
# Build the cleaned review corpus per item: each raw review is split into lines
# and handed to the project helper parse_review_string.
item_df['review_corpus'] = item_df['review_corp'].apply(
    lambda corpus: [parse_review_string(entry.split('\n')) for entry in corpus]
)

# Drop the raw column now that the cleaned one exists.
item_df.drop(columns=['review_corp'], inplace=True)

In [14]:
# Manual ingredient overrides, applied before parsing. Rows are addressed by
# their 0-based index label.
# NOTE(review): the original comment called row 4 the Clinique product, but in
# the item table displayed in this notebook index 4 is OLEHENRIKSEN and CLINIQUE
# is index 5 — confirm which row this override is meant for.
# The literal is a raw string because the ingredient names use backslashes as
# separators ("Water\Aqua\Eau"); in a normal literal "\A", "\E", "\P" and "\H"
# are invalid escape sequences and trigger a warning. The resulting value is
# unchanged.
item_df.at[4, 'ingredient'] = r'''Water\Aqua\Eau, Mineral Oil\Paraffinum Liquidum\Huile Minérale, Glycerin , Petrolatum , Stearic Acid , Glyceryl Stearate , Sesamum Indicum (Sesame) Oil, Urea, Lanolin Alcohol, Triethanolamine, Hordeum Vulgare (Barley) Extract\Extrait D'Orge, Cucumis Sativus (Cucumber) Fruit Extract, Helianthus Annuus (Sunflower) Seedcake, Propylene Glycol Dicaprate, Sodium Hyaluronate, Butylene Glycol, Pentylene Glycol, Trisodium Edta, Phenoxyethanol'''
# Row 7 is the Josie Maran argan oil in the displayed item table.
item_df.at[7, 'ingredient'] = 'argan oil'

# Parse ingredients: split on commas, trim whitespace, lower-case each name.
item_df['ingredients'] = item_df['ingredient'].apply(
    lambda listing: [name.strip().lower() for name in listing.split(',')]
)
# Post-process every ingredient list with the project helper clean_ingredients.
item_df['ingredients'] = item_df['ingredients'].apply(clean_ingredients)
item_df.drop(columns=['ingredient'], inplace=True)

# item_df['ingredients'][0]

In [15]:
# One 0/1 indicator column per skin type: 1 when the type appears in the item's
# skin_target list, otherwise 0.
for flag_name in ('normal', 'dry', 'combination', 'oily', 'sensitive'):
    item_df[flag_name] = item_df['skin_target'].apply(
        lambda targets, flag_name=flag_name: int(flag_name in targets)
    )

# The token list itself is not kept once it has been encoded.
item_df.drop(columns=['skin_target'], inplace=True)

In [16]:
# Preview the first rows of the item table after cleaning and encoding.
item_df.head()

Unnamed: 0,item_url,item_brand,item_name,n_loves,avg_stars,n_reviews,item_id,item_brand_id,five_stars,four_stars,...,two_stars,one_stars,price,review_corpus,ingredients,normal,dry,combination,oily,sensitive
0,https://www.sephora.com/product/protini-tm-pol...,Drunk Elephant,Protini™ Polypeptide Moisturizer,259473,4.0,4395,item_1,brand_1,2557,540,...,405,491,68.0,"[Soooo at first when I started using this, it ...","[1, 2-hexanediol, acetyl glutamine, alanine, a...",1,1,1,1,0
1,https://www.sephora.com/product/the-water-crea...,Tatcha,The Water Cream,233187,4.0,3030,item_2,brand_2,1817,392,...,232,282,68.0,[The gold spatula changed my life - I can't be...,"[alcohol, belamcanda chinensis root extract, b...",1,0,1,1,1
2,https://www.sephora.com/product/the-true-cream...,belif,The True Cream Aqua Bomb,191092,4.5,4527,item_3,brand_3,3276,665,...,173,167,38.0,[Completely wrong for combo/oily skin. This ca...,"[1, 2-hexanediol, acrylates, alchemilla vulgar...",1,1,1,1,0
3,https://www.sephora.com/product/cactus-water-m...,boscia,Cactus Water Moisturizer,26641,4.0,2221,item_4,brand_4,1263,538,...,108,134,38.0,[The BEST moisturizer if you’re trying to impr...,"[1, 2-hexanediol, aloe barbadensis flower extr...",1,0,1,1,0
4,https://www.sephora.com/product/dramatically-d...,OLEHENRIKSEN,C-Rush™ Vitamin C Gel Moisturizer,80124,4.5,2619,item_5,brand_5,1721,565,...,85,97,46.0,[It has a great orangey smell and it did hydra...,"[butylene glycol, cucumis sativus (cucumber) f...",1,1,1,1,1


## Create User DataFrame

In [17]:
# User Structure: item_url, item_brand, item_name, user_id, user_meta, rating, review
# Show the first scraped review record of the first product to check its layout.
user_collection[0][0] # id, meta, rating, review, brand, item_name

['harleen253',
 'Eye Color Brown\nHair color Black\nSkin Tone Medium\nSkin Type Oily',
 '5 stars',
 '3 h ago\nThis moisturizer has worked really well for my skin! I have combo skin and it doesn’t make me greasy. It keeps my skin plump and moisturized. I’ve gone through three of these now and how no intention of using any other moisturizer!\nRecommends this product',
 'Drunk Elephant',
 'Protini™ Polypeptide Moisturizer']

In [18]:
# Build the user/review table: flatten every product's review records into one
# frame, taking each column from its fixed position in the record.
user_df = pd.DataFrame({
    column: [record[position] for product in user_collection for record in product]
    for position, column in enumerate((
        'user_id',
        'user_meta',
        'star_rating',
        'review',
        'item_brand',
        'item_name',
    ))
})

In [19]:
# Tidy the user table: numeric ratings plus the same item / brand codes that
# item_df uses.

# Ratings that are already plain ints pass through untouched; everything else
# (e.g. '5 stars') is converted by the project helper re_max.
user_df['star_rating'] = user_df['star_rating'].apply(
    lambda rating: rating if type(rating) == int else re_max(rating, int)
)

# Reuse the lookup tables built from item_df so both tables share identifiers.
for id_column, label_column, lookup in (
    ('item_id', 'item_name', name_map),
    ('item_brand_id', 'item_brand', brand_map),
):
    user_df[id_column] = user_df[label_column].map(lookup)


In [20]:
# Replace the raw profile text with the result of the project helper
# parse_user_meta; the following cell reads it with .get(), so a dict-like
# value is expected.
user_df['user_meta'] = user_df['user_meta'].apply(
    lambda meta_text: parse_user_meta(meta_text)
)

In [21]:
# Spread the parsed profile into one column per attribute. Keys absent from a
# user's profile yield None.
for column, meta_key in (
    ('age', 'age'),
    ('eye_color', 'eye color'),
    ('hair_color', 'hair color'),
    ('skin_tone', 'skin tone'),
    ('skin_type', 'skin type'),
):
    user_df[column] = user_df['user_meta'].apply(
        lambda meta, meta_key=meta_key: meta.get(meta_key)
    )

# The parsed profile itself is dropped once it has been expanded.
user_df.drop(columns=['user_meta'], inplace=True)

In [22]:
# Timestamp feature: the first line of the raw review text carries the posting
# time (e.g. '3 h ago'); the project helper create_timestamp converts it.
user_df['timestamp'] = user_df['review'].apply(
    lambda review_text: create_timestamp(review_text.split('\n')[0])
)

In [23]:
# 'verified' feature: 1 when the first line of the raw review text mentions
# "verified" (case-insensitive), otherwise 0.
user_df['verified'] = user_df['review'].apply(
    lambda review_text: int('verified' in review_text.partition('\n')[0].lower())
)

In [24]:
# Replace each raw review with its cleaned form: the text is split into lines
# and handed to the project helper parse_review_string.
reviews = user_df['review'].apply(
    lambda review_text: parse_review_string(review_text.split('\n'))
)
user_df['review'] = reviews

In [25]:
# Swap the scraped user names for integer codes, numbered from 0 in order of
# first appearance.
user_id = user_df['user_id'].unique()
id_remap = {original: code for code, original in enumerate(user_id)}

user_df['user_id'] = user_df['user_id'].map(id_remap)

In [26]:
# Binary sentiment label: ratings of 4 stars or more count as positive (1),
# anything lower as negative (0).
user_df['is_positive'] = user_df['star_rating'].apply(
    lambda stars: int(stars >= 4)
)

## Chemical Effects Dataset

In [27]:
# Import Chemical Data
chemicals = pd.read_csv('data/chemicals_effect.csv', sep=',', encoding='latin1')
# Lower-case the chemical names, presumably so they line up with the lower-cased
# ingredient names in item_df (TODO confirm against set_map_effects).
# The previous encode('utf8').decode('utf8') round trip returned every string
# unchanged, so it has been removed; .str.lower() also leaves missing values as
# NaN instead of raising.
# NOTE(review): if the CSV is actually UTF-8, reading it as latin1 garbles
# non-ASCII characters — verify the file's real encoding.
chemicals['chemicals'] = chemicals['chemicals'].str.lower()

In [28]:
# Rename the chemical feature labels to short effect names.
# NOTE(review): the pairing is positional — the i-th distinct label, in order of
# first appearance in the CSV, gets the i-th name below. Verify the order if the
# source file changes.
features = chemicals['feature'].unique().tolist()
remap = [
    'moisture',
    'antioxidant',
    'soothing',
    'irritancy',
    'brightening',
    'viscosity',
    'cleaning',
    'fragrance',
]
remap_effect = {old_label: new_label for old_label, new_label in zip(features, remap)}
chemicals['feature'] = chemicals['feature'].map(remap_effect)

In [29]:
# Effect points per item.

# Start every effect column at zero.
for effect_column in remap:
    item_df[effect_column] = 0

# Hand both tables to the project helper set_map_effects.
# NOTE(review): its return value is ignored, so it presumably fills the effect
# columns of item_df in place — confirm in src.
set_map_effects(item_df, chemicals)


In [30]:
# Summarise user_df: column names, non-null counts and dtypes.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   user_id        20000 non-null  int64         
 1   star_rating    20000 non-null  int64         
 2   review         20000 non-null  object        
 3   item_brand     20000 non-null  object        
 4   item_name      20000 non-null  object        
 5   item_id        20000 non-null  object        
 6   item_brand_id  20000 non-null  object        
 7   age            14432 non-null  object        
 8   eye_color      14432 non-null  object        
 9   hair_color     14432 non-null  object        
 10  skin_tone      14432 non-null  object        
 11  skin_type      14432 non-null  object        
 12  timestamp      20000 non-null  datetime64[ns]
 13  verified       20000 non-null  int64         
 14  is_positive    20000 non-null  int64         
dtypes: datetime64[ns](1

In [31]:
# Preview the first rows of the cleaned user/review table.
user_df.head()

Unnamed: 0,user_id,star_rating,review,item_brand,item_name,item_id,item_brand_id,age,eye_color,hair_color,skin_tone,skin_type,timestamp,verified,is_positive
0,0,5,This moisturizer has worked really well for my...,Drunk Elephant,Protini™ Polypeptide Moisturizer,item_1,brand_1,,brown,black,medium,oily,2020-06-04 21:00:00,0,1
1,1,2,"Soooo at first when I started using this, it b...",Drunk Elephant,Protini™ Polypeptide Moisturizer,item_1,brand_1,,hazel,brunette,medium,dry,2020-06-04 10:00:00,0,0
2,2,4,This is my go-to moisturizer. I have dry skin ...,Drunk Elephant,Protini™ Polypeptide Moisturizer,item_1,brand_1,,brown,black,medium,combination,2020-06-04 09:00:00,0,1
3,3,2,I got this as part of the birthday gift and us...,Drunk Elephant,Protini™ Polypeptide Moisturizer,item_1,brand_1,,,,,,2020-06-04 09:00:00,0,0
4,4,2,Alright. I bought this because I loved the tri...,Drunk Elephant,Protini™ Polypeptide Moisturizer,item_1,brand_1,,blue,blonde,fair,normal,2020-06-04 00:00:00,0,0


In [32]:
# Summarise item_df: column names, non-null counts and dtypes.
item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   item_url       10 non-null     object 
 1   item_brand     10 non-null     object 
 2   item_name      10 non-null     object 
 3   n_loves        10 non-null     int64  
 4   avg_stars      10 non-null     float64
 5   n_reviews      10 non-null     int64  
 6   item_id        10 non-null     object 
 7   item_brand_id  10 non-null     object 
 8   five_stars     10 non-null     int64  
 9   four_stars     10 non-null     int64  
 10  three_stars    10 non-null     int64  
 11  two_stars      10 non-null     int64  
 12  one_stars      10 non-null     int64  
 13  price          10 non-null     float64
 14  review_corpus  10 non-null     object 
 15  ingredients    10 non-null     object 
 16  normal         10 non-null     int64  
 17  dry            10 non-null     int64  
 18  combination  

In [33]:
# Display the whole item table, including the effect columns.
item_df

Unnamed: 0,item_url,item_brand,item_name,n_loves,avg_stars,n_reviews,item_id,item_brand_id,five_stars,four_stars,...,oily,sensitive,moisture,antioxidant,soothing,irritancy,brightening,viscosity,cleaning,fragrance
0,https://www.sephora.com/product/protini-tm-pol...,Drunk Elephant,Protini™ Polypeptide Moisturizer,259473,4.0,4395,item_1,brand_1,2557,540,...,1,0,9,2,0,1,1,1,2,0
1,https://www.sephora.com/product/the-water-crea...,Tatcha,The Water Cream,233187,4.0,3030,item_2,brand_2,1817,392,...,1,1,7,2,0,1,1,2,0,0
2,https://www.sephora.com/product/the-true-cream...,belif,The True Cream Aqua Bomb,191092,4.5,4527,item_3,brand_3,3276,665,...,1,0,5,1,0,0,0,5,1,2
3,https://www.sephora.com/product/cactus-water-m...,boscia,Cactus Water Moisturizer,26641,4.0,2221,item_4,brand_4,1263,538,...,1,0,3,3,0,0,1,1,0,0
4,https://www.sephora.com/product/dramatically-d...,OLEHENRIKSEN,C-Rush™ Vitamin C Gel Moisturizer,80124,4.5,2619,item_5,brand_5,1721,565,...,1,1,4,0,0,0,0,2,0,0
5,https://www.sephora.com/product/dramatically-d...,CLINIQUE,Dramatically Different Moisturizing Gel,84638,4.5,3787,item_6,brand_6,2624,661,...,1,0,4,2,0,0,0,2,0,0
6,https://www.sephora.com/product/ginzing-energy...,Origins,GinZing™ Oil- Free Energy Boosting Gel Moistur...,14573,4.5,3294,item_7,brand_7,2058,724,...,1,0,8,2,0,1,0,2,1,3
7,https://www.sephora.com/product/100-percent-pu...,Josie Maran,100 percent Pure Argan Oil,125396,4.5,7317,item_8,brand_8,5326,1144,...,1,1,1,0,0,0,0,0,0,0
8,https://www.sephora.com/product/ultra-repair-c...,First Aid Beauty,Ultra Repair® Cream Intense Hydration,182275,4.5,6751,item_9,brand_9,5203,826,...,0,0,4,1,2,1,0,2,0,0
9,https://www.sephora.com/product/cicapair-tiger...,Dr. Jart+,Cicapair™ Tiger Grass Color Correcting Treatme...,96715,4.0,2829,item_10,brand_10,1707,480,...,1,0,5,5,1,2,1,0,0,0


In [34]:
# chemicals[chemicals['chemicals'].str.contains('argan')]

In [35]:
# Persist the three tables as CSV files under data/. to_csv is called with its
# defaults, so each frame's index is written as the first column.
# NOTE(review): the chemicals table is written to 'data/chemical_effects.csv',
# which differs from the 'data/chemicals_effect.csv' file it was read from —
# confirm that this is intentional.
for frame, destination in (
    (item_df, 'data/item_df.csv'),
    (user_df, 'data/user_df.csv'),
    (chemicals, 'data/chemical_effects.csv'),
):
    frame.to_csv(destination)