In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from pprint import pprint
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import re

from datetime import datetime, timedelta
import dill
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import mplcyberpunk
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from typing import List, Dict, Callable

# plt.style.use('ggplot')
plt.style.use("cyberpunk")

import os
os.getcwd()

'/home/jovyan/work/Capstone_3/Skincare-Recommender'

In [3]:
# Import Custom Modules
from src.helpers import *
from src.scrape_functions import *

In [4]:
# Load Time Consuming Scraping Session Variables to Translate to DataFrames  
dill.load_session('dill/notebook_env.db')

# 1: Drunk Elephant - Protini Polypeptide Cream
url_1, items_1, users_1 = item_1, item_info_1, user_info_1

# 2: TATCHA - The Water Cream
url_2, items_2, users_2 = item_2, item_info_2, user_info_2

# 3: BELIF - The True Cream Aqua Bomb
url_3, items_3, users_3 = item_3, item_info_3, user_info_3

# 4: BOSCIA - Cactus Water Moisturizer
url_4, items_4, users_4 = item_4, item_info_4, user_info_4

# 5: OLEHENRIKSEN - C-Rush™ Vita
url_5, items_5, users_5 = item_5, item_info_5, user_info_5

# 6: CLINIQUE - Dramatically Different Moisturizing Gel
url_6, items_6, users_6 = item_6, item_info_6, user_info_6

# 7: ORIGINS - Ginzing Energy Boosting Gel Moisturizer
url_7, items_7, users_7 = item_7, item_info_7, user_info_7

# 8: JOSIE MARAN - 100 Percent Pure Argan Oil
url_8, items_8, users_8 = item_8, item_info_8, user_info_8

# 9: FIRST AID BEAUTY - Ultra Repair® Cream Intense Hydration
url_9, items_9, users_9 = item_9, item_info_9, user_info_9

# 10: Dr. Jart - Cicapair™ Tiger Grass Color Correcting Treatment
url_10, items_10, users_10 = item_10, item_info_10, user_info_10

item_urls = [url_1, url_2, url_3, url_4, url_5, url_6, url_7, url_8, url_9, url_10]
item_collection = [items_1, items_2, items_3, items_4, items_5, items_6, items_7, items_8, items_9, items_10]
user_collection = [users_1, users_2, users_3, users_4, users_5, users_6, users_7, users_8, users_9, users_10]

In [5]:
# Fetch Prices!! Forgot this earlier
prices = [
    '$68.00',
    '$68.00',
    '$38.00',
    '$38.00',
    '$46.00',
    '$28.00',
    '$30.00',
    '$49.00',
    '$42.00',
    '$52.00'
]

## Create Item DataFrame

In [6]:
# Item Structure: item_url, item_brand, item_name, n_loves, avg_stars, ingredients, skin_target, n_reviews, five_stars, four_stars, three_stars, two_stars, one_stars, review_corpus

In [7]:
# Create item dataframe
item_df = pd.DataFrame({
    'item_url':item_urls,
    'item_brand':[x[0] for x in item_collection],
    'item_name':[x[1] for x in item_collection],
    'n_loves':[x[2] for x in item_collection],
    'avg_stars':[x[3] for x in item_collection], 
    'ingredient':[x[4] for x in item_collection], 
    'skin_target':[x[5] for x in item_collection], 
    'n_reviews':[x[6] for x in item_collection], 
    'star_dist':[x[7] for x in item_collection],
    'review_corp':[x[8] for x in item_collection]
})

# brand and name remaps
item_names = item_df['item_name'].unique().tolist()
name_map = dict(zip(item_names, [f'item_{i}' for i in range(1, len(item_names) + 1)]))
item_brands = item_df['item_brand'].unique().tolist()
brand_map = dict(zip(item_brands, [f'brand_{i}' for i in range(1, len(item_brands) + 1)]))

In [8]:
# Remap items to numbers
item_df['item_name'] = item_df['item_name'].map(name_map)

# Remap brands to numbers
item_df['item_brand'] = item_df['item_brand'].map(brand_map)

In [9]:
# Add Granulated star distribution
item_df['five_stars'] = item_df['star_dist'].apply(lambda x: int(x[0].partition(':')[2]))
item_df['four_stars'] = item_df['star_dist'].apply(lambda x: int(x[1].partition(':')[2]))
item_df['three_stars'] = item_df['star_dist'].apply(lambda x: int(x[2].partition(':')[2]))
item_df['two_stars'] = item_df['star_dist'].apply(lambda x: int(x[3].partition(':')[2]))
item_df['one_stars'] = item_df['star_dist'].apply(lambda x: int(x[4].partition(':')[2]))

item_df.drop(columns=['star_dist'], inplace=True)

In [10]:
# Cast n_loves, n_reviews, avg_stars as int
item_df['n_loves'] = item_df['n_loves'].apply(lambda x: re_max(x, int))
item_df['n_reviews'] = item_df['n_reviews'].apply(lambda x: re_max(x, int))
item_df['avg_stars'] = item_df['avg_stars'].apply(lambda x: float(x.split()[0]))

In [11]:
# Append prices 
item_df['price'] = [float(x.partition('$')[-1]) for x in prices]

In [12]:
# Parse and Clean skin targets 
item_df['skin_target'] = item_df['skin_target'].apply(lambda x: [re_parse_str(x).lower() for x in x.split(' ') if re_parse_str(x)])

In [13]:
# Clean reviews corpus
item_df['review_corpus'] = item_df['review_corp'].apply(lambda x: [i.split('\n') for i in x])
item_df['review_corpus'] = item_df['review_corpus'].apply(lambda x: [parse_review_string(i) for i in x])

item_df.drop(columns=['review_corp'], inplace=True)

In [14]:
# Parse ingredients
item_df['ingredients'] = item_df['ingredient'].apply(lambda x: [x.strip().lower() for x in x.split(',')])
item_df.drop(columns=['ingredient'], inplace=True)

item_df['ingredients'][0]

['water/aqua/eau',
 'dicaprylyl carbonate',
 'glycerin',
 'cetearyl alcohol',
 'cetearyl olivate',
 'sorbitan olivate',
 'sclerocarya birrea seed oil',
 'bacillus/soybean/ folic acid ferment extract',
 'nymphaea alba root extract',
 'sh-oligopeptide-1',
 'sh-oligopeptide-2',
 'sh-polypeptide-1',
 'sh-polypeptide-9',
 'sh-polypeptide-11',
 'copper palmitoyl heptapeptide-14',
 'heptapeptide-15 palmitate',
 'palmitoyl tetrapeptide-7',
 'palmitoyl tripeptide-1',
 'alanine',
 'arginine',
 'glycine',
 'histidine',
 'isoleucine',
 'phenylalanine',
 'proline',
 'serine',
 'threonine',
 'valine',
 'acetyl glutamine',
 'coconut alkanes',
 'coco-caprylate/caprate',
 'sodium hyaluronate',
 'aspartic acid',
 'linoleic acid',
 'linolenic acid',
 'lecithin',
 'butylene glycol',
 'polyvinyl alcohol',
 'sodium lactate',
 'sodium pca',
 'pca',
 'sorbitan isostearate',
 'carbomer',
 'polysorbate 20',
 'polysorbate 60',
 'lactic acid/glycolic acid copolymer',
 'hydroxyethyl acrylate/sodium acryloyldimethy

In [15]:
# item_df.head()

## Create User DataFrame

In [16]:
# User Structure: item_url, item_brand, item_name ,user_id, user_meta, rating, review
user_collection[0][0] # id, meta, rating, reivew, brand, item_name

['harleen253',
 'Eye Color Brown\nHair color Black\nSkin Tone Medium\nSkin Type Oily',
 '5 stars',
 '3 h ago\nThis moisturizer has worked really well for my skin! I have combo skin and it doesn’t make me greasy. It keeps my skin plump and moisturized. I’ve gone through three of these now and how no intention of using any other moisturizer!\nRecommends this product',
 'Drunk Elephant',
 'Protini™ Polypeptide Moisturizer']

In [17]:
# Create user dataframe
user_df = pd.DataFrame({
    'user_id':[x[0] for product in user_collection for x in product],
    'user_meta':[x[1] for product in user_collection for x in product], 
    'star_rating':[x[2] for product in user_collection for x in product], 
    'review':[x[3] for product in user_collection for x in product],
    'item_brand':[x[4] for product in user_collection for x in product],
    'item_name':[x[5] for product in user_collection for x in product],
})

In [18]:
# Granulate and Clean User Data

# Cast star rating to int
user_df['star_rating'] = user_df['star_rating'].apply(lambda x: re_max(x, int) if type(x) != int else x)

# Remap items to numbers
user_df['item_name'] = user_df['item_name'].map(name_map)

# Remap brands to numbers
user_df['item_brand'] = user_df['item_brand'].map(brand_map)


In [19]:
# Create dict from user meta info
user_df['user_meta'] = user_df['user_meta'].apply(parse_user_meta)

In [20]:
# Granulate User Meta
user_df['age'] = user_df['user_meta'].apply(lambda x: x.get('age', None))
user_df['eye_color'] = user_df['user_meta'].apply(lambda x: x.get('eye color', None))
user_df['hair_color'] = user_df['user_meta'].apply(lambda x: x.get('hair color', None))
user_df['skin_tone'] = user_df['user_meta'].apply(lambda x: x.get('skin tone', None))
user_df['skin_type'] = user_df['user_meta'].apply(lambda x: x.get('skin type', None))

user_df.drop(columns=['user_meta'], inplace=True)

In [21]:
# Create timestamp element 
user_df['timestamp'] = user_df['review'].apply(lambda x: x.split('\n')[0])
# Convert and standardize all timestamps
user_df['timestamp'] = user_df['timestamp'].apply(create_timestamp)

In [22]:
# Create 'verified' feature
user_df['verified'] = user_df['review'].apply(lambda x: 1 if 'verified' in x.split('\n')[0].lower() else 0)

In [23]:
# Clean reviews corpus
reviews = user_df['review'].apply(lambda x: parse_review_string(x.split('\n')))
user_df['review'] = reviews

In [24]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      20000 non-null  object        
 1   star_rating  20000 non-null  int64         
 2   review       20000 non-null  object        
 3   item_brand   20000 non-null  object        
 4   item_name    20000 non-null  object        
 5   age          14432 non-null  object        
 6   eye_color    14432 non-null  object        
 7   hair_color   14432 non-null  object        
 8   skin_tone    14432 non-null  object        
 9   skin_type    14432 non-null  object        
 10  timestamp    20000 non-null  datetime64[ns]
 11  verified     20000 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 1.8+ MB


In [25]:
user_df.head()

Unnamed: 0,user_id,star_rating,review,item_brand,item_name,age,eye_color,hair_color,skin_tone,skin_type,timestamp,verified
0,harleen253,5,This moisturizer has worked really well for my...,brand_1,item_1,,brown,black,medium,oily,2020-06-04 21:00:00,0
1,supremekaylap15,2,"Soooo at first when I started using this, it b...",brand_1,item_1,,hazel,brunette,medium,dry,2020-06-04 10:00:00,0
2,Alybug97,4,This is my go-to moisturizer. I have dry skin ...,brand_1,item_1,,brown,black,medium,combination,2020-06-04 09:00:00,0
3,Mikachew,2,I got this as part of the birthday gift and us...,brand_1,item_1,,,,,,2020-06-04 09:00:00,0
4,jkbean,2,Alright. I bought this because I loved the tri...,brand_1,item_1,,blue,blonde,fair,normal,2020-06-04 00:00:00,0


In [26]:
item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   item_url       10 non-null     object 
 1   item_brand     10 non-null     object 
 2   item_name      10 non-null     object 
 3   n_loves        10 non-null     int64  
 4   avg_stars      10 non-null     float64
 5   skin_target    10 non-null     object 
 6   n_reviews      10 non-null     int64  
 7   five_stars     10 non-null     int64  
 8   four_stars     10 non-null     int64  
 9   three_stars    10 non-null     int64  
 10  two_stars      10 non-null     int64  
 11  one_stars      10 non-null     int64  
 12  price          10 non-null     float64
 13  review_corpus  10 non-null     object 
 14  ingredients    10 non-null     object 
dtypes: float64(2), int64(7), object(6)
memory usage: 1.3+ KB


In [27]:
item_df.head()

Unnamed: 0,item_url,item_brand,item_name,n_loves,avg_stars,skin_target,n_reviews,five_stars,four_stars,three_stars,two_stars,one_stars,price,review_corpus,ingredients
0,https://www.sephora.com/product/protini-tm-pol...,brand_1,item_1,259473,4.0,"[normal, dry, combination, and, oily]",4395,2557,540,402,405,491,68.0,"[Soooo at first when I started using this, it ...","[water/aqua/eau, dicaprylyl carbonate, glyceri..."
1,https://www.sephora.com/product/the-water-crea...,brand_2,item_2,233187,4.0,"[normal, oily, combination, sensitive]",3030,1817,392,307,232,282,68.0,[The gold spatula changed my life - I can't be...,"[water, saccharomyces/camellia sinensis leaf/c..."
2,https://www.sephora.com/product/the-true-cream...,brand_3,item_3,191092,4.5,"[normal, dry, combination, and, oily]",4527,3276,665,246,173,167,38.0,[Completely wrong for combo/oily skin. This ca...,"[water, dipropylene glycol, glycerin, methl tr..."
3,https://www.sephora.com/product/cactus-water-m...,brand_4,item_4,26641,4.0,"[normal, combination, and, oily]",2221,1263,538,178,108,134,38.0,[The BEST moisturizer if you’re trying to impr...,"[water, butylene glycol, glycerin, cyclopentas..."
4,https://www.sephora.com/product/dramatically-d...,brand_5,item_5,80124,4.5,"[normal, oily, combination, dry, sensitive]",2619,1721,565,151,85,97,46.0,[It has a great orangey smell and it did hydra...,[sources of vitamin c:]


In [28]:
user_df['review'][0]

'This moisturizer has worked really well for my skin! I have combo skin and it doesn’t make me greasy. It keeps my skin plump and moisturized. I’ve gone through three of these now and how no intention of using any other moisturizer!'

In [29]:
# Write to csv file
item_df.to_csv('data/item_df.csv')
user_df.to_csv('data/user_df.csv')