# Basic EDA and Cleaning

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('./dataset/raw_data.csv', index_col=[0])

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8649 entries, 0 to 8702
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Username      8649 non-null   object 
 1   Skin_Tone     8649 non-null   object 
 2   Skin_Type     8649 non-null   object 
 3   Eye_Color     8649 non-null   object 
 4   Hair_Color    8649 non-null   object 
 5   Rating_Stars  8649 non-null   int64  
 6   Review        8649 non-null   object 
 7   Product       8649 non-null   object 
 8   Brand         8649 non-null   object 
 9   Price         8649 non-null   int64  
 10  Rating        8649 non-null   float64
 11  Ingredients   8649 non-null   object 
 12  Category      8649 non-null   object 
 13  Product_Url   8649 non-null   object 
 14  User_id       8649 non-null   int64  
 15  Product_id    8649 non-null   int64  
dtypes: float64(1), int64(4), object(11)
memory usage: 1.1+ MB


In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

### Clean the review and ingredient list and tokenize, lemmatize

In [7]:
STOPWORDS = set(stopwords.words('english'))
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile(r"'s")  # matches `'s` from text
PATTERN_NOT = re.compile("n't")  # matches `n't` from text
PATTERN_NT = re.compile("nt")  # matches `n't` from text
PATTERN_WILL = re.compile("'ll")  # matches `'ll` from text
PATTERN_ARE = re.compile("re")  # matches `re` from text
PATTERN_RN = re.compile(r"\r\\n") #matches `\r` and `\n`
PATTERN_URL = re.compile(r"http\S+") # remove url
PATTERN_HASH = re.compile(r"#") # remove hashtags
PATTERN_RT = re.compile("rt") # remove hashtags
PATTERN_AT = re.compile(r"@([a-zA-Z0-9_]{1,50})") # remove account name from retweeted tweet
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches all non 0-9 A-z whitespace
PATTERN_AMP = re.compile(r"amp") # matches all non 0-9 A-z whitespace 


def clean_text(text):
    """
    Series of cleaning. String to lower case, remove non words characters and numbers (punctuation, curly brackets etc).
        text (str): input text
    return (str): modified initial text
    """
    text = text.lower()  # lowercase text
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    
    # replace the matched string with ' '
    #text = re.sub(PATTERN_S, ' ', text)
    text = re.sub(PATTERN_NOT, ' not', text)
    text = re.sub(PATTERN_NT, ' not', text)
    #text = re.sub(PATTERN_WILL, 'will', text)
    text = re.sub(PATTERN_ARE, ' are', text)
    #text = re.sub(PATTERN_RN, ' ', text)
    #text = re.sub(PATTERN_URL, ' ', text)
    #text = re.sub(PATTERN_HASH, ' ', text)
    #text = re.sub(PATTERN_RT, ' ', text)
    #text = re.sub(PATTERN_AT, ' ', text)
    #text = re.sub(PATTERN_PUNC, ' ', text)
    text = re.sub(PATTERN_AMP, ' ', text)
    
    return text

def tok_lem(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Lemmatize, tokenize, crop and remove stop words.
    Args:
      sentence (str)
      min_words (int)
      max_words (int)
      stopwords (set of string)
      lemmatize (boolean)
    returns:
      list of string
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    
    return tokens    

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS):
    """
    Lemmatize, tokenize.
    Args:
      sentence (str)
      min_words (int)
      max_words (int)
    returns:
      list of string
    """
    stemmer = WordNetLemmatizer()
    tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    
    return tokens  

def clean_sentences(df):
    """
    Remove irrelavant characters (in new column clean_sentence).
    Lemmatize, tokenize words into list of words (in new column tok_lem_sentence).
    Args: 
      df (dataframe)
     returns:
      df
    """
    print('Cleaning ingredients...')
    df['Ingredients_Cleaned'] = df['Ingredients'].apply(clean_text)
    df['token_ingredients'] = df['Ingredients_Cleaned'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS))
    df['tok_lem_ingredients'] = df['Ingredients_Cleaned'].apply(
        lambda x: tok_lem(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS))
    
    print('Cleaning Done !')
    print('----------\n')
    
    print('Cleaning ingredients...')
    df['Review_Cleaned'] = df['Review'].apply(clean_text)
    df['token_Review'] = df['Review_Cleaned'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS))
    df['tok_lem_Review'] = df['Review_Cleaned'].apply(
        lambda x: tok_lem(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS))
    
    print('Cleaning Done !')
    print('----------\n')
    print(df.info())
    return df
    
df = clean_sentences(df)

Cleaning ingredients...
Cleaning Done !
----------

Cleaning ingredients...
Cleaning Done !
----------

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8649 entries, 0 to 8702
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Username             8649 non-null   object 
 1   Skin_Tone            8649 non-null   object 
 2   Skin_Type            8649 non-null   object 
 3   Eye_Color            8649 non-null   object 
 4   Hair_Color           8649 non-null   object 
 5   Rating_Stars         8649 non-null   int64  
 6   Review               8649 non-null   object 
 7   Product              8649 non-null   object 
 8   Brand                8649 non-null   object 
 9   Price                8649 non-null   int64  
 10  Rating               8649 non-null   float64
 11  Ingredients          8649 non-null   object 
 12  Category             8649 non-null   object 
 13  Product_Url          8649 non-null

In [8]:
df.head()

Unnamed: 0,Username,Skin_Tone,Skin_Type,Eye_Color,Hair_Color,Rating_Stars,Review,Product,Brand,Price,...,Category,Product_Url,User_id,Product_id,Ingredients_Cleaned,token_ingredients,tok_lem_ingredients,Review_Cleaned,token_Review,tok_lem_Review
0,allyp3,Medium,Combination,Brown,Brunette,5,This is hands down the best cleanser I’ve ever...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,Cleanser,https://www.sephora.com/product/kale-spinach-g...,3420,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",this is hands down the best cleanser ive ever ...,"[this, is, hand, down, the, best, cleanser, iv...","[this, is, hand, down, the, best, cleanser, iv..."
1,PatTea,Medium,Combination,Brown,Red,1,Unfortunately this doesn’t work for everyone. ...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,Cleanser,https://www.sephora.com/product/kale-spinach-g...,2483,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",unfortunately this does not work for everyone ...,"[unfortunately, this, doe, not, work, for, eve...","[unfortunately, this, doe, not, work, for, eve..."
2,Sabi1991,No data,No data,No data,No data,5,My favorite cleanser!! i love the packaging on...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,Cleanser,https://www.sephora.com/product/kale-spinach-g...,2715,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",my favorite cleanser i love the packaging on t...,"[my, favorite, cleanser, i, love, the, packagi...","[my, favorite, cleanser, i, love, the, packagi..."
3,happyface2,Fair,Dry,Blue,Blonde,5,I love all things Youth To The People! This cl...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,Cleanser,https://www.sephora.com/product/kale-spinach-g...,4497,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",i love all things youth to the people this cle...,"[i, love, all, thing, youth, to, the, people, ...","[i, love, all, thing, youth, to, the, people, ..."
4,kimkix34,Fair,Normal,Green,Blonde,5,I had a trial size of this and was obsessed. M...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,Cleanser,https://www.sephora.com/product/kale-spinach-g...,5017,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",i had a trial size of this and was obsessed my...,"[i, had, a, trial, size, of, this, and, wa, ob...","[i, had, a, trial, size, of, this, and, wa, ob..."


### Reclassify the Rating Stars column to a binary classification

In [12]:
df['Good_Stuff'] = df['Rating_Stars'].map({1: 0, 2: 0, 3: 0, 4: 0, 5: 1})

Only ratings with more than 4 stars is classified as good stuff because we want to get the best. 

In [15]:
df.head()

Unnamed: 0,Username,Skin_Tone,Skin_Type,Eye_Color,Hair_Color,Rating_Stars,Review,Product,Brand,Price,...,Product_Url,User_id,Product_id,Ingredients_Cleaned,token_ingredients,tok_lem_ingredients,Review_Cleaned,token_Review,tok_lem_Review,Good_Stuff
0,allyp3,Medium,Combination,Brown,Brunette,5,This is hands down the best cleanser I’ve ever...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,https://www.sephora.com/product/kale-spinach-g...,3420,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",this is hands down the best cleanser ive ever ...,"[this, is, hand, down, the, best, cleanser, iv...","[this, is, hand, down, the, best, cleanser, iv...",1
1,PatTea,Medium,Combination,Brown,Red,1,Unfortunately this doesn’t work for everyone. ...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,https://www.sephora.com/product/kale-spinach-g...,2483,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",unfortunately this does not work for everyone ...,"[unfortunately, this, doe, not, work, for, eve...","[unfortunately, this, doe, not, work, for, eve...",0
2,Sabi1991,No data,No data,No data,No data,5,My favorite cleanser!! i love the packaging on...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,https://www.sephora.com/product/kale-spinach-g...,2715,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",my favorite cleanser i love the packaging on t...,"[my, favorite, cleanser, i, love, the, packagi...","[my, favorite, cleanser, i, love, the, packagi...",1
3,happyface2,Fair,Dry,Blue,Blonde,5,I love all things Youth To The People! This cl...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,https://www.sephora.com/product/kale-spinach-g...,4497,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",i love all things youth to the people this cle...,"[i, love, all, thing, youth, to, the, people, ...","[i, love, all, thing, youth, to, the, people, ...",1
4,kimkix34,Fair,Normal,Green,Blonde,5,I had a trial size of this and was obsessed. M...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,36,...,https://www.sephora.com/product/kale-spinach-g...,5017,157,water sodium cocoyl glutamate cocamidopropyl b...,"[water, sodium, cocoyl, glutamate, cocamidopro...","[water, sodium, cocoyl, glutamate, cocamidopro...",i had a trial size of this and was obsessed my...,"[i, had, a, trial, size, of, this, and, wa, ob...","[i, had, a, trial, size, of, this, and, wa, ob...",1


In [16]:
def basic_eda(df, df_name):
    print(df_name.upper())
    print()
    print(f"Rows: {df.shape[0]} \t Columns: {df.shape[1]}")
    print()
    
    print(f"Total null rows: {df.isnull().sum().sum()}")
    print(f"Percentage null rows: {round(df.isnull().sum().sum() / df.shape[0] * 100, 2)}%")
    print()
    
    print(f"Total duplicate rows: {df[df.duplicated(keep=False)].shape[0]}")
    print(f"Percentage dupe rows: {round(df[df.duplicated(keep=False)].shape[0] / df.shape[0] * 100, 2)}%")
    print()
    
    print(df.dtypes)
    print("-----\n")

In [18]:
basic_eda(df.drop(columns=['token_ingredients', 'tok_lem_ingredients', 'token_Review', 'tok_lem_Review']), 'df')

DF

Rows: 8649 	 Columns: 19

Total null rows: 0
Percentage null rows: 0.0%

Total duplicate rows: 2277
Percentage dupe rows: 26.33%

Username                object
Skin_Tone               object
Skin_Type               object
Eye_Color               object
Hair_Color              object
Rating_Stars             int64
Review                  object
Product                 object
Brand                   object
Price                    int64
Rating                 float64
Ingredients             object
Category                object
Product_Url             object
User_id                  int64
Product_id               int64
Ingredients_Cleaned     object
Review_Cleaned          object
Good_Stuff               int64
dtype: object
-----



In [19]:
df.to_csv('./dataset/clean_df.csv')