In [21]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble                    import RandomForestClassifier
from sklearn.linear_model                import LogisticRegression
from sklearn.model_selection             import RandomizedSearchCV
from sklearn.model_selection             import cross_validate

from sklearn.model_selection             import train_test_split
from sklearn.preprocessing               import OrdinalEncoder
from sklearn.feature_extraction.text     import TfidfVectorizer
from sklearn.feature_extraction.text     import TfidfTransformer
from sklearn.feature_extraction.text     import CountVectorizer
from sklearn.pipeline                    import Pipeline
from sklearn.compose                     import ColumnTransformer
from sklearn.impute                      import SimpleImputer

from sklearn.metrics                     import f1_score
from sklearn.metrics                     import accuracy_score
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Data Cleaning

In [22]:
# load data
ewg = pd.read_csv("EWG_product.csv")
# droping duplicates 
ewg = ewg.drop_duplicates()

In [23]:
ewg['ingredient_score'].unique()

array([1, 8, 6, 4, 3, 2, 5, 9, 7])

In [24]:
ewg['product_score'].unique()

array(['verified', '4', '2', '1', '3', '5', '6', '7', '8'], dtype=object)

In [25]:
# convert product_score to numerical target by converting 'verified' to 0.
ewg['product_score'] = ewg.apply(lambda x: 0 if x['product_score']=='verified' else int(x['product_score']), axis=1)
ewg['product_score'].unique()

array([0, 4, 2, 1, 3, 5, 6, 7, 8])

In [26]:
def feature_engineering(row_df):
    """ Input: row_df - ingredient level dataframe
        Output: df - product level dataframe
    """
    df = pd.DataFrame(row_df.groupby('product_name')['ingredient_score'].apply(list))
    df['product_score'] = row_df.groupby('product_name')['product_score'].apply(np.mean).apply(int)
    df['max_ingredient_score'] = df.ingredient_score.apply(lambda x: int(np.sort(x)[-1:]))
    df['ingredient_count'] = df['ingredient_score'].apply(lambda x: len(x))
    df['max_three'] = df.ingredient_score.apply(lambda x: np.sort(x)[-3:])
    df['max_three_mean'] = df['max_three'].apply(np.mean)
    
    # count the frequency of each ingredient_score value
    for i in range(1, 10):
        df[f'count_{i}'] = df['ingredient_score'].apply(lambda x: x.count(i))

    return df

In [27]:
product = feature_engineering(ewg)
product.head(2)

Unnamed: 0_level_0,ingredient_score,product_score,max_ingredient_score,ingredient_count,max_three,max_three_mean,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
'That Hit Single' Gel Cream Cleanser,"[8, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",4,8,47,"[4, 4, 8]",5.333333,34,8,2,2,0,0,0,1,0
100% Virgin Coconut Oil Daily Hydration Face Milk Cleanser,"[8, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",4,8,30,"[3, 3, 8]",4.666667,24,3,2,0,0,0,0,1,0


In [28]:
# look into the correlation between the product score and the mean of max_three
result = []
for i in range(0, 9):
    result.append(list(product.loc[product.product_score==i].max_three.apply(np.mean)))
    
for r in result: 
    print(np.percentile(r, 10).round(2), '-', np.percentile(r, 90).round(2))

1.0 - 3.67
1.0 - 3.0
2.67 - 4.0
3.33 - 5.33
4.67 - 6.07
5.33 - 7.0
5.67 - 7.67
6.8 - 8.67
7.07 - 7.6


# Train Test Split

In [29]:
features = ['ingredient_count', 'max_three_mean', 'count_1', 
            'count_2', 'count_3', 'count_4', 'count_5', 'count_6', 
            'count_7', 'count_8', 'count_9']
product.head(1)

Unnamed: 0_level_0,ingredient_score,product_score,max_ingredient_score,ingredient_count,max_three,max_three_mean,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
'That Hit Single' Gel Cream Cleanser,"[8, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",4,8,47,"[4, 4, 8]",5.333333,34,8,2,2,0,0,0,1,0


In [30]:
y = product['product_score']
X = product[features]

In [31]:
def narrow_y_categories(input_y):
    y_train_reduced = []
    for y in input_y:
        if y <= 2:
            y_train_reduced.append(1) 
        elif y <=6:
            y_train_reduced.append(2)
        else: 
            y_train_reduced.append(3)
    return y_train_reduced

y_reduced = narrow_y_categories(y)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y_reduced)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train)

# Model

In [43]:
lf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
train_f1 = f1_score(y_train, lf.predict(X_train), average='weighted')
train_acc = accuracy_score(y_train, lf.predict(X_train))
train_f1, train_acc

(0.9026162759467443, 0.9033333333333333)

In [44]:
valid_f1 = f1_score(y_valid, lf.predict(X_valid), average='weighted')
valid_acc = accuracy_score(y_valid, lf.predict(X_valid))
valid_f1, valid_acc

(0.905916832753086, 0.9054726368159204)

In [45]:
test_f1 = f1_score(y_test, lf.predict(X_test), average='weighted')
test_acc = accuracy_score(y_test, lf.predict(X_test))
test_f1, test_acc

(0.8525661256259637, 0.850187265917603)