In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model      import LinearRegression
from sklearn.metrics           import *
from sklearn.model_selection   import train_test_split

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [2]:
ewg = pd.read_csv("EWG_product.csv")

In [3]:
ewg.head()

Unnamed: 0,ingredient_score,data_availability,ingredient,ingredient_concerns,product_name,company,product_url,product_score
0,1,Robust,WATER,[],Lightly Foaming Vegan Face Wash for Normal/Sensitive Skin,Just the Goods,https://www.ewg.org/skindeep/products/765576-Just_the_Goods_Lightly_Foaming_Vegan_Face_Wash_for_NormalSensitive_Skin/,verified
1,1,Limited,POTASSIUM OLEATE,"['Multiple, additive exposure sources (low)']",Lightly Foaming Vegan Face Wash for Normal/Sensitive Skin,Just the Goods,https://www.ewg.org/skindeep/products/765576-Just_the_Goods_Lightly_Foaming_Vegan_Face_Wash_for_NormalSensitive_Skin/,verified
2,1,Fair,POTASSIUM COCOATE,"['Irritation (skin, eyes, or lungs) (low)']",Lightly Foaming Vegan Face Wash for Normal/Sensitive Skin,Just the Goods,https://www.ewg.org/skindeep/products/765576-Just_the_Goods_Lightly_Foaming_Vegan_Face_Wash_for_NormalSensitive_Skin/,verified
3,1,Good,GLYCERIN,[],Lightly Foaming Vegan Face Wash for Normal/Sensitive Skin,Just the Goods,https://www.ewg.org/skindeep/products/765576-Just_the_Goods_Lightly_Foaming_Vegan_Face_Wash_for_NormalSensitive_Skin/,verified
4,1,Limited,POTASSIUM CITRATE,[],Lightly Foaming Vegan Face Wash for Normal/Sensitive Skin,Just the Goods,https://www.ewg.org/skindeep/products/765576-Just_the_Goods_Lightly_Foaming_Vegan_Face_Wash_for_NormalSensitive_Skin/,verified


In [4]:
ewg = ewg.loc[ewg.product_score != 'verified'].copy()

In [5]:
ewg['ingredient_score'] = ewg['ingredient_score'].apply(float)
ewg['product_score'] = ewg['product_score'].apply(float)

In [8]:
product_overview = pd.DataFrame(ewg.groupby('product_name')['ingredient_score'].apply(list))

In [15]:
product_overview['product_score'] = ewg.groupby('product_name')['product_score'].apply(np.mean).apply(int)

In [16]:
product_overview['ingredient_count'] = product_overview['ingredient_score'].apply(lambda x: len(x))

In [17]:
product_overview['max_three'] = product_overview.ingredient_score.apply(lambda x: np.sort(x)[-3:])

In [18]:
result = []
for i in range(1, 9):
    result.append(list(product_overview.loc[product_overview.product_score==i].max_three.apply(np.mean)))

In [19]:
for r in result: 
    print(np.percentile(r, 10).round(2), '-', np.percentile(r, 90).round(2))

1.0 - 3.0
2.67 - 4.0
3.33 - 5.33
4.67 - 6.03
5.33 - 7.0
5.67 - 7.67
6.8 - 8.67
7.07 - 7.6


In [20]:
product_overview['max_three_mean'] = product_overview['max_three'].apply(np.mean)

In [21]:
for i in range(1, 10):
    product_overview[f'count_{i}'] = product_overview['ingredient_score'].apply(lambda x: x.count(i))


In [24]:
product_overview.head(2)

Unnamed: 0_level_0,ingredient_score,ingredient_count,max_three,product_score,max_three_mean,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
'That Hit Single' Gel Cream Cleanser,"[8.0, 4.0, 4.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",47,"[4.0, 4.0, 8.0]",4,5.333333,34,8,2,2,0,0,0,1,0
100% Virgin Coconut Oil Daily Hydration Face Milk Cleanser,"[8.0, 3.0, 3.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",30,"[3.0, 3.0, 8.0]",4,4.666667,24,3,2,0,0,0,0,1,0


In [42]:
y = product_overview['product_score']
X = product_overview.drop(['ingredient_score', 'product_score', 'max_three'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train)

In [43]:
lr = LinearRegression().fit(X_train, y_train)

In [44]:
from sklearn.metrics                     import f1_score
from sklearn.metrics                     import accuracy_score

In [45]:
mean_absolute_error(lr.predict(X_test).round(0), y_test)

0.35772357723577236

In [46]:
valid_f1 = f1_score(y_valid, lr.predict(X_valid), average='weighted')
valid_acc = accuracy_score(y_valid, lr.predict(X_valid))
valid_f1, valid_acc

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [272]:
pd.DataFrame([lr.predict(X_test).round(0), y_test]).T

Unnamed: 0,0,1
0,1.0,1.0
1,3.0,3.0
2,3.0,2.0
3,2.0,2.0
4,5.0,5.0
5,4.0,3.0
6,4.0,4.0
7,3.0,3.0
8,2.0,2.0
9,4.0,4.0
