# Data Cleaning

In [10]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble                    import RandomForestClassifier
from sklearn.linear_model                import LogisticRegression
from sklearn.model_selection             import RandomizedSearchCV
from sklearn.model_selection             import cross_validate

from sklearn.model_selection             import train_test_split
from sklearn.preprocessing               import OrdinalEncoder
from sklearn.feature_extraction.text     import TfidfVectorizer
from sklearn.feature_extraction.text     import TfidfTransformer
from sklearn.feature_extraction.text     import CountVectorizer
from sklearn.pipeline                    import Pipeline
from sklearn.compose                     import ColumnTransformer
from sklearn.impute                      import SimpleImputer

from sklearn.metrics                     import f1_score
from sklearn.metrics                     import accuracy_score

In [11]:
PATH = "EWG_product.csv"
product_df = pd.read_csv(PATH)

In [12]:
# droping duplicates 
product_df = product_df.drop_duplicates()

# renaming columns 
product_df.columns = ['ingredient_score', 'data_availability', 'ingredient', 'ingredient_concerns', 'product_name', 'company', 'product_url', 'product_score']

# Train Test Split

In [4]:
y = product_df['product_score'].copy()
X = product_df.drop('product_score', axis=1).copy()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train)

# Clean for Modeling  

### Step 1: Encode y (i.e. change verified to 0)

In [7]:
def clean_y(raw_y): 
    """converts y to numerical target by converting 'verified' to 0."""
    y_clean_list = []
    for i in raw_y:
        if 'verified' in i: 
            y_clean_list.append(0)
        else:
            y_clean_list.append(int(i))
    return y_clean_list

### Step 2: Clean X

In [8]:
# encoding availability  
availability_pipe = Pipeline([('availability_encoder', OrdinalEncoder(categories = [['None', 'Limited', 'Fair', 'Good', 'Robust']], handle_unknown = 'use_encoded_value', unknown_value = -1))])
# encoding company
company_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('company_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

# combining preprocessing steps into one 
preprocessing = ColumnTransformer([('availability_encoder', availability_pipe, ['data_availability']),
                                   ('company_encoder', company_pipe, ['company'])])

pipe = Pipeline([('preprocessing', preprocessing)])

# fitting the ordinal encoders on the training data only 
pipe.fit(X_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('availability_encoder',
                                                  Pipeline(steps=[('availability_encoder',
                                                                   OrdinalEncoder(categories=[['None',
                                                                                               'Limited',
                                                                                               'Fair',
                                                                                               'Good',
                                                                                               'Robust']],
                                                                                  handle_unknown='use_encoded_value',
                                                                                  unknown_value=-1))]),
                                                  ['data_

In [9]:
def change_ingredient_score(X):
    """Takes median score for each ingredient and finds mean ingredient score for each product. 
    Returns dataframe."""
    # find median score per ingredient
    X['median_ingredient_score'] = X.groupby(by='ingredient')['ingredient_score'].transform('median')
    # calculate the mean of product scores for each product
    X['mean_product_score'] = X.groupby(by=['product_name'])['median_ingredient_score'].transform('mean')
    return X

def clean_X(X):
    """Takes in X predictor dataframe, returns cleaned X predictor dataframe"""
    X = X.copy()
    X[['encoded_availability', 'encoded_company']] = pipe.transform(X)
    X = change_ingredient_score(X)
    return X

# preprocessing 
y_train = clean_y(y_train)
X_train = clean_X(X_train)
y_valid = clean_y(y_valid)
X_valid = clean_X(X_valid)
y_test = clean_y(y_test)
X_test = clean_X(X_test)

KeyError: "None of [Index(['encoded_availability', 'encoded_company'], dtype='object')] are in the [columns]"

# Modeling Time 

## First Model: Logistic Regression
#### simplest version with mean product score as only predictor

Performance on Training Data

In [142]:
lf = LogisticRegression().fit(pd.DataFrame(X_train['mean_product_score']), y_train)
train_predictions = lf.predict(pd.DataFrame(X_train['mean_product_score']))
training_f1 = f1_score(y_train, train_predictions, average='weighted')
training_acc = accuracy_score(y_train, train_predictions)

In [136]:
training_f1, training_acc

(0.31512412305879295, 0.35578377517713555)

Performance on Validation Data

In [137]:
valid_predictions = lf.predict(pd.DataFrame(X_valid['mean_product_score']))
valid_f1 = f1_score(y_valid, valid_predictions, average='weighted')
valid_acc = accuracy_score(y_valid, valid_predictions)
valid_f1, valid_acc

(0.23218642837163506, 0.25262655205348616)

### Try fewer y's: Three Categories 
#### might be the better choice for our personal score to be transparent about how our schools are less informed than those from EWG

We can get a much better score if we reduce the number of categories 

In [138]:
def narrow_y_categories(input_y):
    y_train_reduced = []
    for y in input_y:
        if y <= 2:
            y_train_reduced.append(1) 
        elif y <=6:
            y_train_reduced.append(2)
        else: 
            y_train_reduced.append(3)
    return y_train_reduced
y_train = narrow_y_categories(y_train)
y_valid = narrow_y_categories(y_valid)

In [139]:
lf = LogisticRegression().fit(pd.DataFrame(X_train['mean_product_score']), y_train_reduced)
train_predictions = lf.predict(pd.DataFrame(X_train['mean_product_score']))
training_f1 = f1_score(y_train_reduced, train_predictions, average='weighted')
training_acc = accuracy_score(y_train_reduced, train_predictions)

Performance on training data

In [140]:
training_f1, training_acc

(0.5694329495981436, 0.6946103017275694)

performance on test data 

In [141]:
valid_predictions = lf.predict(pd.DataFrame(X_valid['mean_product_score']))
valid_f1 = f1_score(y_valid, valid_predictions, average='weighted')
valid_acc = accuracy_score(y_valid, valid_predictions)
valid_f1, valid_acc

(0.5819118095176152, 0.7041547277936963)