# Modeling (Take 4):

In [26]:
import pandas as pd
import numpy as np

from sklearn.linear_model      import LinearRegression
from sklearn.metrics           import *
from sklearn.model_selection   import train_test_split
from sklearn.metrics           import r2_score
from sklearn.metrics           import accuracy_score
from sklearn.linear_model      import Ridge

import pickle
pd.set_option('display.max_colwidth', None)

from pathlib import Path
import sys
sys.path.append(str(Path.cwd().parents[0]))
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, String, Integer
from config import Config

from fuzzy_match import match
from fuzzy_match import algorithims

### Accessing Amazon db 
for sample data to test on 

In [27]:
### From config.py
import os
# basedir = os.path.abspath(os.path.dirname(__file__))

class Config(object):
    SECRET_KEY = os.urandom(24) # For WTF forms
    SQLALCHEMY_DATABASE_URI = 'postgresql://consha_admin:consha8dev@msds603.cm9lzsru7xeh.us-west-2.rds.amazonaws.com/conshadb'
    SQLALCHEMY_TRACK_MODIFICATIONS = True # flash-login uses session which require a secret

conn = create_engine(Config.SQLALCHEMY_DATABASE_URI)

engine = db.create_engine(Config.SQLALCHEMY_DATABASE_URI)
connection = engine.connect()
metadata = db.MetaData()
Table = db.Table('amazon_product_500', metadata, schema='cached_data', autoload=True, autoload_with=engine) # why doesn't this work?

Sample Amazon Ingredient Input 

In [28]:
amz_df = pd.read_sql_table('amazon_product_500', con=connection, schema="cached_data", index_col=None)
sample_amz_ingredient_list = amz_df.ingredient[0].split(',')

### Next Step: converting those ingredients to scores by "looking up" each ingredient in the ewg database

In [44]:
import pandas as pd
import numpy as np

from sklearn.linear_model      import LinearRegression
from sklearn.metrics           import *
from sklearn.model_selection   import train_test_split
from sklearn.metrics           import r2_score
from sklearn.metrics           import accuracy_score
from sklearn.linear_model      import Ridge

import pickle
pd.set_option('display.max_colwidth', None)

from pathlib import Path
import sys
sys.path.append(str(Path.cwd().parents[0]))
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine, String, Integer
from config import Config

from fuzzy_match import match
from fuzzy_match import algorithims

from collections import Counter

import os
# basedir = os.path.abspath(os.path.dirname(__file__))

class Config(object):
    SECRET_KEY = os.urandom(24) # For WTF forms
    SQLALCHEMY_DATABASE_URI = 'postgresql://consha_admin:consha8dev@msds603.cm9lzsru7xeh.us-west-2.rds.amazonaws.com/conshadb'
    SQLALCHEMY_TRACK_MODIFICATIONS = True # flash-login uses session which require a secret

conn = create_engine(Config.SQLALCHEMY_DATABASE_URI)
engine = db.create_engine(Config.SQLALCHEMY_DATABASE_URI)

filename = 'finalized_LR_model.sav'
lr = pickle.load(open(filename, 'rb'))

def formatted_ewg_ingredient_score():
    """calls the ewg ingredient database and creates array of
    ingredient, score lists.The scores are calculated as the mean
    score for that ingredient,because some ingredients have multiple scores."""
    metadata = db.MetaData(schema='crawled_data')
    ewg = db.Table('ewg_product', metadata, autoload=True, autoload_with=engine)
    query = db.select([ewg])
    result = conn.execute(query).fetchall()

    ewg_df = pd.DataFrame(result, columns = ewg.columns.keys())

    # taking mean of score per ingredient as ingredient score
    ingredient_scores = ewg_df.groupby('ingredient')['ingredient_score'].apply(np.mean)
    ingredient_scores = np.array(pd.DataFrame(ingredient_scores).reset_index())
    return ingredient_scores

#### Fuzzy Matching to match ingredients from amazon with scores from ewg ####
def string_matching(ewg_string, amazon_ingredient):
    """Matches ewg ingredient strings with amazon ingredient strings."""
    match_score = algorithims.cosine(ewg_string.strip().lower(), amazon_ingredient.strip().lower())
    if match_score > .70:
        return True
    return False

def ingredient_string_matching(raw_amz_ingredient_list, ewg_ingredient_scores):
    """Input: raw list of ingredients from amazon - list[str] and
       numpy array of ewg ingredients and score pairs
       Output: list of scores from ewg database - float"""

    ingredient_score_list = []
    for ingredient in raw_amz_ingredient_list:
        for ewg_ingredient, ewg_score in ewg_ingredient_scores:
            if string_matching(ingredient, ewg_ingredient) == True:
                ingredient_score_list.append((ewg_score))
                break
    return ingredient_score_list

# second feature: max_three_mean
def max_three_scores(ingredient_scores):
    """Creating feature: mean of top three ingredients"""
    ingredient_scores.sort(reverse=True)
    return np.mean(ingredient_scores[:3])

# third feature(s): a count of each value - going to take int of each
def product_score_count(ingredient_scores):
    """Creating score count features"""
    integer_ingredient_scores = [int(i) for i in ingredient_scores]
    count_dictionary = Counter(integer_ingredient_scores)

    # filling in the gaps with 0s
    for i in range(1,10):
        if not count_dictionary.get(i):
            count_dictionary[i] = 0
    return count_dictionary

def combine_features(sample_amz_ingredient_list):
    ewg_ingredient_scores = formatted_ewg_ingredient_score()
    ingredient_scores = ingredient_string_matching(sample_amz_ingredient_list, ewg_ingredient_scores)
    # creating features
    max_three = max_three_scores(ingredient_scores)
    ingredient_count = len(sample_amz_ingredient_list)
    count_dictionary = product_score_count(ingredient_scores)
    # combining and formatting features for model
    x_df = pd.DataFrame(count_dictionary, index=[0])
    x_df['max_three'] = max_three
    x_df['ingredient_count'] = ingredient_count
    x_df.columns = ['ingredient_count', 'max_three_mean', 'count_1', 'count_2', 'count_3',
   'count_4', 'count_5', 'count_6', 'count_7', 'count_8', 'count_9']
    return x_df.values.reshape(1,-1)
    
def model(sample_amz_ingredient_list):
    return lr.predict(combine_features(sample_amz_ingredient_list))

sample_amz_ingredient_list = ['water','oil']
model(sample_amz_ingredient_list)

[[1.  1.  0.  0.  0.  0.  0.  0.  0.  1.5 2. ]]


array([2.55747064])

In [31]:
filename = 'finalized_LR_model.sav'
lr = pickle.load(open(filename, 'rb'))

In [34]:
lr.coef_

array([ 0.41694587,  0.54287262, -0.43370836, -0.30515408, -0.30457866,
       -0.29975584, -0.39117635, -0.45509677, -0.38973824,  0.44201264,
        0.13735211])

# Training

In [35]:
def clean_y(row): 
    """converts y to numerical target by converting 'verified' to 0."""
    if 'verified' in row: 
        return 0
    else:
        return row

In [36]:
# The training data 
ewg = pd.read_csv("/Users/michellejanneycoyle/Desktop/Crawling/Data/EWG_product.csv")
moisturizer_data = pd.read_csv("/Users/michellejanneycoyle/Desktop/Crawling/moisturizer_data.csv")
main_df = pd.concat([ewg, moisturizer_data])

main_df['ingredient_score'] = main_df['ingredient_score'].apply(float)
main_df['product_score'] = main_df.apply(lambda row: clean_y(row['product_score']), axis=1)
main_df['product_score'] = main_df['product_score'].apply(float)

product_overview = pd.DataFrame(main_df.groupby('product_name')['ingredient_score'].apply(list))
product_overview['product_score'] = main_df.groupby('product_name')['product_score'].apply(np.mean).apply(int)
product_overview['ingredient_count'] = product_overview['ingredient_score'].apply(lambda x: len(x))
product_overview['max_three'] = product_overview.ingredient_score.apply(lambda x: np.sort(x)[-3:])
product_overview['max_three_mean'] = product_overview['max_three'].apply(np.mean)
for i in range(1, 10):
    product_overview[f'count_{i}'] = product_overview['ingredient_score'].apply(lambda x: x.count(i))
y = product_overview['product_score']

X = product_overview.drop(['ingredient_score', 'product_score', 'max_three'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train)

In [37]:
lr = LinearRegression().fit(X_train, y_train)
MAE = mean_absolute_error(lr.predict(X_test), y_test)
R_squared = r2_score(lr.predict(X_test), y_test)
print(MAE, R_squared)

0.6394140185633118 0.7379192615184421


### Pickling the Model

In [41]:
filename = 'finalized_LR_model.sav'
pickle.dump(lr, open(filename, 'wb'))

In [39]:
filename = 'finalized_LR_model.sav'
lr = pickle.load(open(filename, 'rb'))