# This notebook transforms the nutrients product data into a matrix for clustering

### Connecting to the postgresql server

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sal

from sqlalchemy import text

In [None]:
nutrient_matrix_csv_p = r"../../data/Nutrients_Branded_Foods_2018/nutrients_matrix.csv.gz"
nutrients_csv = r"../../data/Nutrients.csv"

#### Connect to the database

In [2]:
nourish_user = ""

nourish_pswd = ""

engine = sal.create_engine('postgresql+psycopg2://' + nourish_user + ':' + nourish_pswd + '@awesome-hw.sdsc.edu/nourish')
conn = engine.connect()

#### Pull the list of nutrients by product
Deviating from profs recommendation and delimiting by '|' since some column names have ':' in it

In [None]:
query_nutrients = text('''SELECT "NDB_No", array_agg("Nutrient_name"||'|'||"Output_value" order by "Nutrient_name")
from "Nutrients_Branded_Foods_2018"
group by "NDB_No"''')

result = conn.execute(query_nutrients)

nutrient_data = [i for i in result]

nutrient_data[0:2]

#### Create a pandas dataframe and reshape it

In [None]:
nutrient_df = pd.DataFrame(nutrient_data)
nutrient_df.head(3)

Create a function to convert the list to a dictionary

In [None]:
def convert_to_dict(lst):
    output = {}
    for key_value in lst:
        key, value = key_value.split('|', 1)
        #handle duplicates by checking if key value already exists and taking average
        #not common with this data, but there are a few instances
        if key in output:
            output[key] = (output[key] + float(value))/2
        else:
            output[key] = float(value)
    return output

Update the array_agg column in the nutrients dataframe

In [None]:
nutrient_df['array_agg'] = nutrient_df['array_agg'].apply(lambda x: convert_to_dict(x))

In [None]:
nutrient_df.head(3)

Use pd.json_normalize to pivot the dataframe on the nutrient column. Inspiration:

https://stackoverflow.com/questions/38231591/split-explode-a-column-of-dictionaries-into-separate-columns-with-pandas

In [None]:
nutrients_pivoted = pd.json_normalize(nutrient_df['array_agg'])
nutrients_pivoted.fillna(0, inplace = True)
nutrients_pivoted.head()

Merge the original df with the pivoted df, and drop the array_agg column

In [None]:
nutrient_df = nutrient_df.merge(nutrients_pivoted, left_index = True, right_index = True)
del nutrient_df["array_agg"]
nutrient_df.head()

Export the matrix to compressed csv

In [None]:
nutrient_df.to_csv(nutrient_csv_p, 
                   index = False, compression = "gzip")

# If the database is down, you can import the file locally
Must download the file from https://data.nal.usda.gov/dataset/usda-branded-food-products-database

In [4]:
nutrient_df = pd.read_csv(nutrients_csv, usecols= ["NDB_No", "Nutrient_name", "Output_value"])

In [5]:
nutrient_df.head(3)

Unnamed: 0,NDB_No,Nutrient_name,Output_value
0,45127487,Protein,4.3
1,45127487,Total lipid (fat),18.9
2,45127487,"Carbohydrate, by difference",72.8


In [6]:
#convert categorical to make pivoting faster
nutrient_df["Nutrient_name"] = nutrient_df["Nutrient_name"].astype("category")

#### Before pivoting this data, it's important to note there are a few duplicate counts for product / nutrient

In [7]:
counts = nutrient_df.groupby(by = ["NDB_No", "Nutrient_name"], as_index = False).count()
over_counted_nutrients = counts[counts['Output_value']>1].copy(deep = True)
over_counted_nutrients.rename(columns = {"Output_value": "Count_of_Output_value"}, inplace = True)
over_counted_nutrients

Unnamed: 0,NDB_No,Nutrient_name,Count_of_Output_value
3910331,45095515,Energy,2
7339545,45162398,Energy,2
9451631,45189128,Energy,2
14849863,45262237,Energy,2
19535199,45343319,Energy,2
21169201,45360957,Energy,2


Take the average the over counted nutrients

In [8]:
nutrient_df = nutrient_df.pivot_table(index="NDB_No", columns = "Nutrient_name",
                    fill_value = 0,
                    values = "Output_value",
                    aggfunc ={"Output_value": np.mean})

In [9]:
nutrient_df.shape

(237837, 94)

In [10]:
nutrient_df.head()

Nutrient_name,10:0,12:0,"18:2 n-6 c,c",18:2 undifferentiated,"18:3 n-3 c,c,c (ALA)",8:0,Alanine,"Alcohol, ethyl",Arginine,Ash,...,Vitamin D,Vitamin D (D2 + D3),Vitamin D3 (cholecalciferol),Vitamin E,Vitamin E (alpha-tocopherol),Vitamin E (label entry primarily),Vitamin K (phylloquinone),Water,Xylitol,"Zinc, Zn"
NDB_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45001524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0
45001528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0
45001529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0
45001530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0
45001531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0.0


Export the matrix to csv

In [11]:
nutrient_df.reset_index().to_csv(nutrient_matrix_csv_p,
                                 index = False, compression = "gzip")