### Updating the hpf_component table

In [2]:
import os
import pyodbc
import pandas as pd
import numpy as np
import sqlalchemy as sal

from sqlalchemy import text

## Connect to the data
Downloaded the 2021 access database for full data from food data central: https://fdc.nal.usda.gov/download-datasets.html

#### Make a DF of ndb and gtin_upc

In [3]:
cwd = os.getcwd()

cnxn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=E:\OneDrive\Documents\Classes\DSE Capstone\Data\USDA branded food products database\BFPD\BFPD_07132018.accdb;')

query = '''SELECT NDB_Number, gtin_upc
FROM Products;
'''

ndb_gtin = pd.read_sql(query, cnxn)

cnxn.close()



In [4]:
ndb_gtin['NDB_Number'] = ndb_gtin['NDB_Number'].astype(int)

In [5]:
print(ndb_gtin.shape)
ndb_gtin.head()

(239089, 2)


Unnamed: 0,NDB_Number,gtin_upc
0,45001524,19022128593
1,45001528,5051379043735
2,45001529,5051379009434
3,45001530,5051379019969
4,45001531,5051379009526


#### Make a DF of fdc_id  and gtin_upc

In [6]:
cnxn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=E:\OneDrive\Documents\Classes\DSE Capstone\Data\USDA branded food products database\FoodData Central\FoodData_Central_access.accdb;')

query = '''SELECT fdc_id, gtin_upc
FROM branded_food;
'''

fdc_gtin = pd.read_sql(query, cnxn)

cnxn.close()



In [7]:
print(fdc_gtin.shape)
fdc_gtin.head()

(373897, 2)


Unnamed: 0,fdc_id,gtin_upc
0,344646,38000162367
1,344922,84059100010
2,345313,10030100800337
3,345315,10030100800467
4,345321,10030100800344


In [8]:
fdc_gtin_ndb = fdc_gtin.merge(ndb_gtin, left_on = "gtin_upc", right_on = "gtin_upc", how = "inner")
fdc_gtin_ndb

Unnamed: 0,fdc_id,gtin_upc,NDB_Number
0,344646,00038000162367,45127070
1,344922,00084059100010,45127725
2,345313,10030100800337,45128624
3,345315,10030100800467,45128628
4,345321,10030100800344,45128640
...,...,...,...
239026,2127408,753656715339,45376397
239027,2127409,035046102500,45376401
239028,2182684,857451000314,45350274
239029,2183011,4710172030014,45090367


#### Connect to the nourish database and pull hpf_component data

In [9]:
nourish_user = "gmichael"

nourish_pswd = "567khcwx3s"

engine = sal.create_engine('postgresql+psycopg2://' + nourish_user + ':' + nourish_pswd + '@awesome-hw.sdsc.edu/nourish')
conn = engine.connect()

#### Pull the list of nutrients by product
Use array_agg to get a list of nutrients and values. Delimit by '|'.

In [10]:
query_nutrients = text('''select * from hpf_component''')

result = conn.execute(query_nutrients)

hpf_data = [i for i in result]

hpf_data[0:2]

[(45162472, False, True, False), (45085807, False, True, False)]

In [11]:
hpf_df = pd.DataFrame(hpf_data)
print(hpf_df.shape)
hpf_df.head()

(59040, 4)


Unnamed: 0,NDB_No,fsdo,fs,csdo
0,45162472,False,True,False
1,45085807,False,True,False
2,45351070,False,False,True
3,45248412,False,False,True
4,45152103,False,True,False


In [15]:
hpf_with_fdcid = hpf_df.merge(fdc_gtin_ndb, left_on = "NDB_No", right_on = "NDB_Number", how = "left")
hpf_with_fdcid.drop(columns = ["NDB_Number"], inplace = True)
hpf_with_fdcid.head()

Unnamed: 0,NDB_No,fsdo,fs,csdo,fdc_id,gtin_upc
0,45162472,False,True,False,1880619.0,809424900756
1,45085807,False,True,False,1864950.0,185255000231
2,45351070,False,False,True,1942022.0,846107018070
3,45248412,False,False,True,2087698.0,29000017986
4,45152103,False,True,False,1876804.0,66909100548


In [20]:
hpf_with_fdcid_noblanks = hpf_with_fdcid.dropna(subset = "fdc_id")
hpf_with_fdcid_noblanks['fdc_id'] = hpf_with_fdcid_noblanks['fdc_id'].astype(int)
hpf_with_fdcid_noblanks.to_csv('hpf_component_with_fdcid.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpf_with_fdcid_noblanks['fdc_id'] = hpf_with_fdcid_noblanks['fdc_id'].astype(int)


In [13]:
sum(hpf_with_fdcid.fdc_id.isna())

9

#### Try to find the missing fdc_ids

In [18]:
ndb_missing_fdc = hpf_with_fdcid[hpf_with_fdcid.fdc_id.isna()]["NDB_No"].to_list()
#do a set because there are dupes
set(ndb_missing_fdc)

{45086702, 45157012, 45240564, 45282135, 45295897, 45304209}

In [16]:
ndb_gtin[ndb_gtin["NDB_Number"].isin(ndb_missing_fdc)]

Unnamed: 0,NDB_Number,gtin_upc
37308,45086702,25016673
74058,45157012,76410904108
145772,45240564,25016666
175847,45282135,76410902838
186639,45295897,3800031310
187062,45304209,3812008


In [21]:
gtins = ndb_gtin[ndb_gtin["NDB_Number"].isin(ndb_missing_fdc)]["gtin_upc"].to_list()