In [157]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [158]:
#Root directory of the downloaded NHANES CSV files. Keep the trailing slash:
#the per-cycle glob patterns are appended to it by plain string concatenation.
#Use a './' prefix if running from run_notebooks.py
#Use a '../' prefix if running directly from Jupyter Notebooks

file_beg = '../NHANES-Downloader/data/csv_data/'

In [159]:
#Collect the CSV paths of every survey cycle: files1 holds 1999-2000,
#files2 holds 2001-2002, ... files10 holds 2017-2018
(files1, files2, files3, files4, files5,
 files6, files7, files8, files9, files10) = (
    glob.glob(file_beg + cycle + '/*/*.csv')
    for cycle in ('1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008',
                  '2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018')
)

In [160]:
#Per-cycle path lists gathered into one list, oldest cycle first
file_list = [
    files1, files2,
    files3, files4,
    files5, files6,
    files7, files8,
    files9, files10,
]

In [161]:
#Sort each cycle's path list in place, then store it as a
#{position: path} dictionary; one dictionary per cycle
file_list_dict = []
for cycle_paths in file_list:
    cycle_paths.sort()
    position_to_path = {pos: path for pos, path in enumerate(cycle_paths)}
    file_list_dict.append(position_to_path)

In [162]:
#Position of the file to load within each cycle's sorted path list
#(one entry per cycle, same order as file_list).
#NOTE(review): presumably these point at the MCQ questionnaire file of each cycle;
#the positions depend on the exact directory contents - verify if files are added or removed.
mcq_indx = [81, 99, 112, 108, 110, 117, 121, 150, 117, 136]

In [163]:
#Load the selected CSV of every cycle, keyed by cycle position (0 = first cycle).
#A plain dict is used on purpose: looking up a cycle that was never loaded
#raises KeyError instead of silently returning the integer 0, which is what
#defaultdict(int) would hand back in place of a DataFrame.
dfs = {
    i: pd.read_csv(file_list_dict[i][file_pos])
    for i, file_pos in enumerate(mcq_indx)
}

## Functions: Recategorize values, Count Values, Drop Rows

In [164]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Replace values of one column of ``df`` according to a mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is updated on this object.
    name : str
        Label of the column whose values are replaced.
    replace_dict : dict
        Mapping of old value -> new value, passed to ``Series.replace``.

    Returns
    -------
    None
    """
    # Assign the result back to the frame. Calling replace(inplace=True) on
    # df[name] only mutates an intermediate Series, which does not update df
    # when pandas Copy-on-Write is active.
    df[name] = df[name].replace(to_replace=replace_dict)
    
#Count values function
def count_vals(df, name):
    """Print how many distinct SEQN values fall under each value of ``name``.

    The per-value tally is followed by the number of rows in which
    ``name`` is null. Nothing is returned; the summary is only printed.
    """
    unique_ids_per_value = df.groupby(name)['SEQN'].nunique()
    missing_total = df[name].isnull().sum()
    print(unique_ids_per_value, "\n\n", "NaN: ", missing_total)
    
#Drop rows that include certain values
def drop_rows(df, name, val_list):
    """Remove, in place, every row of ``df`` whose ``name`` value is in ``val_list``.

    Rows are selected with ``isin`` and removed by their index labels.
    The frame is modified directly and nothing is returned.
    """
    unwanted = df[name].isin(val_list)
    labels_to_remove = df.loc[unwanted].index
    df.drop(index=labels_to_remove, inplace=True)

## Important Variables

In [165]:
#1999-2018: columns kept from every cycle's file.
#SEQN later becomes the MongoDB _id; MCQ010 is "ever been told you have asthma".
var_names = ["SEQN", "MCQ010"]

In [166]:
#To display all columns in Jupyter Notebooks
#pd.set_option('display.max_columns', 500)

## Make a copy of the dataframes

In [167]:
#1999-2018: independent copies restricted to the selected columns.
#df0 is 1999-2000, df1 is 2001-2002, ... df9 is 2017-2018 (two-year steps)
(df0, df1, df2, df3, df4,
 df5, df6, df7, df8, df9) = (
    dfs[cycle_pos][var_names].copy() for cycle_pos in range(10)
)

## Recategorize values

In [168]:
#No recategorizing needed

## Rename columns 1999-2000

In [169]:
#col_names refers to the same list object as var_names (no copy is made)
col_names = var_names
#No renaming needed

## Append years 1999 - 2018

In [170]:
#Labels of the ten two-year survey cycles, "1999-2000" through "2017-2018"
years = [str(first) + "-" + str(first + 1) for first in range(1999, 2018, 2)]

In [171]:
#Per-cycle frames in the same order as `years`, so positions line up
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9]

In [172]:
#Tag every per-cycle frame with its cycle label in a new "Year" column
for df, cycle_label in zip(frames, years):
    df["Year"] = cycle_label

In [173]:
#Stack all cycles into one frame; keys=years adds the cycle label as the outer index level
result_1999_2018 = pd.concat(frames, keys = years)

In [174]:
#Clean a copy so result_1999_2018 keeps every row for the before/after comparison
result_1999_2018_cleaned = result_1999_2018.copy()

## Count values for each column

In [175]:
#Zero-row slice: displays only the column layout of the combined frame
result_1999_2018_cleaned[:0]

Unnamed: 0,Unnamed: 1,SEQN,MCQ010,Year


In [176]:
#Number of selected columns
len(col_names)

2

In [177]:
#MCQ010 - Ever been told you have asthma
#Prints the number of distinct SEQN per answer code, then the count of null answers
count_vals(result_1999_2018_cleaned, col_names[1])

MCQ010
1.0    13773
2.0    82923
7.0        5
9.0      107
Name: SEQN, dtype: int64 

 NaN:  3


In [178]:
#First row of the unfiltered combined frame
result_1999_2018[:1]

Unnamed: 0,Unnamed: 1,SEQN,MCQ010,Year
1999-2000,0,1.0,2.0,1999-2000


In [179]:
#Total number of rows before any cleaning
len(result_1999_2018)

96811

## Remove missing values:

In [180]:
#Treat NaN and the codes 7 and 9 as missing and drop those rows in place.
#NOTE(review): presumably 7 = refused and 9 = don't know in the NHANES codebook - confirm.
drop_rows(result_1999_2018_cleaned, col_names[1], [np.nan, 7, 9])

## See if missing values have been correctly removed:

In [181]:
#Re-run the tally on the cleaned frame: codes 7 and 9 should be gone and NaN should be 0
count_vals(result_1999_2018_cleaned, col_names[1])

MCQ010
1.0    13773
2.0    82923
Name: SEQN, dtype: int64 

 NaN:  0


In [182]:
#Row count of the unfiltered frame
before = len(result_1999_2018)
before

96811

In [183]:
#Row count once the missing-value rows have been dropped
after = len(result_1999_2018_cleaned)
after

96696

In [184]:
#Rows removed, expressed as a fraction of the cleaned row count
#NOTE(review): the denominator is `after`, not `before` - confirm this is the intended ratio
(before-after)/after

0.0011892942831140895

## MongoDB Insertion

In [185]:
#Import MongoClient
#NOTE(review): consider moving this import to the imports cell at the top of the notebook
from pymongo import MongoClient

#Create a client for the MongoDB server expected at localhost:27017
#(this connects to a server; it does not start one)
client = MongoClient('localhost', 27017)

In [186]:
#Get a handle on the NHANES database
db = client.NHANES

In [187]:
#Display the database handle
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [188]:
#Collections currently present in the database
db.list_collection_names()

['hiq',
 'alq',
 'drxtot',
 'bpq',
 'bmx',
 'huq',
 'bpx',
 'demo_p',
 'mcq_a',
 'demo',
 'diq']

In [189]:
#Get a handle on the mcq_a collection of the database
mcq_a = db.mcq_a

In [190]:
#If the collection already exists, drop it so the insert further down
#starts from an empty collection; otherwise just report that it is absent.
#(The former bare db.list_collection_names() call inside the branch was removed:
#its result was discarded, so it only cost an extra server round trip.)
if 'mcq_a' in db.list_collection_names():
    mcq_a.drop()
else:
    print("Collections doesn't exist yet")

## Create new collection to input into database

In [191]:
#First three rows of the cleaned frame
result_1999_2018_cleaned[:3]

Unnamed: 0,Unnamed: 1,SEQN,MCQ010,Year
1999-2000,0,1.0,2.0,1999-2000
1999-2000,1,2.0,2.0,1999-2000
1999-2000,2,3.0,2.0,1999-2000


In [192]:
#Set SEQN as _id (Primary Key)
#The column is renamed in place on the cleaned frame; its values stay floats (e.g. 1.0)
result_1999_2018_cleaned.rename(columns={'SEQN':'_id'}, inplace=True)

In [193]:
#Dataframe to a list of dictionaries, one dictionary per row (orient='records')
mcq_a_dict = result_1999_2018_cleaned.to_dict(orient='records')

In [194]:
#First record, as it will be sent to MongoDB
mcq_a_dict[0]

{'_id': 1.0, 'MCQ010': 2.0, 'Year': '1999-2000'}

In [195]:
#Insert every record into the mcq_a collection in one call
mcq_a.insert_many(mcq_a_dict)

<pymongo.results.InsertManyResult at 0x1203e0910>

In [196]:
#View the collections present after the insert
db.list_collection_names()

['hiq',
 'alq',
 'drxtot',
 'bpq',
 'bmx',
 'huq',
 'bpx',
 'demo_p',
 'mcq_a',
 'demo',
 'diq']

In [197]:
# for m in mcq_a.find():
#     print(m)