In [836]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [837]:
#Relative path prefix of the folder that holds the per-cycle CSV data.
#Use one dot ('./') if running from run_notebooks.py
#Use two dots ('../') if running directly from Jupyter Notebooks
#file_beg = '../NHANES-Downloader/data/csv_data/'

file_beg = '../NHANES-Downloader/data/csv_data/'

In [838]:
#Collect the CSV paths of every survey cycle, one list of paths per cycle
cycle_folders = ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008',
                 '2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018']

#Each pattern is <file_beg><cycle>/<any sub-folder>/<any .csv file>
(files1, files2, files3, files4, files5,
 files6, files7, files8, files9, files10) = [
    glob.glob(file_beg + folder + '/*/*.csv') for folder in cycle_folders
]

In [839]:
#Add the per-cycle path lists into one list, ordered by survey cycle
#(files1 = 1999-2000 ... files10 = 2017-2018)
file_list = [files1, files2, files3, files4, files5, 
          files6, files7, files8, files9, files10]

In [840]:
#Sort each cycle's paths in place, then key every path by its position in that sorted order
file_list_dict = []
for cycle_paths in file_list:
    cycle_paths.sort()
    file_list_dict.append({position: path for position, path in enumerate(cycle_paths)})

In [841]:
#Position, within each cycle's sorted file list, of the file to load (one entry per cycle, 1999-2000 first)
#NOTE(review): these are hard-coded positions, so they presumably go stale if files are
#added to or removed from a cycle folder - confirm after re-downloading the data
smq_indx = [96, 113, 127, 121, 122, 129, 133, 162, 128, 185]



In [842]:
#Load the selected file of each survey cycle, keyed by cycle position (0 = 1999-2000).
#A plain dict is used so that asking for a cycle that was never loaded raises KeyError
#instead of silently yielding (and storing) a placeholder 0.
dfs = {
    cycle: pd.read_csv(file_list_dict[cycle][file_position])
    for cycle, file_position in enumerate(smq_indx)
}

## Functions: Recategorize values, Count Values, Drop Rows

In [843]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Replace values in one column of ``df`` according to ``replace_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is updated on this object.
    name : str
        Label of the column whose values are recoded.
    replace_dict : dict
        Mapping passed to ``Series.replace`` as ``to_replace``
        (old value -> new value).

    Returns
    -------
    None
    """
    # Assign the result back to the column rather than calling
    # ``replace(inplace=True)`` on ``df[name]``: that form mutates an
    # intermediate Series, which pandas warns about and which leaves ``df``
    # unchanged when Copy-on-Write is active.
    df[name] = df[name].replace(to_replace=replace_dict)
    
#Count values function
def count_vals(df, name):
    """Print how many distinct SEQN values fall under each value of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``SEQN`` column and the column ``name``.
    name : str
        Label of the column to summarise.

    Prints the per-value count of unique ``SEQN`` entries followed by the
    number of null entries in ``name``. Returns None.
    """
    missing_total = df[name].isna().sum()
    unique_ids_per_value = df.groupby(name)["SEQN"].nunique()
    print(unique_ids_per_value, "\n\n", "NaN: ", missing_total)
    
#Drop rows that include certain values
def drop_rows(df, name, val_list):
    """Remove, in place, every row of ``df`` whose ``name`` value is in ``val_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    name : str
        Label of the column that is tested.
    val_list : list
        Values that mark a row for removal.

    Rows are removed by index label. Returns None.
    """
    is_unwanted = df[name].isin(val_list)
    unwanted_labels = df.loc[is_unwanted].index
    df.drop(index=unwanted_labels, inplace=True)

## Important Variables

In [844]:
#Columns to keep from each cycle's file; only the second name differs between cycles.
#NOTE(review): SMD680 / SMQ680 / SMQ681 are presumably the same question under different
#names, since all three are later relabelled to one common name - confirm against the codebooks.

#1999-2000
var_names = ["SEQN", "SMD680", "SMAQUEX"]

#2001-2012
var_names1 = ["SEQN", "SMQ680", "SMAQUEX"]

#2013-2018
var_names2 = ["SEQN", "SMQ681", "SMAQUEX"]

In [845]:
#To display all columns in Jupyter Notebooks (raises the display limit to 500 columns)
pd.set_option('display.max_columns', 500)

## Make a copy of the dataframes

In [846]:
#1999-2018: keep only the columns of interest, using the name list that applies to each cycle
columns_per_cycle = (
    [var_names]           #1999-2000
    + [var_names1] * 6    #2001-2002 ... 2011-2012
    + [var_names2] * 3    #2013-2014 ... 2017-2018
)

#Independent copies, so later edits do not touch the loaded dataframes in dfs
(df0, df1, df2, df3, df4,
 df5, df6, df7, df8, df9) = [
    dfs[cycle][wanted].copy() for cycle, wanted in enumerate(columns_per_cycle)
]

## Recategorize values

In [847]:
#No recategorizing is needed

## Rename columns 1999 - 2000; 2013 - 2018

In [848]:
#Col names: the 2001-2012 names serve as the common column names
#(col_names refers to the same list object as var_names1, not a copy)
col_names = var_names1

In [849]:
#1999 - 2000 (df0) and 2013 - 2018 (df7, df8, df9) were selected with other variable names;
#relabel their columns with the common names
for relabelled_df in (df0, df7, df8, df9):
    relabelled_df.columns = col_names

## Append years 1999 - 2018

In [850]:
#Survey cycle labels, oldest first; used as the Year column values and as the concat keys
years = ["1999-2000","2001-2002","2003-2004","2005-2006","2007-2008", 
        "2009-2010","2011-2012","2013-2014","2015-2016","2017-2018"]

In [851]:
#Per-cycle dataframes in chronological order (df0 = 1999-2000 ... df9 = 2017-2018)
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9]

In [852]:
#Add years as a column (frames and years are in the same cycle order)
for cycle_df, cycle_label in zip(frames, years):
    cycle_df["Year"] = cycle_label

In [853]:
#Stack all cycles into one dataframe; keys=years adds the cycle label as the outer index level
result_1999_2018 = pd.concat(frames, keys = years)

In [854]:
#Work on a copy so the unfiltered result_1999_2018 stays available for the before/after comparison
result_1999_2018_cleaned = result_1999_2018.copy()

## Count values for each column

In [855]:
#Number of common column names (SEQN plus the two variables counted below)
len(col_names)

3

In [856]:
#SMQ680 - Used tobacco/nicotine last 5 days?
#Unique SEQN count per answer code, plus the number of missing answers
count_vals(result_1999_2018, col_names[1])

SMQ680
1.0    13570
2.0    49594
7.0        9
9.0        6
Name: SEQN, dtype: int64 

 NaN:  5953


In [857]:
#SMAQUEX - Questionnaire Mode Flag
#Unique SEQN count per flag value, plus the number of missing values
count_vals(result_1999_2018, col_names[2])

SMAQUEX
1.0    15869
2.0    53263
Name: SEQN, dtype: int64 

 NaN:  0


In [858]:
#Peek at the first row of the combined dataframe
result_1999_2018[:1]

Unnamed: 0,Unnamed: 1,SEQN,SMQ680,SMAQUEX,Year
1999-2000,0,2.0,2.0,2.0,1999-2000


In [859]:
#Total number of rows before any rows are removed
len(result_1999_2018)

69132

## Remove missing values (NaN) and answer codes 7 and 9:

In [860]:
#Drop rows whose SMQ680 answer is missing (NaN) or coded 7 or 9
#NOTE(review): 7 and 9 are presumably the "Refused" / "Don't know" codes - confirm against the codebook
drop_rows(result_1999_2018_cleaned, col_names[1], [np.nan, 7, 9])

## See if missing values have been correctly removed:

In [861]:
#Re-count on the cleaned copy: codes 7 and 9 and the NaN entries should no longer appear
count_vals(result_1999_2018_cleaned, col_names[1])

SMQ680
1.0    13570
2.0    49594
Name: SEQN, dtype: int64 

 NaN:  0


In [862]:
#Row count of the unfiltered dataframe
before = len(result_1999_2018)
before

69132

In [863]:
#Row count of the cleaned dataframe
after = len(result_1999_2018_cleaned)
after

63164

In [864]:
#Number of dropped rows relative to the number of rows that remain
#NOTE(review): the denominator is `after`; if the share of the original rows that was
#dropped is what is wanted, the denominator should be `before` - confirm intent
(before-after)/after

0.09448419986068013

## MongoDB Insertion

In [865]:
#Import MongoClient
from pymongo import MongoClient

#Create a MongoClient pointing at the MongoDB instance on localhost, port 27017
client = MongoClient('localhost', 27017)

In [866]:
#Connect to the existing NHANES database
db = client.NHANES

In [867]:
#Show the database handle
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [868]:
#List the collections currently in the database
db.list_collection_names()

['hiq',
 'alq',
 'drxtot',
 'smq',
 'bpq',
 'bmx',
 'huq',
 'bpx',
 'mcq_h',
 'paq',
 'demo_p',
 'mcq_a',
 'mcq_b',
 'rdq',
 'demo',
 'diq',
 'mcq_c']

In [869]:
#Get a handle to the smq collection in the database
#(MongoDB creates a collection lazily, when the first document is inserted)
smq = db.smq

In [870]:
#If the collection already exists, drop it so the insert further down starts from an empty collection
if 'smq' in db.list_collection_names():
    smq.drop()
else:
    print("Collection doesn't exist yet")

## Create new collection to input into database

In [871]:
#Peek at the first three rows of the cleaned dataframe
result_1999_2018_cleaned[:3]

Unnamed: 0,Unnamed: 1,SEQN,SMQ680,SMAQUEX,Year
1999-2000,0,2.0,2.0,2.0,1999-2000
1999-2000,1,5.0,1.0,2.0,1999-2000
1999-2000,2,6.0,2.0,1.0,1999-2000


In [872]:
#Set SEQN as _id (Primary Key) by relabelling the column
result_1999_2018_cleaned = result_1999_2018_cleaned.rename(columns={'SEQN': '_id'})

In [873]:
#Dataframe to a list of dictionaries, one per row (despite the _dict suffix, smq_dict is a list)
smq_dict = result_1999_2018_cleaned.to_dict(orient='records')

In [874]:
#Inspect the first record
smq_dict[0]

{'_id': 2.0, 'SMQ680': 2.0, 'SMAQUEX': 2.0, 'Year': '1999-2000'}

In [875]:
#Insert all records into the smq collection
smq.insert_many(smq_dict)

<pymongo.results.InsertManyResult at 0x15915c5b0>

In [876]:
#View collections to confirm that smq is present after the insert
db.list_collection_names()

['hiq',
 'alq',
 'drxtot',
 'smq',
 'bpq',
 'bmx',
 'huq',
 'bpx',
 'mcq_h',
 'paq',
 'demo_p',
 'mcq_a',
 'mcq_b',
 'rdq',
 'demo',
 'diq',
 'mcq_c']

In [877]:
# for s in smq.find():
#     print(s)