In [62]:
import pandas as pd
import numpy as np
import os
import glob
import time
import warnings

from functools import reduce

# Additional pandas settings
# Use the fully-qualified option key. The abbreviated pattern 'max_row' is
# resolved by pattern matching and raises an OptionError as soon as more than
# one registered option matches it (newer pandas also registers
# 'styler.render.max_rows').
pd.set_option('display.max_rows', None)
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation / chained-assignment warnings raised by the
# cells below. Consider narrowing the filter once those are fixed.
warnings.filterwarnings("ignore")

In [2]:
# Set the working directory
# NOTE(review): absolute, machine-specific path -- this cell fails on any other
# machine. allFilesProcess() lists the current working directory via
# os.listdir(), so the CSV files it processes are the ones in this folder.
os.chdir('/Users/aakritigupta/Desktop/Hackathon 2021/CSV Files')
os.getcwd()

'/Users/aakritigupta/Desktop/Hackathon 2021/CSV Files'

In [3]:
# Function to read in the data and keep only the 2017 records
def readData(csvFile):
  """Load one CSV file and keep the rows whose SERIALNO starts with 2017.

  Parameters
  ----------
  csvFile : str or path-like
      Passed straight to ``pd.read_csv``. The file must contain a
      ``SERIALNO`` column whose first four characters are the survey year.

  Returns
  -------
  pandas.DataFrame
      An independent copy of the 2017 rows, with ``SERIALNO`` converted to
      ``str`` and an integer ``YEAR`` column appended as the last column.
  """
  df = pd.read_csv(csvFile)

  # SERIALNO may have been parsed as an integer; work on its string form so
  # the leading four characters can be sliced off.
  df['SERIALNO'] = df['SERIALNO'].map(str)

  # Read the 4-character prefix as a number. Prefixes that are not numeric
  # become NaN and are simply filtered out below instead of aborting the load.
  year = pd.to_numeric(df['SERIALNO'].str[:4], errors='coerce')

  # Only 2017 is kept. Take an explicit copy so that later in-place edits by
  # downstream steps never operate on a view of the full file.
  kept = df.loc[year == 2017].copy()
  kept['YEAR'] = 2017

  return kept

In [4]:
# Function to drop unnecessary columns
def dropCols(df):
  """Remove unwanted columns and then duplicate rows.

  Three groups of columns are dropped:

  * names containing ``'PWGTP'`` that are at least six characters long
    (described by the original author as person weight indicators; the
    five-character ``PWGTP`` column itself is kept),
  * names whose first character is ``'F'``,
  * a fixed list of columns picked by hand.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain every column in the fixed list, otherwise ``drop`` raises
      ``KeyError``.

  Returns
  -------
  pandas.DataFrame
      New frame without those columns and without duplicate rows.
  """
  # Columns identified by walking through the dataset manually
  manual_drops = ['SERIALNO','POBP','RT','DIVISION','SPORDER','PUMA','RELSHIPP','ANC','ANC1P','ANC2P','QTRBIR','RAC2P','RAC3P','OC','RC','ENG','JWRIP','MARHYP','WKWN','YOEP','DECADE','DRIVESP',"JWAP","JWDP",'LANP','NAICSP','MIGPUMA','MIGSP','MSP','NOP','PAOC','POWPUMA','POWSP','SCIENGP','SCIENGRLP','SOCP','VPS','CITWP']

  # Columns selected by name pattern, in the order they appear in the frame
  pattern_drops = [
    name for name in df.columns
    if ('PWGTP' in name and len(name) >= 6) or name[0] == 'F'
  ]

  trimmed = df.drop(columns=pattern_drops + manual_drops)
  return trimmed.drop_duplicates()

In [5]:
# Function to impute missing values - Numerical and Categorical features
def missingVals(df):
  """Fill missing values column by column, modifying ``df`` in place.

  * Columns in the fixed numerical list: NaN is replaced by the column mean.
  * Every other column: NaN is replaced by the string ``'NoInput'`` and the
    column is converted to ``category`` dtype.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain every column of the numerical list (``KeyError``
      otherwise).

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, after imputation.
  """
  # Create a list of numerical columns
  Numerical = ['PWGTP','AGEP','INTP','JWMNP','OIP','PAP','RETP','SEMP','SSIP','SSP','WAGP','WKHP','INDP','PERNP','PINCP','POVPIP']

  # Assign the filled column back instead of calling fillna(inplace=True) on
  # the result of df[col]: the chained in-place form acts on a temporary and
  # is not guaranteed to reach df (it does nothing under Copy-on-Write).
  for col in Numerical:
    df[col] = df[col].fillna(df[col].mean())

  # Everything that is not numerical is treated as categorical
  for col in df.columns:
    if col not in Numerical:
      df[col] = df[col].fillna('NoInput').astype('category')

  return df

In [6]:
# Function to encode categorical variables
def oheCat(df):
  """One-hot encode every non-numerical column except ``YEAR`` and ``ST``.

  ``YEAR`` and ``ST`` are set aside, all remaining columns that are not in
  the fixed numerical list are expanded with ``pd.get_dummies`` (dummy names
  are ``<column>_<value>``), and ``YEAR`` / ``ST`` are appended again as the
  last two columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``YEAR`` and ``ST``.

  Returns
  -------
  pandas.DataFrame
      Numerical columns first, then the dummy columns, then ``YEAR``, ``ST``.
  """
  # Keep YEAR and ST out of the encoding step
  year = df['YEAR']
  state = df['ST']
  features = df.drop(columns=['YEAR', 'ST'])

  # Create a list of all identified numerical variables
  Numerical = ['PWGTP','AGEP','INTP','JWMNP','OIP','PAP','RETP','SEMP','SSIP','SSP','WAGP','WKHP','INDP','PERNP','PINCP','POVPIP']

  # Everything else is categorical
  cat = [col for col in features.columns if col not in Numerical]

  # Name the columns to encode explicitly. With prefix=cat alone,
  # get_dummies only encodes object/category columns and raises a length
  # mismatch whenever one of the listed columns has a numeric dtype.
  if cat:
    encoded = pd.get_dummies(features, columns=cat, prefix=cat)
  else:
    encoded = features.copy()

  # Add the YEAR and ST columns back into the dataset
  encoded['YEAR'] = year
  encoded['ST'] = state

  return encoded

In [7]:
# Function to create the percentages in each column
def agg(df):
  """Collapse a frame to its column-level summary statistics.

  * Columns in the fixed numerical list are replaced by their mean.
  * Every other column except ``YEAR`` and ``ST`` is replaced by
    ``sum / number_of_rows`` (the share of rows set, for 0/1 dummies).
  * ``YEAR`` and ``ST`` are left as they are.

  The result has one row per distinct ``(YEAR, ST)`` combination (a single
  row when the frame holds one state and one year), keeps the index label of
  the first occurrence, and keeps the original column order. The input frame
  is not modified.

  Parameters
  ----------
  df : pandas.DataFrame

  Returns
  -------
  pandas.DataFrame
  """
  # Create the list of numerical features
  Numerical = ['PWGTP','AGEP','INTP','JWMNP','OIP','PAP','RETP','SEMP','SSIP','SSP','WAGP','WKHP','INDP','PERNP','PINCP','POVPIP']
  key_cols = ('YEAR', 'ST')

  n_rows = len(df)
  if n_rows == 0:
    # Nothing to aggregate; avoid dividing by zero below.
    return df.copy()

  # Compute each statistic once instead of broadcasting it over every row
  # and de-duplicating the whole (very wide) frame afterwards.
  summary = {}
  for col in df.columns:
    if col in key_cols:
      continue
    if col in Numerical:
      summary[col] = df[col].mean()
    else:
      summary[col] = df[col].sum() / n_rows

  # All summarised columns are constant, so the distinct rows are determined
  # by the key columns alone.
  present_keys = [col for col in df.columns if col in key_cols]
  if present_keys:
    keys = df[present_keys].drop_duplicates()
  else:
    keys = df.iloc[:1, :0]

  stats = pd.DataFrame(summary, index=keys.index)
  return pd.concat([keys, stats], axis=1)[list(df.columns)]

In [14]:
# Function to iterate through all of the files and run each step of the process
def allFilesProcess(start=41, stop=None):
  """Run the full per-file pipeline over a window of the working directory.

  The entries of the current working directory are listed in sorted order
  and numbered from 0. Entries whose position lies in ``[start, stop)`` and
  whose name ends in ``.csv`` go through
  ``readData -> dropCols -> missingVals -> oheCat -> agg``; the per-file
  results are stacked into one frame.

  Parameters
  ----------
  start : int, default 41
      First directory position to process. The default reproduces the
      previously hard-coded filter ``i > 40``.
  stop : int or None, default None
      Position at which to stop (exclusive); ``None`` means "to the end".
      Use ``start``/``stop`` to process the directory in batches instead of
      editing the function between runs.

  Returns
  -------
  pandas.DataFrame
      One row block per processed file. ``reset_index()`` is applied to each
      block before stacking, so the former row label shows up as an
      ``index`` column.

  Raises
  ------
  ValueError
      If no file in the requested window was processed.
  """
  lstOfDfs = []

  # os.listdir() returns names in arbitrary order; sort them so that a given
  # position always refers to the same file.
  for i, file in enumerate(sorted(os.listdir())):
    if i < start or (stop is not None and i >= stop):
      continue
    if not file.endswith('.csv'):
      continue

    data = readData(file)
    data2 = dropCols(data)
    data3 = missingVals(data2)
    data4 = oheCat(data3)
    data5 = agg(data4)

    # Add the df to the list
    lstOfDfs.append(data5)
    print('Data ' + str(i) + ' has been added to the list')

  if not lstOfDfs:
    raise ValueError('No CSV files found in the requested window of the working directory')

  dfs = [df.reset_index() for df in lstOfDfs]
  dfs_final = pd.concat(dfs, axis=0)
  return dfs_final

In [10]:
%%time
# First batch of state files.
# NOTE(review): the recorded output for this cell lists files 0-20, which a
# plain call with the default index filter (index > 40) cannot produce; the
# index window has to be set for this batch to reproduce the output.
valDf1 = allFilesProcess()
valDf1.shape

Data 0 has been added to the list
Data 1 has been added to the list
Data 2 has been added to the list
Data 3 has been added to the list
Data 4 has been added to the list
Data 5 has been added to the list
Data 6 has been added to the list
Data 7 has been added to the list
Data 8 has been added to the list
Data 9 has been added to the list
Data 10 has been added to the list
Data 12 has been added to the list
Data 13 has been added to the list
Data 14 has been added to the list
Data 15 has been added to the list
Data 16 has been added to the list
Data 17 has been added to the list
Data 18 has been added to the list
Data 19 has been added to the list
Data 20 has been added to the list
CPU times: user 4min 6s, sys: 1min 21s, total: 5min 27s
Wall time: 5min 40s


(20, 874)

In [13]:
%%time
# Second batch of state files.
# NOTE(review): the recorded output for this cell lists files 21-40, which a
# plain call with the default index filter (index > 40) cannot produce; the
# index window has to be set for this batch to reproduce the output.
valDf2 = allFilesProcess()
valDf2.shape

Data 21 has been added to the list
Data 22 has been added to the list
Data 23 has been added to the list
Data 24 has been added to the list
Data 25 has been added to the list
Data 26 has been added to the list
Data 27 has been added to the list
Data 28 has been added to the list
Data 29 has been added to the list
Data 30 has been added to the list
Data 31 has been added to the list
Data 32 has been added to the list
Data 33 has been added to the list
Data 34 has been added to the list
Data 35 has been added to the list
Data 36 has been added to the list
Data 37 has been added to the list
Data 38 has been added to the list
Data 39 has been added to the list
Data 40 has been added to the list
CPU times: user 6min 25s, sys: 1min 28s, total: 7min 53s
Wall time: 10min 5s


(20, 874)

In [15]:
%%time
# Third batch of state files; the recorded output lists files 41-52, which
# matches the default index filter (index > 40).
valDf3 = allFilesProcess()
valDf3.shape

Data 41 has been added to the list
Data 42 has been added to the list
Data 43 has been added to the list
Data 44 has been added to the list
Data 45 has been added to the list
Data 46 has been added to the list
Data 47 has been added to the list
Data 48 has been added to the list
Data 49 has been added to the list
Data 50 has been added to the list
Data 51 has been added to the list
Data 52 has been added to the list
CPU times: user 4min 31s, sys: 1min 6s, total: 5min 37s
Wall time: 6min 41s


(12, 873)

In [16]:
# Function to combine all of the files together
def combineFiles(dfa, dfb, dfc):
    """Stack three batch results, remove Puerto Rico and duplicate rows.

    Parameters
    ----------
    dfa, dfb, dfc : pandas.DataFrame
        Frames with an ``ST`` column. Columns missing from one frame are
        filled with NaN by the concatenation.

    Returns
    -------
    pandas.DataFrame
        New frame holding the rows whose ``ST`` is not 72, de-duplicated.
    """
    dfAll = pd.concat([dfa, dfb, dfc], axis=0)

    # Drop Puerto Rico (ST == 72) from the dataset
    without_pr = dfAll.loc[dfAll['ST'] != 72]

    # Return a new de-duplicated frame rather than calling
    # drop_duplicates(inplace=True) on a slice of dfAll.
    return without_pr.drop_duplicates()

In [17]:
# Merge the three batch results into one frame (Puerto Rico and duplicate
# rows are removed inside combineFiles) and preview it.
completeDf = combineFiles(valDf1, valDf2, valDf3)
completeDf.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,OCCP_7830.0,OCCP_1440.0,OCCP_3946.0,OCCP_7850.0,OCCP_8720.0,OCCP_2755.0,RACNUM_6,REGION_9,REGION_1,SFN_4.0
0,17890,21.279364,43.966185,3012.90029,25.997637,798.568979,40.212345,4863.615141,1879.121604,205.842786,...,,,,,,,,,,
0,135412,20.392778,41.432338,2336.570893,25.936327,673.028343,30.037991,2995.878181,1465.639596,234.462468,...,,,,,,,,,,
0,15733,19.883765,41.187754,2831.627256,18.868179,746.455196,30.79496,1389.390168,4822.05476,162.881145,...,0.000127,,,,,,,,,
0,235549,19.66356,42.089175,1924.474268,23.882141,600.256237,37.568293,3040.171642,1439.858988,316.827275,...,3.4e-05,8e-06,3.4e-05,6.8e-05,0.000272,,,,,
0,59204,20.036922,41.716085,1460.244227,22.248918,669.91337,26.070967,2159.919587,1628.94849,369.174875,...,0.0001,0.0001,0.0001,0.000134,0.000167,,,,,


In [18]:
# Sanity check: (rows, columns) of the combined frame.
completeDf.shape

(51, 876)

In [19]:
# Function to combine all of the additional datasets
def addNewData():
    """Load the five 2016 state-level indicator files and join them on State.

    Each file is read, its value column is renamed to ``<prefix>_Value``, its
    rank column is dropped, and -- for the obesity and insufficient-sleep
    files -- the percent strings in ``Value`` are converted to fractions.
    The frames are then inner-merged one after another on ``State``.

    Returns
    -------
    pandas.DataFrame
        One row per state present in all five files, with the columns
        ``State`` and the five renamed value columns (plus any other columns
        the files carry).
    """
    # (path, rename map, rank column to drop, column holding '12.3%' strings or None)
    sources = [
        ('/Users/aakritigupta/Desktop/Hackathon 2021/Poor_ Mental_health_Days_2016.csv',
         {'STATE': 'State', 'VALUE': 'pmh_Value'}, 'RANK', None),
        ('/Users/aakritigupta/Desktop/Hackathon 2021/Obesity_2016.csv',
         {'Value': 'ob_Value'}, 'Rank', 'Value'),
        ('/Users/aakritigupta/Desktop/Hackathon 2021/Insufficient_sleep_2016.csv',
         {'Value': 'isl_Value'}, 'Rank ', 'Value'),
        ('/Users/aakritigupta/Desktop/Hackathon 2021/Frequent Mental Distress_2016.csv',
         {'Value': 'fmd_Value'}, 'Rank ', None),
        ('/Users/aakritigupta/Desktop/Hackathon 2021/Air_Pollution_2016.csv',
         {'Value': 'air_Value'}, 'Rank', None),
    ]

    frames = []
    for path, renames, rank_col, pct_col in sources:
        frame = pd.read_csv(path)
        if pct_col is not None:
            # '30.4%' -> 0.304
            frame[pct_col] = frame[pct_col].str.rstrip('%').astype('float') / 100.0
        frame = frame.rename(columns=renames)
        frame = frame.drop(columns=[rank_col])
        frames.append(frame)

    # Join the datasets on the State name, left to right
    df_complete = frames[0]
    for frame in frames[1:]:
        df_complete = pd.merge(df_complete, frame, on='State')

    return df_complete

In [22]:
# Load and join the additional state-level indicator files, then preview.
newData = addNewData()
newData.head()

Unnamed: 0,State,pmh_Value,ob_Value,isl_Value,fmd_Value,air_Value
0,South Dakota,2.4,0.304,0.278,0.071,6.3
1,Hawaii,2.9,0.227,0.44,0.088,7.0
2,Minnesota,2.9,0.261,0.289,0.087,8.0
3,Nebraska,2.9,0.314,0.3,0.089,7.3
4,Iowa,3.2,0.321,0.301,0.095,8.6


In [33]:
# Function to add in the state initials to the dataset
def addStInit(df, addDf):
    """Attach state names, abbreviations, labels and extra indicators.

    Steps:

    1. Left-join the state lookup file (columns ``ST``, ``State``,
       ``LocationAbbr``) onto ``df`` by ``ST``.
    2. Left-join the label file by ``LocationAbbr``.
    3. Remove the rows for Puerto Rico (``PR``) and the District of Columbia
       (``DC``).
    4. Left-join ``addDf`` by ``State`` and drop duplicate rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated data with an ``ST`` column.
    addDf : pandas.DataFrame
        Additional features with a ``State`` column.

    Returns
    -------
    pandas.DataFrame
    """
    abbr = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/ST_Abbr_Lkp.csv', header=None)
    abbr = abbr.rename(columns={0:'ST', 1:'State', 2:'LocationAbbr'})
    # The lookup is deliberately left unfiltered. The previous filter
    # `(ST != 72) | (ST != 11)` was always true, and PR / DC rows need their
    # LocationAbbr filled in so that they can be removed in step 3.

    # Join the abbreviations into the dataset
    with_names = df.merge(abbr, how='left', on='ST')

    # Add the labels to the dataset
    label = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/cdc_data_17.csv')
    label = label.drop(columns=['Unnamed: 0'])
    labelled = with_names.merge(label, how='left', on='LocationAbbr')

    # Drop Puerto Rico and the District of Columbia. A boolean mask is used
    # because `Index | Index` is no longer a set union in current pandas.
    labelled = labelled[~labelled['LocationAbbr'].isin(['PR', 'DC'])]

    # Add in the additional data features to the dataset
    enriched = labelled.merge(addDf, how='left', on='State')

    # Drop duplicates from the dataset
    return enriched.drop_duplicates()

In [34]:
# Attach state names/abbreviations, labels and the additional indicators to
# the aggregated frame, then preview.
completeDf2 = addStInit(completeDf, newData)
completeDf2.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,REGION_1,SFN_4.0,State,LocationAbbr,Label,pmh_Value,ob_Value,isl_Value,fmd_Value,air_Value
0,17890,21.279364,43.966185,3012.90029,25.997637,798.568979,40.212345,4863.615141,1879.121604,205.842786,...,,,Delaware,DE,1.0,3.6,0.297,0.374,0.111,9.5
1,135412,20.392778,41.432338,2336.570893,25.936327,673.028343,30.037991,2995.878181,1465.639596,234.462468,...,,,Arizona,AZ,0.0,3.8,0.284,0.327,0.112,9.3
2,15733,19.883765,41.187754,2831.627256,18.868179,746.455196,30.79496,1389.390168,4822.05476,162.881145,...,,,North Dakota,ND,0.0,3.3,0.31,0.31,0.092,4.9
3,235549,19.66356,42.089175,1924.474268,23.882141,600.256237,37.568293,3040.171642,1439.858988,316.827275,...,,,Ohio,OH,1.0,3.9,0.298,0.371,0.12,10.2
4,59204,20.036922,41.716085,1460.244227,22.248918,669.91337,26.070967,2159.919587,1628.94849,369.174875,...,,,Arkansas,AR,1.0,4.7,0.345,0.364,0.149,7.5


In [35]:
# Function to impute missing values on aggregated dataset
def missVals(df):
    """Replace every missing value in ``df`` with ``0.0``, in place.

    Only columns that actually contain missing values are touched. Note that
    this applies to all columns, so missing entries in text columns are also
    replaced by the float ``0.0``.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df.columns:
        if df[col].isnull().any():
            # Assign back instead of fillna(inplace=True) on df[col]: the
            # chained in-place form acts on a temporary and is not guaranteed
            # to reach df (it does nothing under Copy-on-Write).
            df[col] = df[col].fillna(0.0)

    return df

In [36]:
# Fill the remaining gaps with 0.0. missVals edits the frame it is given and
# returns that same object, so valData3 and completeDf2 are the same frame.
valData3 = missVals(completeDf2)
valData3.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,REGION_1,SFN_4.0,State,LocationAbbr,Label,pmh_Value,ob_Value,isl_Value,fmd_Value,air_Value
0,17890,21.279364,43.966185,3012.90029,25.997637,798.568979,40.212345,4863.615141,1879.121604,205.842786,...,0.0,0.0,Delaware,DE,1.0,3.6,0.297,0.374,0.111,9.5
1,135412,20.392778,41.432338,2336.570893,25.936327,673.028343,30.037991,2995.878181,1465.639596,234.462468,...,0.0,0.0,Arizona,AZ,0.0,3.8,0.284,0.327,0.112,9.3
2,15733,19.883765,41.187754,2831.627256,18.868179,746.455196,30.79496,1389.390168,4822.05476,162.881145,...,0.0,0.0,North Dakota,ND,0.0,3.3,0.31,0.31,0.092,4.9
3,235549,19.66356,42.089175,1924.474268,23.882141,600.256237,37.568293,3040.171642,1439.858988,316.827275,...,0.0,0.0,Ohio,OH,1.0,3.9,0.298,0.371,0.12,10.2
4,59204,20.036922,41.716085,1460.244227,22.248918,669.91337,26.070967,2159.919587,1628.94849,369.174875,...,0.0,0.0,Arkansas,AR,1.0,4.7,0.345,0.364,0.149,7.5


In [55]:
# Function to restrict the dataset to the features used in training
def filterDf(df):
    """Subset ``df`` to the feature columns of the training statistics file.

    The training file is downloaded, its feature column names (everything
    except ``Unnamed: 0``, ``Unnamed: 0.1`` and ``Label_y``) are used to
    select columns from ``df``, and the selection is written to
    ``Validation_2017.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``State`` and every feature column of the training
        file (``KeyError`` otherwise).

    Returns
    -------
    tuple
        ``(features, labels, states)`` where ``features`` is the column
        subset of ``df``, ``labels`` is the ``Label_y`` column of the
        training file and ``states`` is ``df['State']``.

    NOTE(review): the returned labels come from the downloaded training
    file, not from ``df`` -- confirm that its rows line up with the rows of
    ``df`` before comparing them with predictions made on ``df``.
    """
    # Download the training statistics
    train_stats = pd.read_csv('https://raw.githubusercontent.com/aagupta/MentalAid/main/stats_train.csv')

    # Feature names = all training columns minus the index and label columns
    feature_frame = train_stats.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Label_y'])
    labels = train_stats['Label_y']
    states = df['State']
    feature_names = feature_frame.columns.tolist()

    # Keep only those features, in the training column order
    selected = df[feature_names]

    # Write the dataset
    selected.to_csv('/Users/aakritigupta/Desktop/Hackathon 2021/MentSea/MentSea/Validation_2017.csv')

    return (selected, labels, states)

In [56]:
# Select the model features (this also downloads the training statistics and
# writes Validation_2017.csv), keeping the labels and state names alongside.
(valData4, label, state) = filterDf(valData3)
valData4.head()

Unnamed: 0,JWTRNS_8.0,HISP_2,HISP_4,HISP_21,HISP_24,OCCP_6240.0,OCCP_9830.0,RAC1P_3,RAC1P_7,RAC1P_9,...,OIP,PAP,RETP,SEMP,SSIP,SSP,WAGP,PERNP,PINCP,POVPIP
0,0.000448,0.030792,0.00168,0.0,0.001903,0.000448,0.001344,0.002239,0.00056,0.023738,...,798.568979,40.212345,4863.615141,1879.121604,205.842786,4434.998681,29790.25244,32049.812333,45025.512266,342.292926
1,0.001974,0.228045,0.001755,0.000395,0.009155,0.000936,0.000892,0.073387,0.002091,0.037673,...,673.028343,30.037991,2995.878181,1465.639596,234.462468,3658.658644,25751.907054,27638.036299,37146.183171,300.314163
2,0.000381,0.016006,0.000254,0.0,0.001397,0.000508,0.001778,0.046621,0.000889,0.018293,...,746.455196,30.79496,1389.390168,4822.05476,162.881145,3218.626322,27543.108276,32747.248229,40744.938239,336.262899
3,0.000424,0.013944,0.000662,0.000323,0.001613,0.000314,0.000204,0.00118,0.000458,0.023569,...,600.256237,37.568293,3040.171642,1439.858988,316.827275,3420.94177,26213.031916,28049.214735,36993.130397,312.937567
4,0.000535,0.041433,0.000234,0.0001,0.002506,0.000635,0.000601,0.004979,0.002038,0.028234,...,669.91337,26.070967,2159.919587,1628.94849,369.174875,3903.197562,20690.602051,22667.264103,30908.071129,277.760004


## Run code after predictions have been made

In [65]:
# Function to add label back into the dataset to compare
def addLabel(lst, stateLst):
    """Combine saved predictions with labels and state names, then save.

    Reads ``Predictions_2017.csv``, adds the columns ``Label``, ``Match?``
    (the strings ``'True'`` / ``'False'``) and ``State``, and writes the
    result to ``PredWLabel_2017.csv``.

    Parameters
    ----------
    lst : sequence or pandas.Series
        Labels, one per prediction row. A Series is aligned on its index.
    stateLst : sequence or pandas.Series
        State names, one per prediction row. A Series is aligned on its
        index.

    Returns
    -------
    None
    """
    # Read in the dataset
    pred = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/MentSea/MentSea/Predictions_2017.csv')

    # Drop unnecessary columns
    pred = pred.drop(columns=['Unnamed: 0'])

    # Join the label to the dataset
    pred['Label'] = lst

    # Add a condition if values don't match
    pred['Match?'] = np.where(pred['Predictions'] != pred['Label'], 'False', 'True')

    # Add in the states to the dataset. Use the argument: the previous code
    # ignored stateLst and read a global variable named `state` instead.
    pred['State'] = stateLst

    # Write df to csv
    pred.to_csv('/Users/aakritigupta/Desktop/Hackathon 2021/MentSea/MentSea/PredWLabel_2017.csv')

In [66]:
%%time
# Join labels and state names onto the saved predictions; the result is
# written to PredWLabel_2017.csv (addLabel returns nothing).
addLabel(label, state)

CPU times: user 12.8 ms, sys: 4.37 ms, total: 17.1 ms
Wall time: 19.6 ms
