In [1]:
import pandas as pd
import os
import glob
import time
import warnings

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.metrics import accuracy_score

# Packages for RFE (if we decide to try this out)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Additional pandas settings
pd.set_option('max_row', None)
warnings.filterwarnings("ignore")

In [2]:
# Set the working directory
os.chdir('/Users/aakritigupta/Desktop/Hackathon 2021/CSV Files')
os.getcwd()

'/Users/aakritigupta/Desktop/Hackathon 2021/CSV Files'

In [3]:
# Check to see if all files are being added to the list
all_files = glob.glob(os.getcwd() + "/*.csv")
# print(all_files)

In [4]:
# Function to read in the data and clean it up 
def readData(csvFile):
  # Read in dataset
  df = pd.read_csv(csvFile)

  # Extract YEAR out of column and store in new column
  df.SERIALNO = df.SERIALNO.map(str)
  df['YEAR'] = df.SERIALNO.str[:4]

  # Convert YEAR to datetime object and filter dataset to only keep 2016 and 2017 data
  df['YEAR'] = pd.to_datetime(df['YEAR']).dt.year
  df2 = df[(df['YEAR'] == 2016)]
  
  return df2

In [5]:
# Function to drop unnecessary columns
def dropCols(df):
  # Create a list of all columns that need to be dropped
  drop_cols = []

  # Add person weight indicators to the list
  for i in df.columns:
    if ('PWGTP' in i) & (len(i) >= 6) or  (i[0]=='F'):
      drop_cols.append(i)

  # Add the specific columns identified after walking through the dataset manually
  drop_cols.extend(['SERIALNO','POBP','RT','DIVISION','SPORDER','PUMA','RELSHIPP','ANC','ANC1P','ANC2P','QTRBIR','RAC2P','RAC3P','OC','RC','ENG','JWRIP','MARHYP','WKWN','YOEP','DECADE','DRIVESP',"JWAP","JWDP",'LANP','NAICSP','MIGPUMA','MIGSP','MSP','NOP','PAOC','POWPUMA','POWSP','SCIENGP','SCIENGRLP','SOCP','VPS','CITWP'])

  # Drop all columns from the dataframe
  df2 = df.drop(columns=drop_cols)

  # Drop duplicate rows
  df3 = df2.drop_duplicates()

  return df3

In [6]:
# Function to impute missing values - Numerical and Categorical features
def missingVals(df):
  # Create a list of numerical columns
  Numerical = ['PWGTP','AGEP','INTP','JWMNP','OIP','PAP','RETP','SEMP','SSIP','SSP','WAGP','WKHP','INDP','PERNP','PINCP','POVPIP']

  # For all missing values in that list, replace with the mean of the column
  for i in Numerical:
    df[i].fillna((df[i].mean()), inplace=True)
  
  # Create a list of categorical columns
  Categorical = []

  # For all missing values in categorical columns, enter category NoInput and convert dtype
  for j in df.columns:
    if j not in Numerical:
      df[j].fillna('NoInput', inplace=True)
      df[j] = df[j].astype('category')
      Categorical.append(j)
  
  return df 

In [7]:
# Function to encode categorical variables
# def oheCat(df, fileNum):
def oheCat(df):
  # Extract the year column from the dataframe
  date = df['YEAR']
  state = df['ST']

  # Drop the year column from the dataframe for encoding purposes
  df2 = df.drop(columns=['YEAR', 'ST'])

  # Create a list of all identified numerical variables
  Numerical = ['PWGTP','AGEP','INTP','JWMNP','OIP','PAP','RETP','SEMP','SSIP','SSP','WAGP','WKHP','INDP','PERNP','PINCP','POVPIP']

  # Create a list of all categorical columns
  cat = []
  for i in df2.columns:
    if (i not in Numerical):
      cat.append(i)
  
  # One hot encode all of the categorical variables
  df3 = pd.get_dummies(df2, prefix=cat)

  # Add the YEAR column back into the dataset
  df3['YEAR'] = date
  df3['ST'] = state

  # Write the final dataset back to the Google Drive folder
  # filePath = "/content/gdrive/MyDrive/Hackathon_2021/data/Processed_files/state_" + str(fileNum) + '.csv'
  # df1.to_csv(filePath)
  
  return df3

In [8]:
# Function to create the percentages in each column
def agg(df):
  # Create the list of numerical features
  Numerical = ['PWGTP','AGEP','INTP','JWMNP','OIP','PAP','RETP','SEMP','SSIP','SSP','WAGP','WKHP','INDP','PERNP','PINCP','POVPIP']
  cat = []

  # Create list of categorical features
  for i in df.columns:
    if (i not in Numerical) & (i != 'YEAR') & (i != 'ST'):
      cat.append(i)
    
    # Calculate mean of entire column for numerical features
    elif (i in Numerical) & (i != 'YEAR'):
      df[i] = df[i].mean()

  # Calculate a proportion of the categorical columns
  for j in cat:
    df[j] = df[j].sum()/len(df)

  # Remove duplicates
  df2 = df.drop_duplicates()
  
  return df2

In [16]:
# Function to iterate through all of the files and run each step of the process
def allFilesProcess():
  lstOfDfs = []
  for i, file in enumerate(os.listdir()):
    if ('.csv' in file) & (i > 40):
      data = readData(file)
      data2 = dropCols(data)
      data3 = missingVals(data2)
      data4 = oheCat(data3)
      data5 = agg(data4)
      
      # Add the df to the list 
      lstOfDfs.append(data5)
      print('Data ' + str(i) + ' has been added to the list')
  
  dfs = [df.reset_index() for df in lstOfDfs]
  dfs_final = pd.concat(dfs, axis=0)
  return dfs_final

In [10]:
%%time
finalDf1 = allFilesProcess()
# finalDf1.head()

Data 0 has been added to the list
Data 1 has been added to the list
Data 2 has been added to the list
Data 3 has been added to the list
Data 4 has been added to the list
Data 5 has been added to the list
Data 6 has been added to the list
Data 7 has been added to the list
Data 8 has been added to the list
Data 9 has been added to the list
Data 10 has been added to the list
Data 12 has been added to the list
Data 13 has been added to the list
Data 14 has been added to the list
Data 15 has been added to the list
Data 16 has been added to the list
Data 17 has been added to the list
Data 18 has been added to the list
Data 19 has been added to the list
Data 20 has been added to the list
CPU times: user 15min 53s, sys: 5min 38s, total: 21min 31s
Wall time: 33min 1s


In [11]:
finalDf1.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,OCCP_9570.0,NWAV_4.0,OCCP_1440.0,OCCP_8850.0,OCCP_425.0,HISP_22,OCCP_2755.0,SFN_3.0,SFN_4.0,REGION_9
0,9017,21.271845,44.029691,2307.889181,25.665628,656.400684,26.495333,4254.678586,1462.680557,261.009596,...,,,,,,,,,,
0,67014,20.377785,41.082149,2415.002126,25.39359,686.907899,30.011075,2809.301454,1575.687775,266.610518,...,,,,,,,,,,
0,7869,18.989297,41.032875,3120.17058,18.794102,834.072462,23.730369,1677.891463,4925.406935,176.473332,...,0.000255,,,,,,,,,
0,118123,19.743318,42.230922,1898.920149,23.676768,582.110496,39.218149,2979.67434,1411.90591,303.551599,...,0.000103,1.7e-05,9e-06,0.000103,,,,,,
0,29605,20.207029,41.734004,1374.809518,21.930999,695.602654,22.465495,1987.355941,1321.301921,354.761027,...,3.4e-05,,,3.4e-05,,,,,,


In [12]:
finalDf1.shape

(20, 875)

In [14]:
%%time
finalDf2 = allFilesProcess()

Data 21 has been added to the list
Data 22 has been added to the list
Data 23 has been added to the list
Data 24 has been added to the list
Data 25 has been added to the list
Data 26 has been added to the list
Data 27 has been added to the list
Data 28 has been added to the list
Data 29 has been added to the list
Data 30 has been added to the list
Data 31 has been added to the list
Data 32 has been added to the list
Data 33 has been added to the list
Data 34 has been added to the list
Data 35 has been added to the list
Data 36 has been added to the list
Data 37 has been added to the list
Data 38 has been added to the list
Data 39 has been added to the list
Data 40 has been added to the list
CPU times: user 2min 23s, sys: 32.2 s, total: 2min 55s
Wall time: 2min 55s


In [15]:
finalDf2.shape

(20, 874)

In [17]:
%%time
finalDf3 = allFilesProcess()

Data 41 has been added to the list
Data 42 has been added to the list
Data 43 has been added to the list
Data 44 has been added to the list
Data 45 has been added to the list
Data 46 has been added to the list
Data 47 has been added to the list
Data 48 has been added to the list
Data 49 has been added to the list
Data 50 has been added to the list
Data 51 has been added to the list
Data 52 has been added to the list
CPU times: user 1min 42s, sys: 23.8 s, total: 2min 6s
Wall time: 2min 7s


In [18]:
finalDf3.shape

(12, 873)

In [26]:
# Function to combine all of the files together
def combineFiles(dfa, dfb, dfc):
    dfAll = pd.concat([dfa, dfb, dfc], axis=0)

    # Drop Puerto Rico from the dataset
    dfAll1 = dfAll.loc[(dfAll['ST'] != 72)]

    # Drop duplicates from the dataset
    dfAll1.drop_duplicates(inplace=True)

    return dfAll1

In [27]:
completeDf = combineFiles(finalDf1, finalDf2, finalDf3)
completeDf.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,NWAV_4.0,OCCP_1440.0,OCCP_8850.0,OCCP_425.0,HISP_22,OCCP_2755.0,SFN_3.0,SFN_4.0,REGION_9,REGION_1
0,9017,21.271845,44.029691,2307.889181,25.665628,656.400684,26.495333,4254.678586,1462.680557,261.009596,...,,,,,,,,,,
0,67014,20.377785,41.082149,2415.002126,25.39359,686.907899,30.011075,2809.301454,1575.687775,266.610518,...,,,,,,,,,,
0,7869,18.989297,41.032875,3120.17058,18.794102,834.072462,23.730369,1677.891463,4925.406935,176.473332,...,,,,,,,,,,
0,118123,19.743318,42.230922,1898.920149,23.676768,582.110496,39.218149,2979.67434,1411.90591,303.551599,...,1.7e-05,9e-06,0.000103,,,,,,,
0,29605,20.207029,41.734004,1374.809518,21.930999,695.602654,22.465495,1987.355941,1321.301921,354.761027,...,,,3.4e-05,,,,,,,


In [28]:
completeDf.shape

(51, 876)

In [30]:
# Function to combine all of the additional datasets
def addNewData():
    # Read in all of the datasets
    pmh = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/Poor_ Mental_health_Days_2016.csv')
    pmh.rename(columns={'STATE': 'State', 'VALUE':'pmh_Value'}, inplace=True)
    pmh.drop(columns=['RANK'], inplace=True)
    
    ob = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/Obesity_2016.csv')
    ob['Value'] = ob['Value'].str.rstrip('%').astype('float') / 100.0
    ob.rename(columns={'Value': 'ob_Value'}, inplace=True)
    ob.drop(columns=['Rank'], inplace=True)
    
    isl = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/Insufficient_sleep_2016.csv')
    isl['Value'] = isl['Value'].str.rstrip('%').astype('float') / 100.0
    isl.rename(columns={'Value': 'isl_Value'}, inplace=True)
    isl.drop(columns=['Rank '], inplace=True)
    
    fmd = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/Frequent Mental Distress_2016.csv')
    fmd.rename(columns={'Value': 'fmd_Value'}, inplace=True)
    fmd.drop(columns=['Rank '], inplace=True)
    
    air = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/Air_Pollution_2016.csv')
    air.rename(columns={'Value': 'air_Value'}, inplace=True)
    air.drop(columns=['Rank'], inplace=True)

    # Join the datasets on the State Name
    lst = [pmh, ob, isl, fmd, air]
    df_complete = reduce(lambda left, right: pd.merge(left, right, on='State'), lst)
        
    return df_complete

In [31]:
a = addNewData()
a.head()

Unnamed: 0,State,pmh_Value,ob_Value,isl_Value,fmd_Value,air_Value
0,South Dakota,2.4,0.304,0.278,0.071,6.3
1,Hawaii,2.9,0.227,0.44,0.088,7.0
2,Minnesota,2.9,0.261,0.289,0.087,8.0
3,Nebraska,2.9,0.314,0.3,0.089,7.3
4,Iowa,3.2,0.321,0.301,0.095,8.6


In [32]:
len(a['State'].unique())

52

In [47]:
# Function to add in the state initials to the dataset 
def addStInit(df, addDf):
    abbr = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/ST_Abbr_Lkp.csv', header=None)
    abbr2 = abbr.rename(columns={0:'ST', 1:'State', 2:'LocationAbbr'})
    abbr3 = abbr2[(abbr2['ST'] != 72) | (abbr2['ST'] != 11)]

    # Join the abbreviations into the dataset
    df1 = df.merge(abbr3, how='left', on='ST')

    # Add the labels to the dataset
    label = pd.read_csv('/Users/aakritigupta/Desktop/Hackathon 2021/cdc_label_16.csv')
    label.drop(columns=['Unnamed: 0'], inplace=True)

    df2 = df1.merge(label, how='left', on='LocationAbbr')

    # Drop Puerto Rico from the dataset
    df2.drop((df2[df2['LocationAbbr'] == 'PR'].index) | (df2[df2['LocationAbbr'] == 'DC'].index), inplace=True)

    # Add in the additional data features to the dataset
    df3 = df2.merge(addDf, how='left', on='State')

    # Drop duplicates from the dataset
    df4 = df3.drop_duplicates()

    return df4

In [48]:
completeDf2 = addStInit(completeDf, a)
completeDf2.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,REGION_9,REGION_1,State,LocationAbbr,Label,pmh_Value,ob_Value,isl_Value,fmd_Value,air_Value
0,9017,21.271845,44.029691,2307.889181,25.665628,656.400684,26.495333,4254.678586,1462.680557,261.009596,...,,,Delaware,DE,0.0,3.6,0.297,0.374,0.111,9.5
1,67014,20.377785,41.082149,2415.002126,25.39359,686.907899,30.011075,2809.301454,1575.687775,266.610518,...,,,Arizona,AZ,1.0,3.8,0.284,0.327,0.112,9.3
2,7869,18.989297,41.032875,3120.17058,18.794102,834.072462,23.730369,1677.891463,4925.406935,176.473332,...,,,North Dakota,ND,0.0,3.3,0.31,0.31,0.092,4.9
3,118123,19.743318,42.230922,1898.920149,23.676768,582.110496,39.218149,2979.67434,1411.90591,303.551599,...,,,Ohio,OH,1.0,3.9,0.298,0.371,0.12,10.2
4,29605,20.207029,41.734004,1374.809518,21.930999,695.602654,22.465495,1987.355941,1321.301921,354.761027,...,,,Arkansas,AR,1.0,4.7,0.345,0.364,0.149,7.5


In [50]:
completeDf2.shape

(50, 884)

In [51]:
# Function to impute missing values on aggregated dataset
def missVals(df):
    for i in df.columns:
        missingVals = df[i].isnull().sum()
        if missingVals > 0:
            df[i].fillna(0.0, inplace=True)
    
    return df

In [52]:
completeDf3 = missVals(completeDf2)
completeDf3.head()

Unnamed: 0,index,PWGTP,AGEP,INTP,JWMNP,OIP,PAP,RETP,SEMP,SSIP,...,REGION_9,REGION_1,State,LocationAbbr,Label,pmh_Value,ob_Value,isl_Value,fmd_Value,air_Value
0,9017,21.271845,44.029691,2307.889181,25.665628,656.400684,26.495333,4254.678586,1462.680557,261.009596,...,0.0,0.0,Delaware,DE,0.0,3.6,0.297,0.374,0.111,9.5
1,67014,20.377785,41.082149,2415.002126,25.39359,686.907899,30.011075,2809.301454,1575.687775,266.610518,...,0.0,0.0,Arizona,AZ,1.0,3.8,0.284,0.327,0.112,9.3
2,7869,18.989297,41.032875,3120.17058,18.794102,834.072462,23.730369,1677.891463,4925.406935,176.473332,...,0.0,0.0,North Dakota,ND,0.0,3.3,0.31,0.31,0.092,4.9
3,118123,19.743318,42.230922,1898.920149,23.676768,582.110496,39.218149,2979.67434,1411.90591,303.551599,...,0.0,0.0,Ohio,OH,1.0,3.9,0.298,0.371,0.12,10.2
4,29605,20.207029,41.734004,1374.809518,21.930999,695.602654,22.465495,1987.355941,1321.301921,354.761027,...,0.0,0.0,Arkansas,AR,1.0,4.7,0.345,0.364,0.149,7.5


## Feature Selection

In [54]:
# Function for feature selection
def featureSelect(df):
    df2 = df.drop(columns=['YEAR','ST','State', 'LocationAbbr', 'index'])

    # Split the dataset 
    df_len = df2.shape[1] - 1
    X = df2.iloc[:, 0:df_len]
    y = df2.iloc[:, df_len]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

    # Configure to select all features
    fs = SelectKBest(f_classif, k='all')

    # Learn Relationship from training data
    fs.fit(X_train, y_train)

    scores = fs.scores_

    df3 = pd.DataFrame()
    df3['Feature Name'] = X.columns
    df3['Score'] = scores

    # X.columns[scores.argsort()[-10:][::-1]]

    return df3

In [55]:
%%time
fs_exp1 = featureSelect(completeDf3)
fs_exp1.sort_values('Score', ascending=False)

# fs_exp1
# fs_exp1.head()

CPU times: user 14.3 ms, sys: 3.26 ms, total: 17.6 ms
Wall time: 22.3 ms


Unnamed: 0,Feature Name,Score
858,RACNUM_6,inf
868,OCCP_2755.0,inf
729,RACNH_1,1119.029912
728,RACNH_0,1119.029912
732,RACNUM_3,226.323039
857,RACNUM_5,200.024973
716,RAC1P_3,135.356327
719,RAC1P_7,122.642675
733,RACNUM_4,94.151753
757,WAOB_8,55.236166


In [63]:
# Function to get a list of all the features in which the score is greater than 20
def selectCols(df, dfb):
    # Only keep features with scores greater than 10 and append to a list
    df_fs = df.loc[df['Score'] >= 10]
    fsLst = df_fs['Feature Name'].tolist()

    # Only keep the identified columns in the training set
    train = dfb[fsLst]
    train['Label'] = dfb['Label']

    # Write the dataset to a CSV
    train.to_csv('/Users/aakritigupta/Desktop/Hackathon 2021/MentSea/MentSea/ANOVA_fs_train.csv')
    # return train

In [64]:
%%time
selectCols(fs_exp1, completeDf3)
# dfFs.head()

CPU times: user 5.95 ms, sys: 3.48 ms, total: 9.43 ms
Wall time: 14 ms


In [70]:
# Function for running Chi-Square Analysis
def chi2Func(df):
    df2 = df.drop(columns=['index','ST','LocationAbbr', 'State', 'YEAR', 'air_Value'])

    # Make sure that the label is the last column in the df
    df3 = df2[[c for c in df2 if c not in ['Label']] + ['Label']]

    # Break out the dataframe into features and target variable
    df_len = df3.shape[1] - 1
    X = df3.iloc[:, 0:df_len]
    y = df3.iloc[:, df_len]

    # Run the chi-square analysis
    fs = SelectKBest(score_func=chi2, k='all')
    fs.fit_transform(X, y)
    X_new = fs.transform(X)

    # Convert the array to a dataframe
    cat_cols_lst = X.columns.tolist()

    # Create a dataframe of results
    X_new_df = pd.DataFrame(data=fs.scores_, columns=['Chi2 Score'])
    X_new_df['Feature #'] = X_new_df.index
    X_new_df['Feature Name'] = cat_cols_lst
    X_new_df['P-Values'] = pd.Series(fs.pvalues_)

    # Create a new column for the deicison
    X_new_df.loc[X_new_df['P-Values'] <= 0.05, 'Decision'] = 'Dependent (Reject H0)'
    X_new_df.loc[X_new_df['P-Values'] > 0.05, 'Decision'] = 'Independent (Accept H0)'

    # Organize the columns
    cols = ['Feature #', 'Feature Name', 'Chi2 Score', 'P-Values', 'Decision']
    X_new_df = X_new_df[cols]

    # Only keep the features that have the decision as dependent
    X_new_df2 = X_new_df.loc[X_new_df['Decision'] == 'Dependent (Reject H0)']

    # Store all column names into a list
    lst = X_new_df2['Feature Name'].tolist()

    # Only keep the features from the list in the dataset
    df_fs = df[lst]
    df_fs['Label'] = df['Label']

    # Write the dataset to a CSV
    df_fs.to_csv('/Users/aakritigupta/Desktop/Hackathon 2021/MentSea/MentSea/chi2_fs_train.csv')

    # return df_fs

In [71]:
chi2Func(completeDf3)
# a = chi2Func(completeDf3)
# a.head()