In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
df = pd.read_csv('../../../data/processed_data/full_df_positive_mental_provider.csv')
df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Ranked (Yes=1/No=0),Premature death raw value,Premature death numerator,Premature death denominator,...,April Average Precipitation,May Average Precipitation,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC
0,1,1,1001,AL,Autauga County,2011,1.0,9967.4,675.0,,...,4.19,1.93,3.39,4.38,1.75,4.46,0.98,4.63,3.96,2.0
1,1,3,1003,AL,Baldwin County,2011,1.0,8321.8,2219.0,,...,1.28,1.68,3.15,8.73,2.05,9.89,0.15,3.22,2.78,3.0
2,1,15,1015,AL,Calhoun County,2011,1.0,11719.6,2106.0,,...,5.3,1.65,4.53,3.83,0.93,6.85,0.9,5.0,4.86,3.0
3,1,43,1043,AL,Cullman County,2011,1.0,9564.7,1245.0,,...,9.5,2.3,3.76,5.03,1.51,10.39,0.75,5.95,6.09,4.0
4,1,55,1055,AL,Etowah County,2011,1.0,11812.3,1905.0,,...,6.53,2.77,4.79,4.81,1.53,10.14,0.61,5.69,6.24,3.0


In [26]:
identifying_fields = ['State Abbreviation', 'Name', 'Release Year']
with open("../Regression Analysis/factors.txt") as f:
     factors = f.read().splitlines()
with open("../Regression Analysis/outcomes.txt") as f:
     outcomes = f.read().splitlines()
df = df[identifying_fields + factors + outcomes]
df.head()

Unnamed: 0,State Abbreviation,Name,Release Year,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,January Average Temperature,...,June Average Precipitation,July Average Precipitation,August Average Precipitation,September Average Precipitation,October Average Precipitation,November Average Precipitation,December Average Precipitation,RUCC,Poor mental health days raw value,Crude Rate
0,AL,Autauga County,2011,0.74,0.089,0.554,50354.0,51622.0,64.658333,41.5,...,3.39,4.38,1.75,4.46,0.98,4.63,3.96,2.0,4.1,21.7
1,AL,Baldwin County,2011,0.681,0.088,0.615,6076.0,51957.0,67.733333,47.6,...,3.15,8.73,2.05,9.89,0.15,3.22,2.78,3.0,4.1,18.2
2,AL,Calhoun County,2011,0.689,0.101,0.499,9452.0,39997.0,62.325,39.1,...,4.53,3.83,0.93,6.85,0.9,5.0,4.86,3.0,4.8,14.4
3,AL,Cullman County,2011,0.684,0.094,0.46,20380.0,39276.0,61.333333,37.1,...,3.76,5.03,1.51,10.39,0.75,5.95,6.09,4.0,4.8,17.4
4,AL,Etowah County,2011,0.702,0.105,0.561,10344.0,37264.0,62.1,38.7,...,4.79,4.81,1.53,10.14,0.61,5.69,6.24,3.0,4.4,24.0


In [27]:
locations = list(range(9)) + [21] + list(range(34, 37))
df.iloc[:, locations].columns

Index(['State Abbreviation', 'Name', 'Release Year',
       'High school graduation raw value', 'Unemployment raw value',
       'Some college raw value',
       'Ratio of population to mental health providers',
       'Median household income raw value', 'Average Temperature',
       'Average Precipitation', 'RUCC', 'Poor mental health days raw value',
       'Crude Rate'],
      dtype='object')

In [28]:
# Assign into df variables
df_small = df.iloc[:, locations]
df_small.head()

Unnamed: 0,State Abbreviation,Name,Release Year,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,Average Precipitation,RUCC,Poor mental health days raw value,Crude Rate
0,AL,Autauga County,2011,0.74,0.089,0.554,50354.0,51622.0,64.658333,3.769167,2.0,4.1,21.7
1,AL,Baldwin County,2011,0.681,0.088,0.615,6076.0,51957.0,67.733333,3.720833,3.0,4.1,18.2
2,AL,Calhoun County,2011,0.689,0.101,0.499,9452.0,39997.0,62.325,4.231667,3.0,4.8,14.4
3,AL,Cullman County,2011,0.684,0.094,0.46,20380.0,39276.0,61.333333,5.3825,4.0,4.8,17.4
4,AL,Etowah County,2011,0.702,0.105,0.561,10344.0,37264.0,62.1,4.965,3.0,4.4,24.0


In [29]:
# Create new column
def classify_temp_treatment(row, treatment, med, reverse):
    if reverse:
        return 0 if row[treatment] >= med else 1
    else:
        return 1 if row[treatment] >= med else 0

def create_new_column(df, treatment, categorize_treatment, reverse=False):
    med = df[treatment].median()
    df[categorize_treatment] = df.apply(classify_temp_treatment, args=(treatment, med, reverse), axis=1)

In [30]:
# try once
create_new_column(df_small, "High school graduation raw value", "high_school_treated")
create_new_column(df_small, "Unemployment raw value", "unemployment_treated", reverse=True)
create_new_column(df_small, "Some college raw value", "college_treated")
create_new_column(df_small, "Ratio of population to mental health providers", "mental_health_provider_treated", reverse=True)
create_new_column(df_small, "Median household income raw value", "household_income_treated")
create_new_column(df_small, "Average Temperature", "temperature_treated", reverse=True)
create_new_column(df_small, "Average Precipitation", "precipitation_treated", reverse=True)
df_small.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorize_treatment] = df.apply(classify_temp_treatment, args=(treatment, med, reverse), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorize_treatment] = df.apply(classify_temp_treatment, args=(treatment, med, reverse), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[

Unnamed: 0,State Abbreviation,Name,Release Year,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,Average Precipitation,RUCC,Poor mental health days raw value,Crude Rate,high_school_treated,unemployment_treated,college_treated,mental_health_provider_treated,household_income_treated,temperature_treated,precipitation_treated
0,AL,Autauga County,2011,0.74,0.089,0.554,50354.0,51622.0,64.658333,3.769167,2.0,4.1,21.7,0,0,0,0,1,0,0
1,AL,Baldwin County,2011,0.681,0.088,0.615,6076.0,51957.0,67.733333,3.720833,3.0,4.1,18.2,0,0,1,0,1,0,0
2,AL,Calhoun County,2011,0.689,0.101,0.499,9452.0,39997.0,62.325,4.231667,3.0,4.8,14.4,0,0,0,0,0,0,0
3,AL,Cullman County,2011,0.684,0.094,0.46,20380.0,39276.0,61.333333,5.3825,4.0,4.8,17.4,0,0,0,0,0,0,0
4,AL,Etowah County,2011,0.702,0.105,0.561,10344.0,37264.0,62.1,4.965,3.0,4.4,24.0,0,0,0,0,0,0,0


In [31]:
df_small.head()

Unnamed: 0,State Abbreviation,Name,Release Year,High school graduation raw value,Unemployment raw value,Some college raw value,Ratio of population to mental health providers,Median household income raw value,Average Temperature,Average Precipitation,RUCC,Poor mental health days raw value,Crude Rate,high_school_treated,unemployment_treated,college_treated,mental_health_provider_treated,household_income_treated,temperature_treated,precipitation_treated
0,AL,Autauga County,2011,0.74,0.089,0.554,50354.0,51622.0,64.658333,3.769167,2.0,4.1,21.7,0,0,0,0,1,0,0
1,AL,Baldwin County,2011,0.681,0.088,0.615,6076.0,51957.0,67.733333,3.720833,3.0,4.1,18.2,0,0,1,0,1,0,0
2,AL,Calhoun County,2011,0.689,0.101,0.499,9452.0,39997.0,62.325,4.231667,3.0,4.8,14.4,0,0,0,0,0,0,0
3,AL,Cullman County,2011,0.684,0.094,0.46,20380.0,39276.0,61.333333,5.3825,4.0,4.8,17.4,0,0,0,0,0,0,0
4,AL,Etowah County,2011,0.702,0.105,0.561,10344.0,37264.0,62.1,4.965,3.0,4.4,24.0,0,0,0,0,0,0,0


In [32]:
df_small.to_csv('../../../data/processed_data/df_matching.csv', index=False)