In [41]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Display the result of every bare expression in a cell, not just the last
# one, so cells that list several expressions show an output for each.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [40]:
# Load the three pickled inputs used by the rest of the notebook.
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source.
demographics_aggression_problems = pd.read_pickle(
    filepath_or_buffer='../data/demographics_aggression_problems.pkl'
)
demographics_problems_merged = pd.read_pickle(
    filepath_or_buffer='../data/clean_presented_problems.pkl'
)
ny_counties = pd.read_pickle(
    filepath_or_buffer='../data/ny_counties.pkl'
)

In [21]:
# Distinct presented-problem labels in the aggression subset.
demographics_aggression_problems['presented_problems'].unique()
# Row count per presented-problem label in the full merged frame.
demographics_problems_merged['presented_problems'].value_counts()

aggression (physical, verbal, property destruction, threats)    3929
family needs assistance                                         2498
mental health symptoms                                          2352
self-injurious                                                  1438
decrease in ability to participate in daily functions           1238
diagnosis and treatment plan assistance                         1215
leaving unexpectedly                                            1074
at risk of losing placement                                      923
suicidal ideation                                                731
sexualized behavior                                              595
transition from hospital                                         401
suicidal action                                                   13
Name: presented_problems, dtype: int64

In [52]:
# Row count per disability level in the aggression subset.
demographics_aggression_problems['disability_level_ordered'].value_counts()

Mild                   1777
Moderate               1286
Severe                  328
Normal intelligence     271
Borderline              210
Profound                 56
Name: disability_level_ordered, dtype: int64

In [53]:
# Binary outcome: 1 when the row's presented problem is the aggression label,
# 0 for every other presented problem.
demographics_problems_merged['is_aggression'] = (
    demographics_problems_merged['presented_problems']
    .eq("aggression (physical, verbal, property destruction, threats)")
    .astype(int)
)
# Class balance of the new indicator.
demographics_problems_merged['is_aggression'].value_counts()

0    12478
1     3929
Name: is_aggression, dtype: int64

In [54]:
# Preview the first five rows, now including the is_aggression column.
demographics_problems_merged.head(n=5)

Unnamed: 0,Local ID,state_demographics,enrollment_date,enrollment_ym_derived,enrollment_year,disability_level,disability_level_ordered,presented_problems,Gender,Ethnicity,County,Gender_cat,is_aggression
0,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,"aggression (physical, verbal, property destruc...",Female,Not of Hispanic origin,Swain,Female,1
1,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,decrease in ability to participate in daily fu...,Female,Not of Hispanic origin,Swain,Female,0
2,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,leaving unexpectedly,Female,Not of Hispanic origin,Swain,Female,0
3,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,mental health symptoms,Female,Not of Hispanic origin,Swain,Female,0
4,1021487,texas,2020-03-02,2020-03-01,2020,Mild,Mild,"aggression (physical, verbal, property destruc...",Male,Not of Hispanic origin,Tarrant,Male,1


In [55]:
# Baseline logistic regression: aggression indicator on disability level,
# Treatment-coded with 'Normal intelligence' as the reference category.
# NOTE(review): the head() output repeats the same Local ID across several
# rows, so observations may not be independent -- confirm whether
# cluster-robust standard errors are needed.
model = smf.logit(
    formula="is_aggression ~ C(disability_level_ordered, Treatment('Normal intelligence'))",
    data=demographics_problems_merged,
).fit()

print(model.summary())

Optimization terminated successfully.
         Current function value: 0.549546
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          is_aggression   No. Observations:                16406
Model:                          Logit   Df Residuals:                    16400
Method:                           MLE   Df Model:                            5
Date:                Mon, 13 Mar 2023   Pseudo R-squ.:                0.001580
Time:                        18:49:30   Log-Likelihood:                -9015.9
converged:                       True   LL-Null:                       -9030.1
Covariance Type:            nonrobust   LLR p-value:                 2.869e-05
                                                                                  coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------

In [66]:
# Extended logistic regression: adds gender (reference 'Male') and state
# (reference 'new york') to the disability-level term. The formula is split
# across adjacent string literals; the resulting string is unchanged.
# NOTE(review): Gender_cat is created in a cell that appears further down in
# this file, so that cell must have been run before this one.
model = smf.logit(
    formula=(
        "is_aggression ~ C(disability_level_ordered, Treatment('Normal intelligence'))"
        " + C(Gender_cat, Treatment('Male'))"
        " + C(state_demographics, Treatment('new york'))"
    ),
    data=demographics_problems_merged,
).fit()

print(model.summary())

Optimization terminated successfully.
         Current function value: 0.546977
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          is_aggression   No. Observations:                16406
Model:                          Logit   Df Residuals:                    16393
Method:                           MLE   Df Model:                           12
Date:                Mon, 13 Mar 2023   Pseudo R-squ.:                0.006248
Time:                        18:51:48   Log-Likelihood:                -8973.7
converged:                       True   LL-Null:                       -9030.1
Covariance Type:            nonrobust   LLR p-value:                 1.644e-18
                                                                                  coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------

In [64]:
# Column dtypes and non-null counts. In the recorded output, disability_level
# and disability_level_ordered each have one fewer non-null value than the
# other columns (16406 vs 16407), which lines up with the 16406 observations
# reported in the Logit summaries -- presumably that row is dropped at fit time.
demographics_problems_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16407 entries, 0 to 17807
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Local ID                  16407 non-null  object        
 1   state_demographics        16407 non-null  object        
 2   enrollment_date           16407 non-null  datetime64[ns]
 3   enrollment_ym_derived     16407 non-null  datetime64[ns]
 4   enrollment_year           16407 non-null  int64         
 5   disability_level          16406 non-null  object        
 6   disability_level_ordered  16406 non-null  category      
 7   presented_problems        16407 non-null  object        
 8   Gender                    16407 non-null  object        
 9   Ethnicity                 16407 non-null  object        
 10  County                    16407 non-null  object        
 11  Gender_cat                16407 non-null  category      
 12  is_aggression     

In [47]:
# Recode Gender as an unordered categorical with the levels 'Male' and
# 'Female' (in that order); any other value would become missing.
demographics_problems_merged['Gender_cat'] = demographics_problems_merged['Gender'].astype(
    pd.CategoricalDtype(categories=['Male', 'Female'], ordered=False)
)

In [43]:
# Preview the first five rows of the merged frame as loaded.
demographics_problems_merged.head(n=5)

Unnamed: 0,Local ID,state_demographics,enrollment_date,enrollment_ym_derived,enrollment_year,disability_level,disability_level_ordered,presented_problems,Gender,Ethnicity,County
0,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,"aggression (physical, verbal, property destruc...",Female,Not of Hispanic origin,Swain
1,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,decrease in ability to participate in daily fu...,Female,Not of Hispanic origin,Swain
2,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,leaving unexpectedly,Female,Not of Hispanic origin,Swain
3,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,mental health symptoms,Female,Not of Hispanic origin,Swain
4,1021487,texas,2020-03-02,2020-03-01,2020,Mild,Mild,"aggression (physical, verbal, property destruc...",Male,Not of Hispanic origin,Tarrant


In [42]:
# Preview the first five rows of the New York county table.
ny_counties.head(n=5)

Unnamed: 0,NAME,ABBREV,GNIS_ID,FIPS_CODE,SWIS,NYSP_ZONE,POP1990,POP2000,POP2010,POP2020,NYC,CALC_SQ_MI,DATEMOD,Shape_Leng,Shape_Area,geometry,County,state_demographics,index,county_perc
0,Albany,ALBA,974099,36001,10000,East,292594,294565,304204,314848,N,532.791779,2017-11-10,166077.834242,1379924000.0,"POLYGON ((-73.70733 42.78605, -73.70397 42.783...",Albany,new york,41.0,0.02001
1,Allegany,ALLE,974100,36003,20000,West,50470,49927,48946,46456,N,1035.209131,2019-04-26,210499.339692,2681179000.0,"POLYGON ((-78.29218 42.52140, -78.29047 42.521...",Allegany,new york,11.0,0.005368
2,Bronx,BRON,974101,36005,600000,Long Island,1203789,1332650,1385108,1472654,Y,57.472148,2019-10-04,57253.861278,148852200.0,"POLYGON ((-73.86567 40.90219, -73.86300 40.901...",Bronx,new york,165.0,0.080527
3,Broome,BROO,974102,36007,30000,Central,212160,200536,200600,198683,N,715.287465,2019-04-26,227933.332021,1852586000.0,"POLYGON ((-75.86416 42.41554, -75.86379 42.413...",,,,
4,Cattaraugus,CATT,974103,36009,40000,West,84234,83955,80317,77042,N,1324.309219,2019-04-26,276084.505177,3429945000.0,"POLYGON ((-79.02148 42.53804, -79.01937 42.537...",Cattaraugus,new york,17.0,0.008297


In [44]:
# List the states present in the data. 'new york' is the one used as the
# reference (baseline) level for the Treatment-coded state term in the model.
demographics_problems_merged['state_demographics'].unique()

array(['north carolina', 'texas', 'new york', 'colorado', 'iowa',
       'new hampshire', 'california'], dtype=object)

In [61]:
# Preview the first five rows, including Gender_cat and is_aggression.
demographics_problems_merged.head(n=5)

Unnamed: 0,Local ID,state_demographics,enrollment_date,enrollment_ym_derived,enrollment_year,disability_level,disability_level_ordered,presented_problems,Gender,Ethnicity,County,Gender_cat,is_aggression
0,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,"aggression (physical, verbal, property destruc...",Female,Not of Hispanic origin,Swain,Female,1
1,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,decrease in ability to participate in daily fu...,Female,Not of Hispanic origin,Swain,Female,0
2,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,leaving unexpectedly,Female,Not of Hispanic origin,Swain,Female,0
3,000083W,north carolina,2018-06-27,2018-06-01,2018,Mild,Mild,mental health symptoms,Female,Not of Hispanic origin,Swain,Female,0
4,1021487,texas,2020-03-02,2020-03-01,2020,Mild,Mild,"aggression (physical, verbal, property destruc...",Male,Not of Hispanic origin,Tarrant,Male,1
