## HIV Incidence Case Study 


In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline


In [6]:
main = pd.read_csv('forecast_HIV_infections/data/main_data.csv', index_col=0)

In [30]:
main['mme_percap'].fillna(main['mme_percap'].sum(), inplace=True)

In [32]:
main['partD30dayrxrate'].fillna(main['partD30dayrxrate'].sum(), inplace=True)

In [None]:
drugdep              3139 non-null float64
pctunmetneed         3139 non-null float64
nonmedpain  

In [34]:
main['drugdep'].fillna(main['drugdep'].sum(), inplace=True)

In [36]:
main['pctunmetneed'].fillna(main['pctunmetneed'].sum(), inplace=True)

In [37]:
main['nonmedpain'].fillna(main['nonmedpain'].sum(), inplace=True)

In [38]:
main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3140 entries, 0 to 3139
Data columns (total 39 columns):
county_code          3140 non-null int64
COUNTY               3140 non-null object
STATEABBREVIATION    3140 non-null object
YEAR                 3140 non-null int64
AMAT_fac             3140 non-null float64
HIVdiagnoses         3140 non-null float64
HIVincidence         3140 non-null float64
HIVprevalence        3140 non-null float64
MH_fac               3140 non-null float64
Med_AMAT_fac         3140 non-null float64
Med_MH_fac           3140 non-null float64
Med_SA_fac           3140 non-null float64
Med_SMAT_fac         3140 non-null float64
Med_TMAT_fac         3140 non-null float64
PLHIV                3140 non-null float64
Population           3140 non-null float64
SA_fac               3140 non-null float64
SMAT_fac             3140 non-null float64
TMAT_fac             3140 non-null float64
drugdeathrate        3140 non-null float64
drugdeathrate_est    3140 non-null floa

In [82]:
y = np.array(main['HIVincidence'])

In [90]:
X = np.array(main.drop(['HIVincidence','COUNTY','STATEABBREVIATION', 'HIVdiagnoses'], axis=1))

In [91]:
model = sm.OLS(y, X).fit()

In [92]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.196
Model:,OLS,Adj. R-squared:,0.187
Method:,Least Squares,F-statistic:,22.2
Date:,"Fri, 17 Apr 2020",Prob (F-statistic):,5.74e-121
Time:,12:13:16,Log-Likelihood:,-12930.0
No. Observations:,3140,AIC:,25930.0
Df Residuals:,3105,BIC:,26140.0
Df Model:,34,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-7.924e-06,1.86e-05,-0.426,0.670,-4.44e-05,2.86e-05
const,-0.0014,0.001,-2.285,0.022,-0.003,-0.000
x2,1.0392,1.845,0.563,0.573,-2.578,4.657
x3,0.0350,0.002,18.403,0.000,0.031,0.039
x4,-0.0038,0.371,-0.010,0.992,-0.732,0.724
x5,-1.2103,2.085,-0.580,0.562,-5.298,2.878
x6,0.0068,0.406,0.017,0.987,-0.789,0.803
x7,0.1683,0.235,0.716,0.474,-0.292,0.629
x8,-0.8199,0.712,-1.152,0.250,-2.216,0.576

0,1,2,3
Omnibus:,9169.883,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,507994238.913
Skew:,39.678,Prob(JB):,0.0
Kurtosis:,1971.873,Cond. No.,37400000.0


In [93]:
pval = model.pvalues

In [94]:
cols = main.columns.drop(['HIVincidence','COUNTY','STATEABBREVIATION', 'HIVdiagnoses'])

In [95]:
cols

Index(['county_code', 'YEAR', 'AMAT_fac', 'HIVprevalence', 'MH_fac',
       'Med_AMAT_fac', 'Med_MH_fac', 'Med_SA_fac', 'Med_SMAT_fac',
       'Med_TMAT_fac', 'PLHIV', 'Population', 'SA_fac', 'SMAT_fac', 'TMAT_fac',
       'drugdeathrate', 'drugdeathrate_est', 'drugdeaths', 'mme_percap',
       'partD30dayrxrate', 'pctunins', 'num_SSPs', 'bup_phys', 'drugdep',
       'pctunmetneed', 'nonmedpain', 'ADULTMEN', 'MSM12MTH', 'MSM5YEAR',
       '%msm12month', '%msm5yr', 'unemployment_rate', 'poverty_rate',
       'household_income', 'percent_uninsured'],
      dtype='object')

In [96]:
pval_df = pd.DataFrame(pval, index=cols, columns=['pval'])

In [102]:
new = pval_df[pval_df['pval'] <= .25]

In [103]:
sig_columns = list(new.index)

In [104]:
X2 = main[sig_columns]

In [105]:
model = sm.OLS(y, X2).fit()

In [106]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.182
Model:,OLS,Adj. R-squared:,0.18
Method:,Least Squares,F-statistic:,115.9
Date:,"Fri, 17 Apr 2020",Prob (F-statistic):,1.55e-132
Time:,12:14:26,Log-Likelihood:,-12957.0
No. Observations:,3140,AIC:,25930.0
Df Residuals:,3133,BIC:,25970.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
YEAR,-0.0005,0.000,-3.107,0.002,-0.001,-0.000
HIVprevalence,0.0348,0.001,23.753,0.000,0.032,0.038
Med_SMAT_fac,-0.2906,0.118,-2.454,0.014,-0.523,-0.058
num_SSPs,1.1995,0.789,1.520,0.129,-0.348,2.747
drugdep,0.0001,0.002,0.066,0.947,-0.004,0.004
MSM12MTH,1.4238,0.768,1.853,0.064,-0.083,2.930
MSM5YEAR,-0.9209,0.497,-1.853,0.064,-1.896,0.054

0,1,2,3
Omnibus:,9120.856,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,489521741.614
Skew:,39.105,Prob(JB):,0.0
Kurtosis:,1935.731,Cond. No.,31100.0
