#### Import Modules

In [29]:
import pandas as pd
import numpy as np
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from statsmodels.sandbox.regression.predstd import wls_prediction_std


data = pd.read_csv("training_data.csv")

#### What are the unique conditions?

In [30]:
data['condition'].unique()

array(['atrial_fib', 'chronic_kidney', 'pulmonary', 'depression',
       'diabetes', 'ischemia', 'asthma', 'hyperlip', 'hypertension',
       'arthritis', 'psychotic', 'stroke'], dtype=object)

#### How many conditions are there?

In [31]:
len(data['condition'].unique())

12

#### How many years of data do we have?

In [32]:
data['year'].unique()

array([2017, 2016, 2015], dtype=int64)

#### Create dataset

In [33]:
#potentially create region?
#create 12 variables for type of condition
binaries = pd.get_dummies(data['condition'])
newdata = pd.merge(data, binaries, left_index=True, right_index=True)

In [34]:
newdata.columns

Index(['state', 'condition', 'year', 'prev_less_65', 'prev_age_65_74',
       'prev_age_75_84', 'prev_above_84', 'prev_male', 'prev_female',
       'prev_white', 'prev_black', 'prev_hispanic', 'prev_other',
       'current_cost', 'last_year_cost', 'arthritis', 'asthma', 'atrial_fib',
       'chronic_kidney', 'depression', 'diabetes', 'hyperlip', 'hypertension',
       'ischemia', 'psychotic', 'pulmonary', 'stroke'],
      dtype='object')

In [35]:
finaldata = newdata[['prev_less_65', 'prev_age_65_74',
       'prev_age_75_84', 'prev_above_84', 'prev_male', 'prev_female',
       'prev_white', 'prev_black', 'prev_hispanic', 'prev_other',
       'current_cost', 'last_year_cost', 'arthritis', 'asthma', 'atrial_fib',
       'chronic_kidney', 'depression', 'diabetes', 'hyperlip', 'hypertension',
       'ischemia', 'psychotic', 'pulmonary', 'stroke']]

In [36]:
x_data = finaldata[['prev_less_65', 'prev_age_65_74',
       'prev_age_75_84', 'prev_above_84', 'prev_male', 'prev_female',
       'prev_white', 'prev_black', 'prev_hispanic', 'prev_other', 'last_year_cost',
       'arthritis', 'asthma', 'atrial_fib','chronic_kidney', 'depression',
        'diabetes', 'hyperlip', 'hypertension',
       'ischemia', 'psychotic', 'pulmonary', 'stroke']]
y_data = data.loc[:,"current_cost"]

#### Create test and training datasets

In [37]:
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,test_size = .3, random_state = 100)

# Linear Regression

In [38]:
Xtrain = sm.add_constant(x_train)
Xtest = sm.add_constant(x_test)
olstrainmodel = sm.OLS(y_train,Xtrain)

In [39]:
olstrainresult = olstrainmodel.fit()

In [40]:
print(olstrainresult.summary())

                            OLS Regression Results                            
Dep. Variable:           current_cost   R-squared:                       0.941
Model:                            OLS   Adj. R-squared:                  0.940
Method:                 Least Squares   F-statistic:                     822.5
Date:                Sun, 24 Nov 2019   Prob (F-statistic):               0.00
Time:                        14:46:24   Log-Likelihood:                -11616.
No. Observations:                1150   AIC:                         2.328e+04
Df Residuals:                    1127   BIC:                         2.339e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           4285.2293    591.180      7.

In [41]:
testpred = olstrainresult.predict(Xtest)

#### MAE on Test Set

In [42]:
np.abs(testpred - y_test).mean()

3554.2014831916686

#### RMSE on Test Set

In [43]:
np.sqrt(np.mean((testpred - y_test)**2))

6222.150810763095

#### Create Prediction Dataset for 2018

In [52]:
newdata2 = pd.read_csv("aggregations/aggregations/df_state_2017.csv")

In [53]:
dataofinterest = newdata2[['nppes_provider_state', 'beneficiary_age_less_65_total',
       'beneficiary_age_65_74_total', 'beneficiary_age_75_84_total',
       'beneficiary_age_greater_84_total', 'beneficiary_male_total',
       'beneficiary_female_total','beneficiary_race_white_total',
       'beneficiary_race_black_total','beneficiary_race_hispanic_total',
       'beneficiary_race_other_total']]

In [54]:
dataofinterest.rename(columns={'nppes_provider_state':'state', 'beneficiary_age_less_65_total':'prev_less_65',
       'beneficiary_age_65_74_total':'prev_age_65_74', 'beneficiary_age_75_84_total':'prev_age_75_84',
       'beneficiary_age_greater_84_total':'prev_above_84', 'beneficiary_male_total':'prev_male',
       'beneficiary_female_total':'prev_female','beneficiary_race_white_total':'prev_white',
       'beneficiary_race_black_total':'prev_black','beneficiary_race_hispanic_total':'prev_hispanic',
       'beneficiary_race_other_total':'prev_other'},inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [55]:
twosev_data = newdata[newdata['year']==2017]

In [56]:
twosev_data.drop(columns=['condition', 'year', 'prev_less_65', 'prev_age_65_74',
       'prev_age_75_84', 'prev_above_84', 'prev_male', 'prev_female',
       'prev_white', 'prev_black', 'prev_hispanic', 'prev_other','last_year_cost'],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [57]:
twosev_data.rename(columns={"current_cost": "last_year_cost"},inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [58]:
predtwoeightdataprep = pd.merge(dataofinterest, twosev_data, how='inner', left_on='state', right_on='state')

In [59]:
predtwoeightdata = predtwoeightdataprep[['prev_less_65', 'prev_age_65_74', 'prev_age_75_84',
       'prev_above_84', 'prev_male', 'prev_female', 'prev_white', 'prev_black',
       'prev_hispanic', 'prev_other', 'last_year_cost', 'arthritis', 'asthma',
       'atrial_fib', 'chronic_kidney', 'depression', 'diabetes', 'hyperlip',
       'hypertension', 'ischemia', 'psychotic', 'pulmonary', 'stroke']]

# Predictions

In [60]:
newX = sm.add_constant(predtwoeightdata)

In [61]:
twoeightpred = olstrainresult.predict(newX)

In [70]:
def transform_exog_to_model(fit, exog):
    transform=True
    self=fit

    # The following is lifted straight from statsmodels.base.model.Results.predict()
    if transform and hasattr(self.model, 'formula') and exog is not None:
        from patsy import dmatrix
        exog = dmatrix(self.model.data.orig_exog.design_info.builder,
                       exog)

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (self.model.exog.ndim == 1 or
                               self.model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    # end lifted code
    return exog

transformed_exog = transform_exog_to_model(olstrainresult, newX)
prstd2, iv_l2, iv_u2 = wls_prediction_std(olstrainresult, newX, weights=[1])

In [92]:
newvalues = np.concatenate((np.array(iv_l2).reshape((len(iv_l2),1)),np.array(twoeightpred).reshape((len(twoeightpred),1)),np.array(iv_u2).reshape((len(iv_u2),1))),axis=1)

In [93]:
predtwoeightdataprep

Unnamed: 0,state,prev_less_65,prev_age_65_74,prev_age_75_84,prev_above_84,prev_male,prev_female,prev_white,prev_black,prev_hispanic,...,atrial_fib,chronic_kidney,depression,diabetes,hyperlip,hypertension,ischemia,psychotic,pulmonary,stroke
0,AK,75808.0,245964.0,136494.0,47582.0,230602.0,302881.0,362767.0,13953.0,9917.0,...,1,0,0,0,0,0,0,0,0,0
1,AK,75808.0,245964.0,136494.0,47582.0,230602.0,302881.0,362767.0,13953.0,9917.0,...,0,1,0,0,0,0,0,0,0,0
2,AK,75808.0,245964.0,136494.0,47582.0,230602.0,302881.0,362767.0,13953.0,9917.0,...,0,0,0,0,0,0,0,0,1,0
3,AK,75808.0,245964.0,136494.0,47582.0,230602.0,302881.0,362767.0,13953.0,9917.0,...,0,0,1,0,0,0,0,0,0,0
4,AK,75808.0,245964.0,136494.0,47582.0,230602.0,302881.0,362767.0,13953.0,9917.0,...,0,0,0,1,0,0,0,0,0,0
5,AK,75808.0,245964.0,136494.0,47582.0,230602.0,302881.0,362767.0,13953.0,9917.0,...,0,0,0,0,0,0,1,0,0,0
6,AL,1306230.0,2659740.0,1837150.0,768976.0,2790094.0,3888074.0,5037070.0,856943.0,30872.0,...,1,0,0,0,0,0,0,0,0,0
7,AL,1306230.0,2659740.0,1837150.0,768976.0,2790094.0,3888074.0,5037070.0,856943.0,30872.0,...,0,0,0,0,0,0,0,0,0,0
8,AL,1306230.0,2659740.0,1837150.0,768976.0,2790094.0,3888074.0,5037070.0,856943.0,30872.0,...,0,1,0,0,0,0,0,0,0,0
9,AL,1306230.0,2659740.0,1837150.0,768976.0,2790094.0,3888074.0,5037070.0,856943.0,30872.0,...,0,0,0,0,0,0,0,0,1,0
