## Homework Assignment for Graduate Course in Healthcare Analytics
- Using DE-SynPUF files downloaded from https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/SynPUFs/DE_Syn_PUF.html
- Assignment: use Naive Bayes methods to predict depression (`SP_DEPRESSN`) as the response/target
- Using the dataset already transformed in the previous homework assignment

In [1]:
# import and read file, use NaN for elements with no values
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

# Load the combined beneficiary summary file. The strings 'no info' and '.'
# are read as NaN, in addition to pandas' default missing-value markers.
df = pd.read_csv(
    "combined_ben_sum_AddYear.csv",
    na_values=["no info", "."],
)

In [None]:
# Columns needed to answer assignment question #4: the beneficiary id, the
# record year, and the BENE_* / SP_* attributes used in the cells below.
question_4_columns = [
    'DESYNPUF_ID', 'Year', 'BENE_BIRTH_DT', 'BENE_SEX_IDENT_CD', 'BENE_RACE_CD',
    'BENE_ESRD_IND', 'SP_STATE_CODE', 'BENE_COUNTY_CD',
    'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN',
    'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA',
]
newdf = df[question_4_columns]

In [None]:
# Collect the rows that repeat an earlier row in every column except Year.
# The same patient can appear once per year (2008-2010) with identical
# conditions; counting those repeats would skew the results. A row is only
# "new" when some non-Year column changed (e.g. a newly developed condition).
# The comparison columns are derived from the frame itself so this list cannot
# drift out of sync with the column selection used to build newdf.
non_year_columns = [col for col in newdf.columns if col != 'Year']
# duplicated() already returns a boolean mask, so it is used directly.
dup_df = newdf[newdf.duplicated(non_year_columns)].sort_values(by='DESYNPUF_ID')

In [None]:
# Drop rows that are identical to an earlier row in every column except Year,
# keeping the first occurrence (author's note: 1424410 rows removed).
dedup_subset = [name for name in newdf.columns if name != 'Year']
df_no_dup_records_1 = newdf.drop_duplicates(subset=dedup_subset, keep='first')

In [None]:
# Intent: drop all records after a patient's first depression diagnosis, since
# conditions that appear after the diagnosis cannot be predictors or causes of it.
# What the code does: keeps the first row for each distinct
# (DESYNPUF_ID, BENE_BIRTH_DT, SP_DEPRESSN) combination, i.e. at most one row
# per patient per depression-flag value.
# NOTE(review): "first" means first in the current row order; this is the
# earliest year only if the input rows are ordered by Year -- confirm.
# NOTE(review): a patient whose flag changes from yes back to no keeps both
# rows, so a post-diagnosis record can survive -- confirm this is acceptable.
# .copy() gives an independent frame so the column assignments in later cells
# do not trigger pandas' SettingWithCopyWarning.
df_no_dup_records = df_no_dup_records_1.drop_duplicates(
    ['DESYNPUF_ID', 'BENE_BIRTH_DT', 'SP_DEPRESSN'], keep='first'
).copy()

In [None]:
# Age = record year (Year column) minus the year of BENE_BIRTH_DT, which is
# parsed with the YYYYMMDD format.
# The birth dates are read from df_no_dup_records itself rather than from the
# larger, un-deduplicated newdf: the result no longer depends on index
# alignment between two frames, rows that were already dropped are not parsed,
# and the subtraction is not upcast to float by NaN-filled unmatched rows.
birth_year = pd.to_datetime(df_no_dup_records['BENE_BIRTH_DT'], format='%Y%m%d').dt.year
df_no_dup_records['Age'] = df_no_dup_records['Year'] - birth_year

In [None]:
# Preview the first five rows to confirm the Age column looks as expected
display(df_no_dup_records.head())

In [None]:
# The chronic-condition columns arrive coded 1 = yes, 2 = no.
# Recode 2 -> 0 so that every flag is 0 = no, 1 = yes.
# Columns are addressed by name instead of by position so the recode keeps
# targeting the right columns if the column order ever changes.
chronic_condition_cols = [
    'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN',
    'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA',
]
df_no_dup_records.loc[:, chronic_condition_cols] = (
    df_no_dup_records.loc[:, chronic_condition_cols].replace(2, 0)
)
# For BENE_ESRD_IND replace 'Y' with 1. Per the original author's note, 'N' is
# already stored as 0 in this dataset (presumably from the earlier
# transformation step -- not verifiable from this notebook).
df_no_dup_records.loc[:, 'BENE_ESRD_IND'] = df_no_dup_records.loc[:, 'BENE_ESRD_IND'].replace('Y', 1)

In [None]:
# Preview the first five rows to confirm the 0/1 recoding
display(df_no_dup_records.head())

## Exploratory Data Analysis
- see HW8 for EDA on this dataset
- The classes are imbalanced, so random under-sampling is used to balance them
- good summary here to learn more (https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Class balance of the target (SP_DEPRESSN) before any resampling
df_no_dup_records.SP_DEPRESSN.value_counts()

In [None]:
# Class counts, looked up by label.
# value_counts() orders its result by frequency, not by label, so unpacking it
# positionally would silently swap the two counts whenever class 1 is the
# larger class (and would raise if the column held anything but exactly two
# distinct values). A class that is absent gets a count of 0.
class_counts = df_no_dup_records['SP_DEPRESSN'].value_counts()
count_class_0 = int(class_counts.get(0, 0))
count_class_1 = int(class_counts.get(1, 0))

# Divide by class
df_class_0 = df_no_dup_records[df_no_dup_records['SP_DEPRESSN'] == 0]
df_class_1 = df_no_dup_records[df_no_dup_records['SP_DEPRESSN'] == 1]

In [None]:
# Random under-sampling: draw as many class-0 rows as there are class-1 rows,
# then stack the two classes into one balanced frame.
# A fixed seed (the same value used for the train/test split) makes the sample,
# and therefore every downstream result, reproducible between runs.
df_class_0_under = df_class_0.sample(n=count_class_1, random_state=0)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.SP_DEPRESSN.value_counts())

# Trailing semicolon suppresses the Axes repr in the notebook output
df_test_under.SP_DEPRESSN.value_counts().plot(kind='bar', title='Count (SP_DEPRESSN)');

## Naive Bayes Model

In [None]:
# Build the feature matrix and target vector.
# Columns are selected by name (same columns, same order as the positional
# indices used previously) so the selection cannot silently shift if a column
# is added, removed, or reordered upstream.
feature_cols = [
    'BENE_SEX_IDENT_CD', 'BENE_RACE_CD', 'BENE_ESRD_IND',
    'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
    'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA',
    'Age',
]
X = df_test_under[feature_cols].values
y = df_test_under['SP_DEPRESSN'].values

# Split into training (70%) and test (30%) sets with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features. The scaler is fitted on the training set only and
# then applied to the test set, so no test-set statistics leak into training.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
# Fit a Gaussian Naive Bayes classifier (classifierObj) on the training set.
# No hyperparameters are passed, so the estimator's defaults are used.
from sklearn.naive_bayes import GaussianNB
classifierObj = GaussianNB()
classifierObj.fit(X=X_train, y=y_train)