In [66]:
import pandas as pd
import matplotlib as plot
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
import re

In [82]:
#import data from csv
base_path = "/Users/apple/Documents/gatech/humana_analytics/humana_2018/" # path to humana folder

data = pd.read_csv(base_path + "/data/TAMU_FINAL_DATASET_2018.csv")

In [148]:
data.head()
data.describe() 
data.shape
#contains one lakh rows

(100000, 448)

Take Aways -

1. One Lakh rows
2. Check for NAs, because there are - Impute them!
3. Age distribution of the population is on the older side - min age is 40, max is 95

In [85]:
#Cleaning the dataset a bit to make it more general

#standardise column names
data.columns = [s.lower() for s in data.columns]

# find columns with na values
def showmissing(df_train):
    return df_train.columns[df_train.isnull().any()].tolist()

#missing data percentages
(data[showmissing(data)].isnull().sum().sort_values(ascending = False)/100000)*100


diab_type                        64.693
decile_struggle_med_lang         25.215
population_density_centile_us    12.179
online_purchaser                 12.179
dwelling_type                    12.179
education_level                  12.179
length_residence                 12.179
population_density_centile_st    12.179
num_person_household             12.179
college                          12.179
est_bmi_decile                   12.179
online_user                      12.179
pct_above_poverty_line           12.179
pct_below_poverty_line           12.179
home_value                       12.179
est_net_worth                    12.179
est_income                       12.179
index_health_ins_engage          12.179
index_health_ins_influence       12.179
pcp_assignment                    0.157
mco_hlvl_plan_cd                  0.033
mco_prod_type_cd                  0.033
hospice_ind                       0.032
esrd_ind                          0.032
lis                               0.030


In [9]:
'''


data['recon_rx_risk_score_nbr'].describe() 
data['recon_ma_risk_score_nbr'].describe()
data['population_density_centile_st']

'''

"\n\n\ndata['recon_rx_risk_score_nbr'].describe() \ndata['recon_ma_risk_score_nbr'].describe()\ndata['population_density_centile_st']\n\n"

In [86]:
#more exploration with respect to target variable

# We can see that ami_flag is relatively varied among Males and Females. 
pd.crosstab(index=data['sex_cd'], columns=data['ami_flag'], margins=True)
 

ami_flag,0,1,All
sex_cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,55476,1292,56768
M,41771,1431,43202
All,97247,2723,99970


In [87]:
### Cleaning sex_cd Column

# Cleaning sex_cd - there are 0.03% NAs in this column. Lets see how we can impute it - or we can remove those rows.
data.sex_cd.unique() #there are no "U" as expected according to data dictionary. Assuming NAs to be Us
data.sex_cd.isnull().sum()  # 30 values. 

#Interestingly all sex_cd = NAs are people with age=40
data[data["sex_cd"].isnull()]['age']

buffer = data[data["sex_cd"].isnull()]

#We can see that we cannot totally remove the rows believing it to be a data problem because only 10 of the columns are totally NA. 
(buffer[showmissing(buffer)].isnull().sum().sort_values(ascending = False)/30)*100

# For now lets change all the sex_cd = NA with "U"

data["sex_cd"].fillna("U", inplace = True)
data.sex_cd.unique() # F- Female, M- Male, U - Unknown


array(['F', 'M', 'U'], dtype=object)

In [117]:
## Handling all the NAs value in the dataset. 

#Step1: Understand why the data has NA. Can it mean something while keeping the domain in mind
#Step2: Action to take - remove the rows or impute?
#Step3: Return a dataset with no NAs

col_with_nas = showmissing(data)

# mco_hlvl_plan_cd and mco_hlvl_type_cd
data.mco_hlvl_plan_cd.unique()  # ['MAPD', 'MA', nan]
data.mco_prod_type_cd.unique()  # ['LPPO', 'PFFS', 'HMO', 'RPPO', nan]

#both are NAs for the same person
data[data["mco_hlvl_plan_cd"].isnull()]['id'] == data[data["mco_prod_type_cd"].isnull()]['id'] 

# Decision - Both of these variables are intutively less likely to determine whether a patient will get AMI. 
# So let the NAs be. 

data[data["est_income"].isnull()]
data.pcp_assignment.unique()

#diab_type : Hypothesis - this is NULL whenever diabetes == 0 
sum(data[data['diab_type'].isnull()]['id'] == data[data['diabetes'] == 0]['id']) == sum(data['diabetes'] == 0)
data["diab_type"].fillna("NO", inplace = True)

#esrd_ind 


ValueError: Can only compare identically-labeled Series objects

In [123]:
trial_cols = ["id","sex_cd","age","population_density_centile_us" , "online_purchaser", "dwelling_type","education_level","length_residence","population_density_centile_st","num_person_household","college","est_bmi_decile","online_user","pct_above_poverty_line","pct_below_poverty_line","home_value","est_net_worth","est_income","index_health_ins_engage", "index_health_ins_influence"]   

#df_col = data[trial_cols]
#df_col[df_col["population_density_centile_st"].isnull()] 

#sorted(col_with_nas)
# Remarks - All the NAs for these columns occur for the same people. This has to be a data collection error! 
# 12179 rows

In [130]:
trial_cols1 = ["lis","institutional","dual","orig_reas_entitle_cd","id","age","sex_cd"]
df_col1 = data[trial_cols1]
df_col1[df_col1["lis"].isnull()]

#These are the same set of people who had their "sex_cd" as NA. Data source must be the same as sex_cd!

Unnamed: 0,lis,institutional,dual,orig_reas_entitle_cd,id,age,sex_cd
5040,,,,,5041,40,U
6049,,,,,6050,40,U
6573,,,,,6574,40,U
15596,,,,,15597,40,U
16040,,,,,16041,40,U
16157,,,,,16158,40,U
17056,,,,,17057,40,U
18352,,,,,18353,40,U
23929,,,,,23930,40,U
32216,,,,,32217,40,U


In [129]:
#trial_cols2 = ["hospice_ind","esrd_ind","id","age","sex_cd"]
trial_cols2 = ["mco_hlvl_plan_cd","mco_prod_type_cd","id","age","sex_cd"]

df_col2 = data[trial_cols2]
df_col2[df_col2["mco_hlvl_plan_cd"].isnull()]

# For ["hospice_ind","esrd_ind"] - Same people like above, except two Males - id = [68995,99088]
# For ["mco_hlvl_plan_cd","mco_prod_type_cd"] - Same like above except three Males - id [19348,55303,78200]

Unnamed: 0,mco_hlvl_plan_cd,mco_prod_type_cd,id,age,sex_cd
5040,,,5041,40,U
6049,,,6050,40,U
6573,,,6574,40,U
15596,,,15597,40,U
16040,,,16041,40,U
16157,,,16158,40,U
17056,,,17057,40,U
18352,,,18353,40,U
19347,,,19348,84,M
23929,,,23930,40,U


In [144]:
# Final NAs analysis summary

#ids_30 - ids of people who have their sex_cd = U and ["lis","institutional","dual","orig_reas_entitle_cd"] = NA
ids_30 = list(data[data.sex_cd == "U"]["id"] )

#ids_2 - addition to ids_30, two more person who have ["hospice_ind","esrd_ind"] = NA
ids_2 = list(data[(data.sex_cd != "U") & (data["hospice_ind"].isnull())]["id"])

#ids_3 - addition to ids_30, three more person who have ["mco_hlvl_plan_cd","mco_prod_type_cd"] = NA
ids_3 = list(data[(data.sex_cd != "U") & (data["mco_hlvl_plan_cd"].isnull())]["id"])

#pcp_assignment - Waiting to hear back from Humana Team on the meaning behind this

# All Variables with 12.79% NAs are demographic variables!
#["population_density_centile_us" , "online_purchaser", "dwelling_type","education_level","length_residence","population_density_centile_st","num_person_household","college","est_bmi_decile","online_user","pct_above_poverty_line","pct_below_poverty_line","home_value","est_net_worth","est_income","index_health_ins_engage", "index_health_ins_influence"]   



In [141]:
# filtering out variables of interest 

#personal details
primary_cols = ["id","age","sex_cd"]
#demographic cols with NAs
demographic_cols = ["population_density_centile_us" , "online_purchaser", "dwelling_type","education_level","length_residence","population_density_centile_st","num_person_household","college","est_bmi_decile","online_user","pct_above_poverty_line","pct_below_poverty_line","home_value","est_net_worth","est_income","index_health_ins_engage", "index_health_ins_influence"]



In [162]:
# Removing 32 persons whose some imp health variables are NAs
# For now removing it, we can think of how to improve this later

remove_ids = ids_30 + ids_2 + ids_3
data = data[~data["id"].isin(remove_ids)]
data.shape

(99965, 448)

In [7]:
#Split into train and test

target_df = data['ami_flag']
variable_df = data.drop('ami_flag', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(variable_df, target_df, test_size=0.2) #20% data in test

In [8]:
# Breaking up the variables into groups so it is easy to fit in the models

