# **Imports**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
import collections
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from IPython.core.interactiveshell import InteractiveShell
import warnings

#importing packages for modeling
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization

%matplotlib inline
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings('ignore')

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/bhargavaramarajudandu/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/bhargavaramarajudandu/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [None]:
# Display names for the two target classes, listed in encoded-label order (0, 1).
target_names=['Non-Persistent', 'Persistent']

# Functions

In [None]:
def evaluation_metrics(y_test, y_pre, target_names, y_score=None):
    """Print classification metrics and plot the ROC curve and confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True class labels (binary).
    y_pre : array-like
        Predicted class labels.
    target_names : list of str
        Display names of the classes, used in the classification report and as
        the row/column labels of the confusion matrix.
    y_score : array-like, optional
        Continuous scores or probabilities for the positive class. When given,
        the ROC curve and AUC are computed from these values. When omitted
        (the default) they are computed from ``y_pre``, exactly as before.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Scores
    print("Accuracy :", accuracy_score(y_test, y_pre))
    print("Precision :", precision_score(y_test, y_pre))
    print("Recall :", recall_score(y_test, y_pre))
    print("F1 Score :", f1_score(y_test, y_pre))

    print(classification_report(y_test, y_pre, target_names=target_names))

    # AUC. With hard 0/1 predictions the ROC curve has a single interior point;
    # pass y_score to obtain the full curve.
    roc_input = y_pre if y_score is None else y_score
    fpr, tpr, _ = roc_curve(y_test, roc_input)
    auc = roc_auc_score(y_test, roc_input)
    print("AUC :", auc)

    # ROC. The legend label used to be the garbled text "uc=0.xxx)".
    plt.plot(fpr, tpr, label="ROC (AUC={:.3f})".format(auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc=4)
    plt.show()

    # Confusion matrix, labelled with the class display names
    matrix = confusion_matrix(y_test, y_pre)
    cm = pd.DataFrame(matrix, index=target_names, columns=target_names)

    sns.heatmap(cm, annot=True, cbar=None, cmap="Blues", fmt='g')
    plt.title("Confusion Matrix")
    plt.ylabel("True Class")
    plt.xlabel("Predicted Class")
    # Lay out after the labels exist so they are included in the fit.
    plt.tight_layout()
    plt.show()

In [None]:
def logistic(X_train,X_test,y_train,y_test):
    """Fit a default LogisticRegression on the training split and report test metrics.

    The predictions for ``X_test`` are passed to ``evaluation_metrics`` together
    with the notebook-level ``target_names``. Nothing is returned.
    """
    classifier = LogisticRegression().fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    evaluation_metrics(y_test, predictions, target_names)

In [None]:
def Ridge(X_train,X_test,y_train,y_test):
    """Fit a RidgeClassifier (random_state=2) on the training split and report test metrics.

    The predictions for ``X_test`` are passed to ``evaluation_metrics`` together
    with the notebook-level ``target_names``. Nothing is returned.
    """
    classifier = RidgeClassifier(random_state=2).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    evaluation_metrics(y_test, predictions, target_names)
  

# Reading data

In [None]:
# Open the workbook and load its 'Dataset' sheet into the working frame.
xls = pd.ExcelFile('Healthcare_dataset.xlsx')
df = pd.read_excel(xls, sheet_name='Dataset')

# **Data Understanding**

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

In [None]:
# Normalise every column name to lower case.
df.columns = df.columns.map(str.lower)

### **Analyzing dependency of variable (Before Transformation)**

In [None]:
# Class balance of the raw target, as a percentage of its non-null rows.
# Counts are looked up by label: value_counts() orders by frequency, so the
# former positional classes[0] / classes[1] matched the printed labels only
# while 'Non-Persistent' was the majority class, and positional access on a
# string-indexed Series is deprecated in recent pandas.
classes = df['persistency_flag'].value_counts()
labelled_rows = df['persistency_flag'].count()
normal_share = round(classes.get('Non-Persistent', 0) / labelled_rows * 100, 2)
fraud_share = round(classes.get('Persistent', 0) / labelled_rows * 100, 2)
print("Non-Persistent : {} %".format(normal_share))
print("Persistent : {} %".format(fraud_share))

In [None]:
# Integer-encode every column with pd.factorize, then keep each column's
# Pearson correlation with the (encoded) target.
cat_corr = (
    df.apply(lambda column: pd.factorize(column)[0])
    .corr(method='pearson', min_periods=1)
    .loc[:, ['persistency_flag']]
)
# Rank columns by absolute correlation with the target, strongest first.
cat_corr.abs().sort_values(by='persistency_flag', ascending=False)

## **Missing Values**

In [None]:

# Number of missing values in each column.
df.isna().sum()

## **Outlier Analysis**

In [None]:
# Histogram of dexa_freq_during_rx with a marginal box plot above it.
fig = px.histogram(
    data_frame=df,
    x="dexa_freq_during_rx",
    marginal="box",  # alternatives: "violin", "rug"
    hover_data=df.columns,
)
fig.show()

In [None]:
# Histogram of count_of_risks with a marginal box plot above it.
fig = px.histogram(
    data_frame=df,
    x="count_of_risks",
    marginal="box",  # alternatives: "violin", "rug"
    hover_data=df.columns,
)
fig.show()

In [None]:
# Box plot of count_of_risks, split by persistency_flag.
var = "count_of_risks"
plt.figure(figsize=(20, 10))
sns.boxplot(data=df, x=var, y="persistency_flag")

In [None]:
# Box plot of dexa_freq_during_rx, split by persistency_flag.
var = "dexa_freq_during_rx"
plt.figure(figsize=(20, 10))
sns.boxplot(data=df, x=var, y="persistency_flag")

In [None]:
# Skewness and kurtosis of count_of_risks.
print("Count of risks skweness: ", df["count_of_risks"].skew())
print("Count of risks Kurtosis: ", df["count_of_risks"].kurtosis())

## This column shows a moderate positive skew and is fairly platykurtic,
## which suggests the data has few outliers.

In [None]:
# Skewness and kurtosis of dexa_freq_during_rx.
print("dexa_freq_during_rx skweness: ", df["dexa_freq_during_rx"].skew())
print("dexa_freq_during_rx Kurtosis: ", df["dexa_freq_during_rx"].kurtosis())
## Very strongly positively skewed, with very high kurtosis (leptokurtic).
## This suggests the presence of a lot of outliers.

In [None]:
# Standardize dexa_freq_during_rx and inspect both tails of the distribution.
# The column is turned into an (n, 1) array explicitly: indexing a Series with
# [:, np.newaxis] is no longer supported by pandas 2.x.
dexa_scaled = StandardScaler().fit_transform(
    df['dexa_freq_during_rx'].to_numpy().reshape(-1, 1)
)
# Sort once and slice both ends from the same ordering.
dexa_order = dexa_scaled[:, 0].argsort()
low_range = dexa_scaled[dexa_order][:10]
high_range = dexa_scaled[dexa_order][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [None]:
# Replace dexa_freq_during_rx with its RobustScaler-transformed values.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split further down the notebook.
scaler = RobustScaler()
df['dexa_freq_during_rx'] = scaler.fit_transform(
    df['dexa_freq_during_rx'].to_numpy()[:, np.newaxis]
)

In [None]:
# Replace count_of_risks with its RobustScaler-transformed values.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split further down the notebook.
scaler = RobustScaler()
df['count_of_risks'] = scaler.fit_transform(
    df['count_of_risks'].to_numpy()[:, np.newaxis]
)

In [None]:
# Detection: flag dexa_freq_during_rx outliers with the 1.5 * IQR rule.
# Series.quantile accepts the same 'midpoint' interpolation as the previous
# np.percentile(..., interpolation=...) call, whose keyword NumPy has deprecated.
dexa_values = df['dexa_freq_during_rx']
Q1 = dexa_values.quantile(0.25, interpolation='midpoint')
Q3 = dexa_values.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1

print("Old Shape: ", df.shape)

# Boolean masks stay aligned with df's rows regardless of its index labels.
is_upper = dexa_values >= (Q3 + 1.5 * IQR)
is_lower = dexa_values <= (Q1 - 1.5 * IQR)

# Row positions of the flagged values, kept for the printed report.
upper = np.where(is_upper)
lower = np.where(is_lower)

print("lower",lower[0])
print("Upper",upper[0])

# Removal: filter with the masks instead of df.drop(positions). drop() takes
# index labels, not positions, and two sequential drops raise KeyError when a
# row is flagged by both bounds (possible when IQR == 0).
df = df[~(is_upper | is_lower)].reset_index(drop=True)

print("New Shape: ", df.shape)

In [None]:
# Detection: flag count_of_risks outliers with the 1.5 * IQR rule.
# Series.quantile accepts the same 'midpoint' interpolation as the previous
# np.percentile(..., interpolation=...) call, whose keyword NumPy has deprecated.
risk_values = df['count_of_risks']
Q1 = risk_values.quantile(0.25, interpolation='midpoint')
Q3 = risk_values.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1

print("Old Shape: ", df.shape)

# Boolean masks stay aligned with df's rows regardless of its index labels.
is_upper = risk_values >= (Q3 + 1.5 * IQR)
is_lower = risk_values <= (Q1 - 1.5 * IQR)

# Row positions of the flagged values, kept for the printed report.
upper = np.where(is_upper)
lower = np.where(is_lower)

print("lower",lower[0])
print("Upper",upper[0])

# Removal: filter with the masks instead of df.drop(positions). drop() takes
# index labels, not positions, and two sequential drops raise KeyError when a
# row is flagged by both bounds (possible when IQR == 0).
df = df[~(is_upper | is_lower)].reset_index(drop=True)

print("New Shape: ", df.shape)

## **Describe Data**

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include=[object])

In [None]:
# Mean of the numeric columns per persistency_flag. numeric_only=True is needed
# on pandas >= 2.0, where mean() no longer drops non-numeric columns silently.
df.groupby(['persistency_flag']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per gender. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer drops non-numeric columns silently.
df.groupby(['gender']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per race. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer drops non-numeric columns silently.
df.groupby(['race']).mean(numeric_only=True)

In [None]:
# Mean of the numeric columns per ethnicity. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer drops non-numeric columns silently.
df.groupby(['ethnicity']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per age_bucket. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer drops non-numeric columns silently.
df.groupby(['age_bucket']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per ntm_speciality. numeric_only=True is needed
# on pandas >= 2.0, where mean() no longer drops non-numeric columns silently.
df.groupby(['ntm_speciality']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per ntm_specialist_flag. numeric_only=True is
# needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['ntm_specialist_flag']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per ntm_speciality_bucket. numeric_only=True is
# needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['ntm_speciality_bucket']).mean(numeric_only=True).T

In [None]:
# NOTE(review): this cell groups by ntm_speciality_bucket a second time, as the
# cell directly before it does. Confirm whether another column was intended.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer drops
# non-numeric columns silently.
df.groupby(['ntm_speciality_bucket']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_chronic_liver_disease. numeric_only=True
# is needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['risk_chronic_liver_disease']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_family_history_of_osteoporosis.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer drops
# non-numeric columns silently.
df.groupby(['risk_family_history_of_osteoporosis']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_low_calcium_intake. numeric_only=True is
# needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['risk_low_calcium_intake']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_vitamin_d_insufficiency.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer drops
# non-numeric columns silently.
df.groupby(['risk_vitamin_d_insufficiency']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_excessive_thinness. numeric_only=True is
# needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['risk_excessive_thinness']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_hysterectomy_oophorectomy.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer drops
# non-numeric columns silently.
df.groupby(['risk_hysterectomy_oophorectomy']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_estrogen_deficiency. numeric_only=True
# is needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['risk_estrogen_deficiency']).mean(numeric_only=True).T

In [None]:
# Mean of the numeric columns per risk_immobilization. numeric_only=True is
# needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['risk_immobilization']).mean(numeric_only=True).T

In [None]:

# Mean of the numeric columns per risk_recurring_falls. numeric_only=True is
# needed on pandas >= 2.0, where mean() no longer drops non-numeric columns
# silently.
df.groupby(['risk_recurring_falls']).mean(numeric_only=True).T

# **Data Wrangling , Transformation and Standardization**

In [None]:
# Remove the ptid column from the working frame.
df = df.drop(columns=['ptid'])

In [None]:
# Encode every 'N' / 'Y' value in the frame as 0 / 1.
mapper = {'N': 0, 'Y': 1}
# infer_objects() converts columns that now hold only integers to a numeric
# dtype explicitly; recent pandas deprecates replace() doing that downcast
# implicitly, which would otherwise leave those columns as object dtype.
df = df.replace(mapper).infer_objects()

In [None]:
# Encode the target: 'Non-Persistent' -> 0, 'Persistent' -> 1.
# infer_objects() makes the resulting column numeric explicitly instead of
# relying on replace()'s implicit downcast, which recent pandas deprecates.
df['persistency_flag'] = (
    df['persistency_flag']
    .replace(['Non-Persistent', 'Persistent'], [0, 1])
    .infer_objects()
)
df.head()

### **Analyzing dependency of variable (After Transformation)**

In [None]:
# Absolute pairwise correlations, ordered by strength of correlation with the
# target. Only numeric/bool columns are correlated: DataFrame.corr() raises on
# object columns in pandas >= 2.0 instead of silently skipping them.
(
    df.select_dtypes(include=['number', 'bool'])
    .corr()
    .abs()
    .sort_values(by=['persistency_flag'], ascending=False)
)

In [None]:
# Heatmap of the pairwise correlations between the numeric/bool columns.
# DataFrame.corr() raises on object columns in pandas >= 2.0, so they are
# excluded explicitly. Binding the subplots result also stops the
# (Figure, Axes) tuple from being echoed as cell output.
corr_fig, corr_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), ax=corr_ax)

### *Creating Dummy values*

In [None]:
# Separate the target from the predictors and one-hot encode the predictors.
y = df['persistency_flag']
X = pd.get_dummies(df.drop(columns=['persistency_flag']))

# Hold out 30% of the rows as the test split, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Training predictors and target recombined into one frame.
df_train = X_train.assign(persistency_flag=y_train)
df_train.head()

### *Overcoming the Imbalanced Dataset*

In [None]:
# Class balance of the training target, as a percentage of its non-null rows.
classes = df_train['persistency_flag'].value_counts()
normal_share = round(classes.loc[0] / df_train['persistency_flag'].count() * 100, 2)
fraud_share = round(classes.loc[1] / df_train['persistency_flag'].count() * 100, 2)
for share_line in (
    "Non-Persistent : {} %".format(normal_share),
    "Persistent : {} %".format(fraud_share),
):
    print(share_line)

In [None]:
# Histogram of the training target, coloured by class.
fig = px.histogram(
    data_frame=df_train,
    x="persistency_flag",
    color="persistency_flag",
    title="Persistent class histogram",
)
fig.show()

### *Upsampling*

In [None]:
# Upsampling: draw class-1 rows with replacement until there are as many of
# them as there are class-0 rows (fixed seed for reproducible results).
df_minority_upsampled = resample(
    df_train.loc[df_train['persistency_flag'] == 1],
    replace=True,
    n_samples=int((df_train['persistency_flag'] == 0).sum()),
    random_state=123,
)

# Training frame = all class-0 rows followed by the resampled class-1 rows.
df_train = pd.concat(
    [df_train.loc[df_train['persistency_flag'] == 0], df_minority_upsampled]
)

# Display new class counts
df_train['persistency_flag'].value_counts()

In [None]:
# Split the resampled training frame back into predictors and target.
y_train = df_train.loc[:, 'persistency_flag']
X_train = df_train.drop(columns=['persistency_flag'])

In [None]:
# Histogram of the training target after resampling, coloured by class.
fig = px.histogram(
    data_frame=df_train,
    x="persistency_flag",
    color="persistency_flag",
    title="Persistent class histogram",
)
fig.show()