# D209 Performance Assessment - Task 1: Classification Analysis
### by Bader Ale 

# ▶ Research Question
Our research question is as follows: __Given certain patient characteristics, can we classify whether a patient is hypertensive or not?__

In [None]:
# Importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from IPython.core.interactiveshell import InteractiveShell # Importing so we can change how cell results are displayed
InteractiveShell.ast_node_interactivity = "all" # Display the result of every top-level expression in a cell, not only the last one

# Lift pandas' display limits so frames are shown without column/row truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas/scikit-learn deprecation and copy warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the original (raw) dataset into the working frame
# The location is machine-specific; edit RAW_DATA_PATH when running elsewhere.
RAW_DATA_PATH = 'F:/GitHub Repos/WGU_MSDA/D209_Data Mining I/medical_clean.csv'
df = pd.read_csv(RAW_DATA_PATH)

# ▶ Data Cleaning  

In [None]:
# Preview the first rows of the dataset as loaded
df.head()

In [None]:
# Columns left out of the analysis: identifiers, location fields, survey items
# and other variables not used as explanatory features.
unused_columns = [
    'CaseOrder', 'Customer_id', 'Interaction', 'UID',
    'City', 'State', 'County', 'Zip', 'Lat', 'Lng',
    'Population', 'TimeZone', 'Job', 'Children', 'ReAdmis',
    'Full_meals_eaten', 'vitD_supp', 'Soft_drink', 'Additional_charges',
    'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
]
df = df.drop(columns=unused_columns)

In [None]:
# Preview the frame again now that the unused columns have been dropped
df.head()

## Detection and Treatment of Nulls

In [None]:
# Count the missing values in each remaining column
df.isna().sum()

## Detection and Treatment of Duplicated Values

In [None]:
# Flag rows that repeat an earlier row, then tally flagged vs. unflagged rows
duplicate_flags = df.duplicated()
duplicate_flags.value_counts()

## Detection and Treatment of Outliers

In [None]:
# List each column's dtype to see which variables are numeric
df.dtypes

In [None]:
# Scaling data to make it easier to visualize outliers
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale; listed once so the scaler input and the
# resulting column labels cannot drift apart.
numeric_cols = ['Age', 'Income', 'VitD_levels', 'Doc_visits', 'Initial_days', 'TotalCharge']

scaler = MinMaxScaler()
# fit_transform() both fits the scaler and transforms the data, so a separate
# fit() call beforehand is redundant.
df_scaled = pd.DataFrame(scaler.fit_transform(df[numeric_cols]), columns=numeric_cols)

In [None]:
# Visualizing Outliers
# Pass the frame by keyword: the first positional parameter of sns.boxplot is
# not `data` in every seaborn release, so `data=` keeps the wide-form plot
# (one box per column) working across versions.
ax = sns.boxplot(data=df_scaled)
# The y-axis shows the min-max scaled values of each variable, not counts.
ax.set(title='Numeric Variables',
       ylabel='Scaled Value (Min-Max)',
       xlabel='Explanatory Variables');

In [None]:
# Removing Outliers using z-scores
import scipy.stats as stats

# Keep the z-scores in standalone variables instead of adding helper columns
# to df and dropping them again afterwards.
income_z = stats.zscore(df['Income'])
vitd_z = stats.zscore(df['VitD_levels'])

# Keeping only records with -3 < z < 3 for both variables (all others are removed)
within_limits = (np.abs(income_z) < 3) & (np.abs(vitd_z) < 3)

# .copy() makes the filtered frame independent of the original, so later column
# assignments do not raise SettingWithCopyWarning.
df = df[within_limits].copy()

In [None]:
# Preview the frame after the z-score outlier filtering
df.head()

# ▶ Data Wrangling  

In [None]:
# List the dtypes again to identify the columns that still need encoding
df.dtypes

In [None]:
# Re-express the nominal variables as dummy (indicator) columns
nominal_columns = [
    'Area', 'Marital', 'Gender', 'Initial_admin', 'HighBlood',
    'Stroke', 'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia',
    'BackPain', 'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis',
    'Asthma', 'Services',
]
# drop_first=True omits the first level of every variable to avoid redundant columns
df = pd.get_dummies(data=df, columns=nominal_columns, drop_first=True)

In [None]:
# Using Ordinal Encoding for ordinal variables
from sklearn.preprocessing import OrdinalEncoder

# With the default categories='auto' the encoder sorts the labels alphabetically,
# which would map High=0, Low=1, Medium=2 and lose the real ordering. Spell the
# order out so the codes increase with risk: Low=0, Medium=1, High=2.
# NOTE(review): assumes the column contains exactly 'Low', 'Medium' and 'High';
# fit_transform raises ValueError on any other label - confirm against the data.
risk_levels = ['Low', 'Medium', 'High']

enc = OrdinalEncoder(categories=[risk_levels])
complication_encoded = enc.fit_transform(df[['Complication_risk']])
# The encoder returns an (n, 1) array; flatten it before assigning to one column
df['Complication_risk'] = complication_encoded.ravel()

In [None]:
# Preview the fully encoded frame
df.head()

In [None]:
# Exporting cleaned dataset
# to_csv() returns None when it is given a file path, so its result is not
# bound to a name; the file on disk is the only output of this cell.
df.to_csv('cleaned_dataset.csv', index = False)

# ▶ Classification Analysis using k-Nearest Neighbor

In [None]:
# Separate the explanatory features (X) from the target label (y) as NumPy arrays
X = df.drop(columns='HighBlood_Yes').to_numpy()
y = df.loc[:, 'HighBlood_Yes'].to_numpy()

In [None]:
# Splitting data into train and test
from sklearn.model_selection import train_test_split

# 70/30 split; stratifying on y keeps the class proportions the same in both
# subsets, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
    stratify=y,
)

In [None]:
# Report the dimensions of each split, one line per array
print(
    f'The shape of the X-training dataset is: {X_train.shape}',
    f'The shape of the X-testing dataset is: {X_test.shape}',
    f'The shape of the Y-training dataset is: {y_train.shape}',
    f'The shape of the Y-testing dataset is: {y_test.shape}',
    sep='\n',
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# k-NN classifies by distance, so features on large scales (e.g. Income,
# TotalCharge) would otherwise dominate the 0/1 indicator columns. Chaining a
# scaler in front of the classifier puts every feature on the same [0, 1] range.
# The pipeline fits the scaler on the training data only and re-applies it
# inside predict()/score(), so the test set never influences the scaling.
# `knn` still exposes fit/predict/predict_proba/score; the bare classifier is
# available as knn[-1].
knn = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors = 3))
knn.fit(X_train,y_train)

In [None]:
# Predict the class of every test record and show the resulting array
test_predictions = knn.predict(X_test)
print(f'The prediction array is: {test_predictions}')

In [None]:
# Score the fitted model on the held-out test set
test_accuracy = knn.score(X_test, y_test)
print(f'The prediction score for the model is: {test_accuracy}')

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Predict once and reuse the result for every label-based metric
y_pred = knn.predict(X_test)

# ROC AUC ranks observations by a continuous score. Feeding it hard 0/1
# predictions collapses the curve to a single threshold, so use the predicted
# probability of the positive class (second column of predict_proba).
y_score = knn.predict_proba(X_test)[:, 1]

print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'\nClassification Report for the Model: \n{classification_report(y_test, y_pred)}')
print(f'\nThe area under the curve (AUC) is: {roc_auc_score(y_test, y_score)}')
