In [None]:
#---------------------------------------------------------------------------------#
# Graduate Program in Software                                                    #
# SEIS 763: Machine Learning                                                      #
# Group project                                                                   #
# Authors: 
# -- Yann Mulonda   
# -- David Vdvick
# --
# --                                                                              #
#---------------------------------------------------------------------------------#
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn import linear_model
import matplotlib.pyplot as plot
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix, r2_score, mean_squared_error
from sklearn.impute import SimpleImputer

In [None]:
# Load the patient records from the CSV file next to the notebook.
# No `header` argument is passed, so pandas applies its default header handling.
cellDNA = pd.read_csv(filepath_or_buffer="./healthcare_data.csv")
# Print a structural summary of the loaded frame.
cellDNA.info()

In [None]:
# Display descriptive statistics for the loaded data (DataFrame.describe()).
cellDNA.describe()

In [None]:
# Preview the first 10 rows of the loaded data.
cellDNA.head(n=10)

In [None]:
# Histogram of every column DataFrame.hist can plot, 20 bins each.
# The trailing semicolon keeps the array-of-Axes repr out of the cell output,
# so only the figure itself is shown.
cellDNA.hist(bins=20, figsize=(16, 8));

In [None]:
# Target (dependent variable): the "stroke" column.
cellDNA_Y = cellDNA.loc[:, "stroke"]

# Numeric predictors (standardized in a later cell).
cellDNA_X_numeric = cellDNA.loc[:, ["age", "avg_glucose_level", "bmi"]]

# Non-numeric (categorical) predictors (one-hot encoded in a later cell).
cellDNA_X_non_numeric = cellDNA.loc[
    :, ["gender", "work_type", "Residence_type", "smoking_status", "ever_married"]
]

# Predictors the notebook treats as binary flags.
cellDNA_X_binary = cellDNA.loc[:, ["hypertension", "heart_disease"]]

# Pairwise scatter plots of the numeric and binary attributes together with the target.
attributes = ["age", "hypertension", "heart_disease", "avg_glucose_level", "bmi", "stroke"]
# Trailing semicolon suppresses the array-of-Axes repr so only the figure is displayed.
scatter_matrix(cellDNA[attributes], figsize=(16, 8));

In [None]:
# Standardize the numeric predictors column by column (axis=0).
# NOTE(review): the scaling statistics are computed on the full dataset, before the
# train/test split done later in the notebook, so test rows influence the scaling.
# Confirm whether that leakage is acceptable for this project.
cellDNA_X_numeric_standard = preprocessing.scale(X=cellDNA_X_numeric, axis=0)
print(cellDNA_X_numeric_standard)

In [None]:
# Optional sanity check (currently disabled): overall mean of the standardized numeric data
# mean_numeric_data = cellDNA_X_numeric_standard.mean()
# print("Mean is", mean_numeric_data, sep= " : ")

In [None]:
# Keep only the object-dtype columns of the non-numeric frame. Nothing is
# standardized here; .copy() gives an independent frame for the encoding step.
cellDNA_X_non_numeric_stadard = cellDNA_X_non_numeric.select_dtypes(include=["object"]).copy()
cellDNA_X_non_numeric_stadard.head(n=5)

In [None]:
# One-hot encode the categorical columns. The same list is used for both the
# columns to encode and their prefixes; drop_first=True drops the first level
# of each column so the remaining indicators are not redundant.
# (The previous `.copy()` assignment was removed: it was overwritten on the
# very next line and never read.)
# NOTE(review): pd.get_dummies defaults to dummy_na=False, so a missing value
# becomes an all-zero row, which drop_first=True makes indistinguishable from
# the dropped baseline level. Confirm whether these columns contain NaN.
onehot_columns = ["gender", "work_type", "Residence_type", "smoking_status", "ever_married"]
cellDNA_X_non_numeric_stadard_onehot = pd.get_dummies(
    cellDNA_X_non_numeric_stadard,
    columns=onehot_columns,
    prefix=onehot_columns,
    drop_first=True,
)

print(cellDNA_X_non_numeric_stadard_onehot)

In [None]:
# Dummy-encode the binary flag columns (no standardization happens here).
# drop_first=True drops the first level of each column.
cellDNA_X_binary_standard_onehot = pd.get_dummies(
    cellDNA_X_binary,
    prefix=["hypertension", "heart_disease"],
    columns=["hypertension", "heart_disease"],
    drop_first=True,
)

print(cellDNA_X_binary_standard_onehot)

In [None]:
# Wrap the standardized numeric array in a DataFrame, restoring the column labels.
cellDNA_X_numeric_standard_dataframe = pd.DataFrame(
    cellDNA_X_numeric_standard,
    columns=["age", "avg_glucose_level", "bmi"],
)
print(cellDNA_X_numeric_standard_dataframe.head())

In [None]:
# Reassemble the full feature matrix: standardized numeric columns, one-hot
# categorical columns and encoded binary flags, joined side by side.
cellDNA_X_All = pd.concat(
    [
        cellDNA_X_numeric_standard_dataframe,
        cellDNA_X_non_numeric_stadard_onehot,
        cellDNA_X_binary_standard_onehot,
    ],
    axis="columns",
)

print(cellDNA_X_All)

In [None]:
# Fill missing feature values (NaN) with the mean of their column.
# Note: cellDNA_X_All is rebound to the imputer's output, replacing the
# DataFrame assembled in the previous cell.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
cellDNA_X_All = imp.fit(cellDNA_X_All).transform(cellDNA_X_All)
print(cellDNA_X_All)

In [None]:
def print_metrics(y_true, y_predict):
    """Print R2, MSE, accuracy, recall and precision for a set of predictions.

    R2 and MSE are computed directly on the label values. Accuracy, recall and
    precision are derived from the confusion matrix, unpacked as
    (tn, fp, fn, tp), so the inputs must produce a 2x2 matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predict : array-like
        Predicted labels for the same samples as ``y_true``.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the labels do not form exactly two
        classes). The original code raised the same exception type from tuple
        unpacking, with a less helpful message.
    """
    print(f'R2 Score: {r2_score(y_true, y_predict)}')
    print(f'MSE: {mean_squared_error(y_true, y_predict)}')

    matrix = confusion_matrix(y_true, y_predict)
    if matrix.shape != (2, 2):
        raise ValueError(
            'print_metrics expects exactly two classes, '
            f'got a confusion matrix of shape {matrix.shape}'
        )
    tn, fp, fn, tp = matrix.ravel()

    def _ratio(numerator, denominator):
        # A zero denominator means the metric is undefined (e.g. precision when
        # nothing was predicted positive). Report NaN explicitly instead of
        # letting NumPy emit a RuntimeWarning for the division by zero.
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    print(f'Accuracy: {accuracy}')
    print(f'Recall: {recall}')
    print(f'Precision: {precision}')
    print()

# Hold out 40% of the rows for evaluation. The split is seeded (same seed as the
# model) so that Restart & Run All reproduces the same partition and metrics;
# previously the split was re-drawn at random on every run.
# NOTE(review): if the target classes are imbalanced, consider also passing
# stratify=cellDNA_Y so both partitions keep the same class ratio. Confirm
# against the data before enabling it.
X_train, X_test, y_train, y_test = train_test_split(
    cellDNA_X_All, cellDNA_Y, test_size=.4, random_state=4331
)

# RBF-kernel support vector classifier.
model = svm.SVC(gamma='auto', kernel='rbf', C=10, random_state=4331)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Report metrics on the held-out rows.
print_metrics(y_test, y_predict)