In [None]:
#---------------------------------------------------------------------------------#
# Graduate Program in Software                                                    #
# SEIS 763: Machine Learning                                                      #
# Group project                                                                   #
# Authors: Yann Mulonda -- David Vedvick -- Jeevanlal A M Nair -- Jacob Sevening  #
#---------------------------------------------------------------------------------#
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn import linear_model
import matplotlib.pyplot as plot
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score
from sklearn.linear_model import Lasso, LassoLarsCV
from sklearn.impute import SimpleImputer
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import svm

In [None]:
# Load the patient records from the CSV file (path is relative to the
# current working directory). No `header` argument is passed, so pandas
# applies its default header handling.
cellDNA = pd.read_csv(filepath_or_buffer="./healthcare_data.csv")
# Print the column names, dtypes and non-null counts of the loaded frame.
cellDNA.info()

In [None]:
# Descriptive statistics of the loaded frame.
cellDNA.describe()

In [None]:
# Preview the first 10 rows.
cellDNA.head(n=10)

In [None]:
# Histogram of each column that DataFrame.hist can plot, 20 bins per plot.
cellDNA.hist(figsize=(16, 8), bins=20)

In [None]:
# Y: the dependent variable (the "stroke" column).
cellDNA_Y = cellDNA["stroke"]
# X, numeric part.
cellDNA_X_numeric = cellDNA[
    ["age", "avg_glucose_level", "bmi"]
]
# X, non-numeric part.
cellDNA_X_non_numeric = cellDNA[
    ["gender", "work_type", "Residence_type", "smoking_status", "ever_married"]
]
# X, binary part.
cellDNA_X_binary = cellDNA[
    ["hypertension", "heart_disease"]
]
# Pairwise scatter plots of the numeric and binary columns plus the target.
attributes = [
    "age",
    "hypertension",
    "heart_disease",
    "avg_glucose_level",
    "bmi",
    "stroke",
]
scatter_matrix(frame=cellDNA[attributes], figsize=(16, 8))

In [None]:
# Standardize the numeric predictors; axis=0 scales each column independently.
cellDNA_X_numeric_standard = preprocessing.scale(X=cellDNA_X_numeric, axis=0)
print(cellDNA_X_numeric_standard)

In [None]:
# Keep only the object-dtype columns of the non-numeric predictors, as an
# independent copy, and preview the first rows.
cellDNA_X_non_numeric_stadard = (
    cellDNA_X_non_numeric
    .select_dtypes(include=[object])
    .copy()
)
cellDNA_X_non_numeric_stadard.head()

In [None]:
# One-hot encode the categorical predictors. drop_first=True removes the first
# level of every encoded column, leaving k-1 dummy columns for k levels.
# (The earlier `.copy()` assignment was removed: its result was overwritten
# by this call before ever being read.)
cellDNA_X_non_numeric_stadard_onehot = pd.get_dummies(
    cellDNA_X_non_numeric_stadard,
    columns=["gender", "work_type", "Residence_type", "smoking_status", "ever_married"],
    prefix=["gender", "work_type", "Residence_type", "smoking_status", "ever_married"],
    drop_first=True,
)
print(cellDNA_X_non_numeric_stadard_onehot)

In [None]:
# Dummy-encode the binary predictors; with drop_first=True the first level
# of each column is dropped.
cellDNA_X_binary_standard_onehot = pd.get_dummies(
    cellDNA_X_binary,
    columns=["hypertension", "heart_disease"],
    prefix=["hypertension", "heart_disease"],
    drop_first=True,
)
print(cellDNA_X_binary_standard_onehot)

In [None]:
# Wrap the standardized array back into a DataFrame. The index and column
# labels are taken from the frame that was scaled, so the column-wise concat
# performed later aligns rows on the same labels as the one-hot frames
# (instead of relying on a fresh RangeIndex) and the column names are not
# duplicated by hand.
cellDNA_X_numeric_standard_dataframe = pd.DataFrame(
    cellDNA_X_numeric_standard,
    index=cellDNA_X_numeric.index,
    columns=cellDNA_X_numeric.columns,
)
print(cellDNA_X_numeric_standard_dataframe.head())

In [None]:
# Join the three feature groups side by side (axis=1) into one frame:
# standardized numeric, one-hot categorical, and dummy-encoded binary columns.
cellDNA_X_All = pd.concat(
    objs=[
        cellDNA_X_numeric_standard_dataframe,
        cellDNA_X_non_numeric_stadard_onehot,
        cellDNA_X_binary_standard_onehot,
    ],
    axis=1,
)
print(cellDNA_X_All)

In [None]:
# Fill every NaN with the mean of its column using SimpleImputer.
# The imputer is fit on the whole feature set, and cellDNA_X_All is rebound
# to whatever fit_transform returns.
# NOTE(review): with scikit-learn's default output settings that is a NumPy
# array rather than a DataFrame - confirm.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
cellDNA_X_All = imp.fit_transform(X=cellDNA_X_All)


In [None]:
# Building models
# First split the data: 60% of the rows form the training set and the rest
# the test set. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cellDNA_X_All,
    cellDNA_Y,
    train_size=0.6,
    random_state=4331,
)
#---------------------------------------------------------------------#
# 1. let's create linear regression object
#---------------------------------------------------------------------#
def linearModel(y_value, x_value):
    """Fit a linear regression on the given data and print its parameters.

    Parameters
    ----------
    y_value : array-like
        Target values handed to ``LinearRegression.fit``.
    x_value : array-like, 2-D
        Feature matrix handed to ``LinearRegression.fit`` (rows are samples,
        columns are features).

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    regr_object = linear_model.LinearRegression()
    # Train the model on the data that was passed in.
    regr_object.fit(x_value, y_value)
    # The regression intercept (theta 0).
    yint = regr_object.intercept_
    # Report the column count of the matrix that was actually fitted.
    # (Previously this printed len() of the global feature set, i.e. its
    # number of rows, regardless of which data the function received.)
    n_columns = np.shape(x_value)[1]
    print("Number of columns", n_columns, sep=' = ')
    print("Y intercept: ", yint, sep=' \n')
    # One coefficient per feature column.
    coefficients = regr_object.coef_
    print("Number of coefficients: ", len(coefficients))
    print("Coefficients: ", coefficients, sep='\n')


In [None]:
# Fit and report the linear model on the training split.
linearModel(y_value=y_train, x_value=X_train)

In [None]:
# Begin the search with a box plot of the standardized numeric features.
# Plots are rendered inline in the notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
cellDNA_X_numeric_standard_dataframe.plot(kind="box", figsize=(18, 4))

In [None]:
#---------------------------------------------------------------------#
# 2. let's create a Lasso regression object (LassoLarsCV)
# considering cross validation - 10 folds
#---------------------------------------------------------------------#
def lassModel(y_value, x_value, eps=0.1, max_n_alphas=100, cv=10):
    """Fit a cross-validated Lasso (LassoLarsCV) and score it on the same data.

    Parameters
    ----------
    y_value : array-like
        Target values used both for fitting and for scoring.
    x_value : array-like, 2-D
        Feature matrix used both for fitting and for prediction.
    eps : float, default 0.1
        Forwarded to ``LassoLarsCV(eps=...)``.
        NOTE(review): scikit-learn documents ``eps`` as a machine-precision
        regularization term whose default is ``np.finfo(float).eps``; 0.1 is
        kept only to preserve the existing results - confirm it is intended.
    max_n_alphas : int, default 100
        Forwarded to ``LassoLarsCV(max_n_alphas=...)``.
    cv : int, default 10
        Forwarded to ``LassoLarsCV(cv=...)``.

    Returns
    -------
    dict
        ``yint_lasso`` (intercept), ``coefficients_lasso`` (coefficients),
        ``predictions`` (predictions for ``x_value``), ``RMSE`` and ``r2``.
        RMSE and r2 are in-sample: they compare ``y_value`` with predictions
        made on the very data the model was fitted on.
    """
    lasso_cv_model = LassoLarsCV(eps=eps, max_n_alphas=max_n_alphas, cv=cv)
    # Train the model on the data that was passed in.
    lasso_cv_model.fit(x_value, y_value)
    # Predict on the same rows that were used for fitting.
    predictions = lasso_cv_model.predict(x_value)
    return {
        "yint_lasso": lasso_cv_model.intercept_,
        "coefficients_lasso": lasso_cv_model.coef_,
        "predictions": predictions,
        "RMSE": np.sqrt(mean_squared_error(y_value, predictions)),
        "r2": r2_score(y_value, predictions),
    }

In [None]:
def svmModel(y_train, y_test, x_train, x_test):
    """Train an RBF-kernel SVC on the training data and score it on the test data.

    Parameters
    ----------
    y_train, x_train : array-like
        Labels and feature matrix used to fit the classifier.
    y_test, x_test : array-like
        Labels and feature matrix used for evaluation. Labels are expected to
        be 0/1, matching the ``class_weight`` keys.

    Returns
    -------
    dict
        ``matrix`` (2x2 confusion matrix ordered by labels 0, 1), ``RMSE``,
        ``r2``, ``accuracy``, ``recall`` and ``precision``. A ratio whose
        denominator is zero (e.g. precision when nothing is predicted
        positive) is reported as 0.0 instead of dividing by zero.
    """
    # Class 1 is weighted 95x relative to class 0.
    # NOTE(review): presumably chosen to offset class imbalance - confirm.
    model = svm.SVC(
        gamma='auto',
        kernel='rbf',
        C=10,
        class_weight={0: 1, 1: 95})
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)

    RMSE = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    # Pin the label order so the matrix is always 2x2; without it a test set
    # and prediction containing a single class yield a 1x1 matrix and the
    # four-way unpacking below fails.
    matrix = confusion_matrix(y_test, y_predict, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    def _ratio(numerator, denominator):
        # Guarded division for the metrics below.
        return numerator / denominator if denominator else 0.0

    model_result = {
        "matrix": matrix,
        "RMSE": RMSE,
        "r2": r2,
        "accuracy": _ratio(tp + tn, tp + tn + fp + fn),
        "recall": _ratio(tp, tp + fn),
        "precision": _ratio(tp, tp + fp),
    }

    return model_result

In [None]:
# Run the Lasso model function on the test split.
lassModel(y_value=y_test, x_value=X_test)

In [None]:
# Run the Lasso model function on the training split.
lassModel(y_value=y_train, x_value=X_train)

In [None]:
# Run the Lasso model function on the full data set.
lassModel(y_value=cellDNA_Y, x_value=cellDNA_X_All)

In [None]:
# Train the SVM on the training split and evaluate it on the test split.
svmModel(y_train=y_train, y_test=y_test, x_train=X_train, x_test=X_test)