In [34]:
import datetime
import importlib
import time
from statistics import mean

import evaluations
import helpers
from linear_regressor import LinearRegressorModel
from table import Table

In [35]:
# Load the raw records from disk and create the (not yet fitted) regressor.
stroke_data_table = Table()
stroke_data_table.load_file("full_data.csv")
lin_reg = LinearRegressorModel()

# Columns handed to the model as attributes; the last table column is read
# separately below as the class label.
attr_headers = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
]

# Each categorical column is read by position, passed through its helper, and
# written back by name. The helpers presumably map the text categories to
# numeric codes (see helpers.py) -- TODO confirm.
gender_raw = stroke_data_table.get_column(0)
new_gender = helpers.discretize_gender(gender_raw)
helpers.replace_col_vals(stroke_data_table, new_gender, "gender")

married_raw = stroke_data_table.get_column(4)
new_married = helpers.discretize_married(married_raw)
helpers.replace_col_vals(stroke_data_table, new_married, "ever_married")

work_type_raw = stroke_data_table.get_column(5)
new_work_type = helpers.discretize_work_type(work_type_raw)
helpers.replace_col_vals(stroke_data_table, new_work_type, "work_type")

residence_type_raw = stroke_data_table.get_column(6)
new_residence_type = helpers.discretize_residence_type(residence_type_raw)
helpers.replace_col_vals(stroke_data_table, new_residence_type, "Residence_type")

# Second-to-last column, i.e. the one just before the class label.
smoking_status_raw = stroke_data_table.get_column(-2)
new_smoking_status = helpers.discretize_smoking_status(smoking_status_raw)
helpers.replace_col_vals(stroke_data_table, new_smoking_status, "smoking_status")

# Convert every column to numeric, then separate attributes from labels.
all_column_names = stroke_data_table.column_names
stroke_data_table.convert_to_numeric(all_column_names)
attr_data = stroke_data_table.make_sub_table(stroke_data_table, attr_headers)
class_labels = stroke_data_table.get_column(-1)

# Split, fit and predict, timing each step.
#
# Intervals are measured with time.perf_counter(), a monotonic high-resolution
# clock meant for short durations; wall-clock datetime.now() can jump if the
# system clock is adjusted. Each elapsed value is wrapped in a timedelta so it
# prints in the familiar H:MM:SS.ffffff form.

# Hold out 33% of the rows for testing.
# NOTE(review): no seed is passed here, so the split (and every number below)
# may differ between runs -- check whether train_test_split accepts one.
t0 = time.perf_counter()
X_train, X_test, y_train, y_test = evaluations.train_test_split(attr_data, class_labels, test_size=.33)
t1 = datetime.timedelta(seconds=time.perf_counter() - t0)
print("Time elapsed for train_test_split method:", t1)

print("Length of train set:", len(y_train))
print("Length of test set:", len(y_test))

# Fit the regressor on the training partition.
t0 = time.perf_counter()
lin_reg.fit(X_train, y_train)
t1 = datetime.timedelta(seconds=time.perf_counter() - t0)
print("Time elapsed for fit method:", t1)

print("Slope:", lin_reg.slope)
print("Intercept:", lin_reg.intercept)

# Continuous predictions for the held-out rows.
t0 = time.perf_counter()
predictions = lin_reg.predict(X_test)
t1 = datetime.timedelta(seconds=time.perf_counter() - t0)
print("Time elapsed for predict method:", t1)

# Mean prediction; the binarisation step further down uses the same value as
# its decision threshold.
print(mean(predictions))

# Turn the continuous predictions into 0/1 labels, using the mean prediction
# as the decision threshold. The threshold is computed once up front so the
# pass over the predictions stays linear in their number.
threshold = mean(predictions)

# Strictly below the threshold -> 0; equal to or above it -> 1.
bin_preds = [0 if pred < threshold else 1 for pred in predictions]


Time elapsed for train_test_split method: 0:00:00.003458
Length of train set: 3337
Length of test set: 1644
Time elapsed for fit method: 0:00:00.002910
Slope: -0.01946017565783004
Intercept: 0.059583632447953734
Time elapsed for predict method: 0:00:00.000701
0.04818451495495475


***Continuous Prediction Results***

In [36]:
# Show the regressor's raw (continuous) output for the test rows.
# NOTE: this prints the entire list; slice it (e.g. predictions[:10]) if only
# a short preview is wanted.
continuous_header = "Here are the continuous predictions for the dataset based on the linear regressor\n"
print(continuous_header, predictions)

Here are the continuous predictions for the dataset based on the linear regressor
 [0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.04012345679012369, 0.059583632447953734, 0.059583632447953734, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.059583632447953734, 0.04012345679012369, 0.059583632447953734, 0.059583632447953734, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.04012345679012369, 0.059583632447953734, 0.04012345679012369, 0.04012345679012369, 0.05958363244795373

***Binary Classification Results***

In [37]:
# Report the thresholded (0/1) predictions and score them against the true
# test labels.
print("Here are the binary predictions based on the threshold which I determined to be the average value in the list\n", bin_preds)

# Scalar metrics, each printed with a uniform "<name>:" label.
accuracy = evaluations.compute_accuracy(y_test, bin_preds)
print("accuracy:", accuracy)
precision = evaluations.compute_precision(y_test, bin_preds)
print("precision:", precision)
recall = evaluations.compute_recall(y_test, bin_preds)
print("recall:", recall)
f1 = evaluations.compute_f1(y_test, bin_preds)
print("f1:", f1)

# Confusion matrix over the two class labels. A copy of `labels` is passed to
# each call so the original list is left untouched whatever the helpers do
# with their argument.
labels = [0, 1]
confusion_matrix = evaluations.confusion_matrix(y_test, bin_preds, labels.copy())
evaluations.print_confusion_matrix("Linear Regression Model", labels.copy(), confusion_matrix)

Here are the binary predictions based on the threshold which I determined to be the average value in the list
 [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 