In [1]:
import getpass
import psycopg2
import numpy as np
from sqlalchemy import create_engine
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [2]:
# Getting Data
username = input("What is your Postgres Username? (postgres by default)")

What is your Postgres Username? (postgres by default)postgres


In [3]:
# Getting Data
password = getpass.getpass(prompt= "What is your Postgres Password?")

What is your Postgres Password?········


In [4]:
# Getting Data
port = input("What is your Postgres Port number?")

What is your Postgres Port number?5432


In [5]:
# Getting Data
conn = f'postgresql://{username}:{password}@localhost:{port}/SanAntonio_Stroke_Pred'.format(username, password, port)

In [6]:
# Connect
engine = create_engine(conn)

In [15]:
# Read Data
medical_df = pd.read_sql_query('SELECT personal."Identifier", medical."Age", \
                           medical."Gender", personal."Work_Type", personal."Residence_Type", \
                           personal."Ever_Married", medical."Hypertension", medical."Heart_Disease",\
                           medical."Avg_Glucose_Lvl", medical."BMI", medical."Smoker", personal."Stroke"\
                                FROM personal\
                                INNER JOIN medical\
                                ON personal."Identifier" = medical."Identifier";', conn)

medical_df.head(10)

Unnamed: 0,Identifier,Age,Gender,Work_Type,Residence_Type,Ever_Married,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Smoker,Stroke
0,1,67,Male,Private,Urban,Yes,0,1,228.69,36.6,Former,1
1,2,61,Female,Self-employed,Rural,Yes,0,0,202.21,,Never,1
2,3,80,Male,Private,Rural,Yes,0,1,105.92,32.5,Never,1
3,4,49,Female,Private,Urban,Yes,0,0,171.23,34.4,Current,1
4,5,79,Female,Self-employed,Rural,Yes,1,0,174.12,24.0,Never,1
5,6,81,Male,Private,Urban,Yes,0,0,186.21,29.0,Former,1
6,7,74,Male,Private,Rural,Yes,1,1,70.09,27.4,Never,1
7,8,69,Female,Private,Urban,No,0,0,94.39,22.8,Never,1
8,9,59,Female,Private,Rural,Yes,0,0,76.15,,Unknown,1
9,10,78,Female,Private,Urban,Yes,0,0,58.57,24.2,Unknown,1


In [16]:
# Drop ID# column
medical_df.drop(columns=['Identifier'], inplace=True)
print(medical_df.shape)
medical_df.head()

(5109, 11)


Unnamed: 0,Age,Gender,Work_Type,Residence_Type,Ever_Married,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Smoker,Stroke
0,67,Male,Private,Urban,Yes,0,1,228.69,36.6,Former,1
1,61,Female,Self-employed,Rural,Yes,0,0,202.21,,Never,1
2,80,Male,Private,Rural,Yes,0,1,105.92,32.5,Never,1
3,49,Female,Private,Urban,Yes,0,0,171.23,34.4,Current,1
4,79,Female,Self-employed,Rural,Yes,1,0,174.12,24.0,Never,1


In [18]:
# Create array to store diabetes status based on glucose level
glucose_status = []

# Add diabetes status to array based on glucose level with for loop
for g in medical_df['Avg_Glucose_Lvl']:
    if g > 125.0:
        glucose_status.append('Diabetic')
    if g > 99.0 and g <= 125.0:
        glucose_status.append('Prediabetic')
    if g <= 99.0:
        glucose_status.append('Normal')
        
# Ensure array is equivalent in length to medical_df = 5109 rows
print(len(glucose_status))
glucose_status

5109


['Diabetic',
 'Diabetic',
 'Prediabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Prediabetic',
 'Prediabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Normal',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Prediabetic',
 'Prediabetic',
 'Prediabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Normal',
 'Diabetic',
 'Diabetic',
 'Normal',
 'Normal',
 'Diabetic',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Diabetic',
 'Normal',
 'Diabetic',
 'Diabetic',
 'Prediabetic',
 'Prediabetic',
 'Prediabetic',
 'Normal',
 'Normal',
 'Normal',
 'Diabetic',
 'Prediabetic',
 'Diabetic',
 'Diabetic',
 'Normal',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Diabetic',
 'Normal',
 'Normal',
 'Normal',
 'Prediabetic',
 'Prediabetic',
 'Diabetic',
 'Prediabetic',
 'Normal',
 'Normal',
 'Normal',
 'Diabetic',
 'Prediabetic',
 'Diabetic',
 'Normal',
 'Normal',
 'Diabetic',
 'Prediabet

In [22]:
# Put array into a df
glucose_status_df = pd.DataFrame(glucose_status, columns=['Glucose_Status'])
glucose_status_df.head(10)

Unnamed: 0,Glucose_Status
0,Diabetic
1,Diabetic
2,Prediabetic
3,Diabetic
4,Diabetic
5,Diabetic
6,Normal
7,Normal
8,Normal
9,Normal


In [23]:
# Ensure no null values 
glucose_status_df.isnull().sum()

Glucose_Status    0
dtype: int64

In [24]:
# Add glucose_status diabetes values to medical_df
medical_df = medical_df.merge(glucose_status_df, left_index=True, right_index=True)
medical_df.head(10)

Unnamed: 0,Age,Gender,Work_Type,Residence_Type,Ever_Married,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Smoker,Stroke,Glucose_Status
0,67,Male,Private,Urban,Yes,0,1,228.69,36.6,Former,1,Diabetic
1,61,Female,Self-employed,Rural,Yes,0,0,202.21,,Never,1,Diabetic
2,80,Male,Private,Rural,Yes,0,1,105.92,32.5,Never,1,Prediabetic
3,49,Female,Private,Urban,Yes,0,0,171.23,34.4,Current,1,Diabetic
4,79,Female,Self-employed,Rural,Yes,1,0,174.12,24.0,Never,1,Diabetic
5,81,Male,Private,Urban,Yes,0,0,186.21,29.0,Former,1,Diabetic
6,74,Male,Private,Rural,Yes,1,1,70.09,27.4,Never,1,Normal
7,69,Female,Private,Urban,No,0,0,94.39,22.8,Never,1,Normal
8,59,Female,Private,Rural,Yes,0,0,76.15,,Unknown,1,Normal
9,78,Female,Private,Urban,Yes,0,0,58.57,24.2,Unknown,1,Normal


In [25]:
# Get a list of categorical columns
categorical_columns = medical_df.dtypes[medical_df.dtypes=='object'].index.tolist()
categorical_columns

['Gender',
 'Work_Type',
 'Residence_Type',
 'Ever_Married',
 'Smoker',
 'Glucose_Status']

In [26]:
# Create OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit & transform OneHotEncoder using categorical columns
encode_df = pd.DataFrame(enc.fit_transform(medical_df[categorical_columns]))

# Add column names
encode_df.columns = enc.get_feature_names(categorical_columns)
print(encode_df.shape)
encode_df.head(10)

(5109, 18)




Unnamed: 0,Gender_Female,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Rural,Residence_Type_Urban,Ever_Married_No,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown,Glucose_Status_Diabetic,Glucose_Status_Normal,Glucose_Status_Prediabetic
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [27]:
# Get encode_df info
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Gender_Female               5109 non-null   float64
 1   Gender_Male                 5109 non-null   float64
 2   Work_Type_Govt_job          5109 non-null   float64
 3   Work_Type_Never_worked      5109 non-null   float64
 4   Work_Type_Private           5109 non-null   float64
 5   Work_Type_Self-employed     5109 non-null   float64
 6   Work_Type_children          5109 non-null   float64
 7   Residence_Type_Rural        5109 non-null   float64
 8   Residence_Type_Urban        5109 non-null   float64
 9   Ever_Married_No             5109 non-null   float64
 10  Ever_Married_Yes            5109 non-null   float64
 11  Smoker_Current              5109 non-null   float64
 12  Smoker_Former               5109 non-null   float64
 13  Smoker_Never                5109 

In [28]:
# Drop redundant columns
encode_df.drop(columns=['Gender_Female', 'Ever_Married_No', 'Residence_Type_Rural'], inplace=True)
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Gender_Male                 5109 non-null   float64
 1   Work_Type_Govt_job          5109 non-null   float64
 2   Work_Type_Never_worked      5109 non-null   float64
 3   Work_Type_Private           5109 non-null   float64
 4   Work_Type_Self-employed     5109 non-null   float64
 5   Work_Type_children          5109 non-null   float64
 6   Residence_Type_Urban        5109 non-null   float64
 7   Ever_Married_Yes            5109 non-null   float64
 8   Smoker_Current              5109 non-null   float64
 9   Smoker_Former               5109 non-null   float64
 10  Smoker_Never                5109 non-null   float64
 11  Smoker_Unknown              5109 non-null   float64
 12  Glucose_Status_Diabetic     5109 non-null   float64
 13  Glucose_Status_Normal       5109 

In [29]:
# Merge encoded df with medical df
medical_df = medical_df.merge(encode_df, left_index=True, right_index=True).drop(categorical_columns, axis=1)
print(medical_df.shape)
medical_df.head(10)

(5109, 21)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,...,Work_Type_children,Residence_Type_Urban,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown,Glucose_Status_Diabetic,Glucose_Status_Normal,Glucose_Status_Prediabetic
0,67,0,1,228.69,36.6,1,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,61,0,0,202.21,,1,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,80,0,1,105.92,32.5,1,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,49,0,0,171.23,34.4,1,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,79,1,0,174.12,24.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,81,0,0,186.21,29.0,1,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,74,1,1,70.09,27.4,1,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,69,0,0,94.39,22.8,1,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,59,0,0,76.15,,1,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,78,0,0,58.57,24.2,1,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [30]:
# Create SimpleImputer instance to replace missing BMI feature values with median BMI
imputer = SimpleImputer(strategy='mean')

# Fit SimpleImputer & transform data
med_transformed = imputer.fit_transform(medical_df)

In [31]:
# Add SimpleImputer outcome to dataframe
med_df_transformed = pd.DataFrame(med_transformed, columns=medical_df.columns)
print(med_df_transformed.shape)
med_df_transformed.head(10)

(5109, 21)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,...,Work_Type_children,Residence_Type_Urban,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown,Glucose_Status_Diabetic,Glucose_Status_Normal,Glucose_Status_Prediabetic
0,67.0,0.0,1.0,228.69,36.6,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,61.0,0.0,0.0,202.21,28.89456,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,80.0,0.0,1.0,105.92,32.5,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,49.0,0.0,0.0,171.23,34.4,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,79.0,1.0,0.0,174.12,24.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,81.0,0.0,0.0,186.21,29.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,74.0,1.0,1.0,70.09,27.4,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,69.0,0.0,0.0,94.39,22.8,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,59.0,0.0,0.0,76.15,28.89456,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,78.0,0.0,0.0,58.57,24.2,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [32]:
med_df_transformed.isnull().sum()

Age                           0
Hypertension                  0
Heart_Disease                 0
Avg_Glucose_Lvl               0
BMI                           0
Stroke                        0
Gender_Male                   0
Work_Type_Govt_job            0
Work_Type_Never_worked        0
Work_Type_Private             0
Work_Type_Self-employed       0
Work_Type_children            0
Residence_Type_Urban          0
Ever_Married_Yes              0
Smoker_Current                0
Smoker_Former                 0
Smoker_Never                  0
Smoker_Unknown                0
Glucose_Status_Diabetic       0
Glucose_Status_Normal         0
Glucose_Status_Prediabetic    0
dtype: int64

In [33]:
# Create feature & target datasets
X = med_df_transformed.drop(columns=['Stroke'])
y = med_df_transformed['Stroke']
print(X.shape)
print(y.shape)

(5109, 20)
(5109,)


In [34]:
# Create empty arrays to store ML results
recalls = []
accuracies = []
precisions = []

# This will take a while. Feel free to adjust range to sample code. Feel free to review exported spreadsheets in github.
for i in range(1000):
    
    # Create training + testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    
    # Oversample positive stroke cases in training set
    ros = RandomOverSampler()
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    # Scale feature data
    scaler = StandardScaler()
    scaler.fit(X_train_resampled)
    X_train_scaled = scaler.transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)
    
    # Create SVC model
    svc_model = SVC(kernel='linear')
    svc_model = svc_model.fit(X_train_scaled, y_train_resampled)
    
    # Evaluate SVC model
    y_pred = svc_model.predict(X_test_scaled)
    
    # Add recall results to list
    recalls.append(recall_score(y_test, y_pred))
    
    # Add precision results to list
    precisions.append(precision_score(y_test, y_pred))
    
    # Add accuracy results to list
    accuracies.append(accuracy_score(y_test, y_pred))
    
    print("Current progress:", i)

Current progress: 0
Current progress: 1
Current progress: 2
Current progress: 3
Current progress: 4
Current progress: 5
Current progress: 6
Current progress: 7
Current progress: 8
Current progress: 9
Current progress: 10
Current progress: 11
Current progress: 12
Current progress: 13
Current progress: 14
Current progress: 15
Current progress: 16
Current progress: 17
Current progress: 18
Current progress: 19
Current progress: 20
Current progress: 21
Current progress: 22
Current progress: 23
Current progress: 24
Current progress: 25
Current progress: 26
Current progress: 27
Current progress: 28
Current progress: 29
Current progress: 30
Current progress: 31
Current progress: 32
Current progress: 33
Current progress: 34
Current progress: 35
Current progress: 36
Current progress: 37
Current progress: 38
Current progress: 39
Current progress: 40
Current progress: 41
Current progress: 42
Current progress: 43
Current progress: 44
Current progress: 45
Current progress: 46
Current progress: 47
Cu

Current progress: 378
Current progress: 379
Current progress: 380
Current progress: 381
Current progress: 382
Current progress: 383
Current progress: 384
Current progress: 385
Current progress: 386
Current progress: 387
Current progress: 388
Current progress: 389
Current progress: 390
Current progress: 391
Current progress: 392
Current progress: 393
Current progress: 394
Current progress: 395
Current progress: 396
Current progress: 397
Current progress: 398
Current progress: 399
Current progress: 400
Current progress: 401
Current progress: 402
Current progress: 403
Current progress: 404
Current progress: 405
Current progress: 406
Current progress: 407
Current progress: 408
Current progress: 409
Current progress: 410
Current progress: 411
Current progress: 412
Current progress: 413
Current progress: 414
Current progress: 415
Current progress: 416
Current progress: 417
Current progress: 418
Current progress: 419
Current progress: 420
Current progress: 421
Current progress: 422
Current pr

Current progress: 751
Current progress: 752
Current progress: 753
Current progress: 754
Current progress: 755
Current progress: 756
Current progress: 757
Current progress: 758
Current progress: 759
Current progress: 760
Current progress: 761
Current progress: 762
Current progress: 763
Current progress: 764
Current progress: 765
Current progress: 766
Current progress: 767
Current progress: 768
Current progress: 769
Current progress: 770
Current progress: 771
Current progress: 772
Current progress: 773
Current progress: 774
Current progress: 775
Current progress: 776
Current progress: 777
Current progress: 778
Current progress: 779
Current progress: 780
Current progress: 781
Current progress: 782
Current progress: 783
Current progress: 784
Current progress: 785
Current progress: 786
Current progress: 787
Current progress: 788
Current progress: 789
Current progress: 790
Current progress: 791
Current progress: 792
Current progress: 793
Current progress: 794
Current progress: 795
Current pr

In [35]:
recalls

[0.94,
 0.84,
 0.88,
 0.8,
 0.72,
 0.88,
 0.78,
 0.88,
 0.74,
 0.84,
 0.8,
 0.82,
 0.82,
 0.78,
 0.7,
 0.72,
 0.88,
 0.8,
 0.8,
 0.8,
 0.76,
 0.8,
 0.92,
 0.86,
 0.84,
 0.68,
 0.7,
 0.84,
 0.8,
 0.82,
 0.7,
 0.86,
 0.84,
 0.72,
 0.9,
 0.88,
 0.82,
 0.68,
 0.88,
 0.8,
 0.76,
 0.82,
 0.84,
 0.88,
 0.92,
 0.86,
 0.7,
 0.82,
 0.8,
 0.86,
 0.8,
 0.88,
 0.9,
 0.86,
 0.9,
 0.8,
 0.76,
 0.8,
 0.84,
 0.86,
 0.86,
 0.82,
 0.82,
 0.86,
 0.84,
 0.78,
 0.9,
 0.84,
 0.86,
 0.76,
 0.92,
 0.78,
 0.92,
 0.84,
 0.84,
 0.84,
 0.86,
 0.8,
 0.8,
 0.88,
 0.7,
 0.76,
 0.86,
 0.86,
 0.78,
 0.8,
 0.82,
 0.74,
 0.78,
 0.84,
 0.76,
 0.84,
 0.78,
 0.72,
 0.82,
 0.76,
 0.8,
 0.78,
 0.84,
 0.82,
 0.82,
 0.78,
 0.8,
 0.72,
 0.94,
 0.82,
 0.78,
 0.84,
 0.82,
 0.82,
 0.78,
 0.74,
 0.78,
 0.78,
 0.76,
 0.9,
 0.74,
 0.72,
 0.8,
 0.74,
 0.72,
 0.74,
 0.88,
 0.72,
 0.82,
 0.86,
 0.78,
 0.78,
 0.8,
 0.86,
 0.76,
 0.76,
 0.88,
 0.88,
 0.86,
 0.8,
 0.8,
 0.9,
 0.84,
 0.8,
 0.76,
 0.76,
 0.76,
 0.74,
 0.8,
 0.86,
 0.84,
 0.84

In [36]:
recalls_df = pd.DataFrame(recalls, columns=['Recall'])
print(recalls_df.shape)
recalls_df.head(10)

(1000, 1)


Unnamed: 0,Recall
0,0.94
1,0.84
2,0.88
3,0.8
4,0.72
5,0.88
6,0.78
7,0.88
8,0.74
9,0.84


In [37]:
accuracies

[0.7426614481409002,
 0.7123287671232876,
 0.7338551859099804,
 0.7318982387475538,
 0.7592954990215264,
 0.726027397260274,
 0.7436399217221135,
 0.7191780821917808,
 0.7123287671232876,
 0.726027397260274,
 0.7250489236790607,
 0.7142857142857143,
 0.7152641878669276,
 0.7377690802348337,
 0.7485322896281801,
 0.7279843444227005,
 0.7270058708414873,
 0.7279843444227005,
 0.7465753424657534,
 0.7524461839530333,
 0.7240704500978473,
 0.7152641878669276,
 0.7407045009784736,
 0.7455968688845401,
 0.7270058708414873,
 0.735812133072407,
 0.7152641878669276,
 0.7328767123287672,
 0.7397260273972602,
 0.7201565557729941,
 0.7318982387475538,
 0.7416829745596869,
 0.7338551859099804,
 0.7270058708414873,
 0.7318982387475538,
 0.7446183953033269,
 0.7407045009784736,
 0.7485322896281801,
 0.7162426614481409,
 0.735812133072407,
 0.7299412915851272,
 0.7240704500978473,
 0.7113502935420744,
 0.7221135029354208,
 0.7142857142857143,
 0.7270058708414873,
 0.7250489236790607,
 0.71232876712328

In [38]:
accuracies_df = pd.DataFrame(accuracies, columns=['Accuracy'])
print(accuracies_df.shape)
accuracies_df.head(10)

(1000, 1)


Unnamed: 0,Accuracy
0,0.742661
1,0.712329
2,0.733855
3,0.731898
4,0.759295
5,0.726027
6,0.74364
7,0.719178
8,0.712329
9,0.726027


In [39]:
precisions

[0.15309446254071662,
 0.12804878048780488,
 0.14193548387096774,
 0.13157894736842105,
 0.13432835820895522,
 0.13836477987421383,
 0.13448275862068965,
 0.13538461538461538,
 0.11635220125786164,
 0.1337579617834395,
 0.12861736334405144,
 0.12654320987654322,
 0.12693498452012383,
 0.13175675675675674,
 0.1263537906137184,
 0.12,
 0.138801261829653,
 0.12987012987012986,
 0.1384083044982699,
 0.1413427561837456,
 0.12337662337662338,
 0.12461059190031153,
 0.1498371335504886,
 0.14527027027027026,
 0.134185303514377,
 0.11805555555555555,
 0.11254019292604502,
 0.13680781758957655,
 0.13513513513513514,
 0.1289308176100629,
 0.11904761904761904,
 0.14333333333333334,
 0.13725490196078433,
 0.11960132890365449,
 0.14331210191082802,
 0.14715719063545152,
 0.13804713804713806,
 0.12363636363636364,
 0.13414634146341464,
 0.13333333333333333,
 0.12582781456953643,
 0.1305732484076433,
 0.1276595744680851,
 0.13664596273291926,
 0.1377245508982036,
 0.1365079365079365,
 0.11627906976744

In [40]:
precisions_df = pd.DataFrame(precisions, columns=['Precision'])
print(precisions_df.shape)
precisions_df.head(10)

(1000, 1)


Unnamed: 0,Precision
0,0.153094
1,0.128049
2,0.141935
3,0.131579
4,0.134328
5,0.138365
6,0.134483
7,0.135385
8,0.116352
9,0.133758


In [41]:
results_df = accuracies_df.merge(precisions_df, left_index=True, right_index=True)
print(results_df.shape)
results_df.head(10)

(1000, 2)


Unnamed: 0,Accuracy,Precision
0,0.742661,0.153094
1,0.712329,0.128049
2,0.733855,0.141935
3,0.731898,0.131579
4,0.759295,0.134328
5,0.726027,0.138365
6,0.74364,0.134483
7,0.719178,0.135385
8,0.712329,0.116352
9,0.726027,0.133758


In [42]:
results_df = results_df.merge(recalls_df, left_index=True, right_index=True)
print(results_df.shape)
results_df.head(10)

(1000, 3)


Unnamed: 0,Accuracy,Precision,Recall
0,0.742661,0.153094,0.94
1,0.712329,0.128049,0.84
2,0.733855,0.141935,0.88
3,0.731898,0.131579,0.8
4,0.759295,0.134328,0.72
5,0.726027,0.138365,0.88
6,0.74364,0.134483,0.78
7,0.719178,0.135385,0.88
8,0.712329,0.116352,0.74
9,0.726027,0.133758,0.84


In [43]:
print("Recall mean: " + str(results_df['Recall'].mean()))
print("Recall mode: " + str(results_df['Recall'].mode()))
print("Recall median: " + str(results_df['Recall'].median()))
print("Recall max: " + str(results_df['Recall'].max()))
print("Recall min: " + str(results_df['Recall'].min()))
print("Recall standard deviation: " + str(results_df['Recall'].std()))

Recall mean: 0.8097799999999998
Recall mode: 0    0.8
dtype: float64
Recall median: 0.81
Recall max: 0.96
Recall min: 0.62
Recall standard deviation: 0.05430009300268359


In [44]:
print("Accuracy mean: " + str(results_df['Accuracy'].mean()))
print("Accuracy mode: " + str(results_df['Accuracy'].mode()))
print("Accuracy median: " + str(results_df['Accuracy'].median()))
print("Accuracy max: " + str(results_df['Accuracy'].max()))
print("Accuracy min: " + str(results_df['Accuracy'].min()))
print("Accuracy standard deviation: " + str(results_df['Accuracy'].std()))

Accuracy mean: 0.7281467710371816
Accuracy mode: 0    0.721135
dtype: float64
Accuracy median: 0.7279843444227005
Accuracy max: 0.7681017612524462
Accuracy min: 0.6868884540117417
Accuracy standard deviation: 0.013427165984653969


In [45]:
print("Precision mean: " + str(results_df['Precision'].mean()))
print("Precision mode: " + str(results_df['Precision'].mode()))
print("Precision median: " + str(results_df['Precision'].median()))
print("Precision max: " + str(results_df['Precision'].max()))
print("Precision min: " + str(results_df['Precision'].min()))
print("Precision standard deviation: " + str(results_df['Precision'].std()))

Precision mean: 0.13123779394539875
Precision mode: 0    0.126984
1    0.133333
dtype: float64
Precision median: 0.1310344827586207
Precision max: 0.15436241610738255
Precision min: 0.10240963855421686
Precision standard deviation: 0.008288622689261126


In [46]:
results_df.isnull().sum().sum()

0

In [47]:
len(results_df)

1000

In [48]:
filepath = ('ML_result_files/SVC_AddGlucoseStatus_SimpleImputer(mean)_RandomOverSampler.csv')
results_df.to_csv(filepath, index=False)