# Cleaning and Preprocessing Data for Machine Learning

In [1]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the heart-failure clinical records CSV from the local ./Data directory.
heart_df = pd.read_csv('./Data/heart_failure_clinical_records_dataset.csv')
# Bare last expression so the notebook renders a preview of the frame.
heart_df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [3]:
# Show the column labels so the feature/target split can name them explicitly.
heart_df.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [4]:
# Separate the twelve predictor columns from the DEATH_EVENT target column.
X = heart_df.loc[:, [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]]
y = heart_df.loc[:, 'DEATH_EVENT']

# Quick shape check: both must have the same number of rows.
print(X.shape, y.shape)


(299, 12) (299,)


## Putting it all together

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. random_state is fixed so the same
# rows land in each split on every run; no test_size is passed, so the
# library's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with the library's default settings.
classifier = LogisticRegression()

# NOTE(review): the features are passed in unscaled and the first cell silences
# all warnings, so a solver convergence warning would go unseen here -- confirm
# the fit converges (e.g. by temporarily re-enabling warnings).
# fit() returns the estimator, which also serves as the cell's displayed output.
classifier.fit(X_train, y_train)

LogisticRegression()

In [7]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted model on the rows it was trained on, then on the held-out rows.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

# Keep the test-set predictions; later cells compare them against y_test.
predictions = classifier.predict(X=X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

# Sanity check: there should be one prediction per test label.
print(len(predictions), len(y_test), sep="\n")


Training Data Score: 0.8303571428571429
Testing Data Score: 0.7866666666666666
First 10 Predictions:   [0 0 0 1 0 0 1 0 1 0]
First 10 Actual labels: [0, 0, 1, 1, 0, 0, 1, 0, 1, 0]
75
75


In [8]:
# Manual accuracy check: percentage of test rows whose prediction matches the label.
list_y_test = list(y_test)

# One boolean flag per test row (True = prediction was correct). zip() walks the
# predictions and labels together, so the test-set size is not hard-coded.
bool_list = [bool(guess == actual) for guess, actual in zip(predictions, list_y_test)]

# Number of misclassified rows.
count = bool_list.count(False)

# Accuracy as a percentage: 100 minus the error rate.
efficiency = 100 - (count / len(predictions) * 100)
print(efficiency)
    

78.66666666666666


In [9]:
# A single hypothetical patient, stored column-wise (one one-element list per
# feature) so it can be turned directly into a one-row DataFrame.
new_data = dict(
    age=[70],
    anaemia=[0],
    creatinine_phosphokinase=[120],
    diabetes=[0],
    ejection_fraction=[18],
    high_blood_pressure=[0],
    platelets=[280000],
    serum_creatinine=[1.2],
    serum_sodium=[138],
    sex=[1],
    smoking=[0],
    time=[141],
)


In [10]:
# Turn the column-wise dict into a one-row frame (each key becomes a column).
new_heart_df = pd.DataFrame.from_dict(new_data)
new_heart_df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,70,0,120,0,18,0,280000,1.2,138,1,0,141


In [11]:
# Select the predictor columns in the same order used when fitting the model.
X_new_data = new_heart_df.loc[:, [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]]

In [12]:
# Predict the outcome class for the single hypothetical patient.
# NOTE(review): a later cell refits this same `classifier` object on a 6-column
# feature set, so re-running this cell after that point would no longer use the
# 12-feature model -- run the notebook top to bottom.
new_data_prediction = classifier.predict(X_new_data)
new_data_prediction


array([0], dtype=int64)

In [13]:
from sklearn.metrics import classification_report

In [14]:
# Text summary of per-class precision/recall/F1 on the held-out rows.
print(classification_report(y_true=list_y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.76      0.93      0.84        44
           1       0.86      0.58      0.69        31

    accuracy                           0.79        75
   macro avg       0.81      0.76      0.76        75
weighted avg       0.80      0.79      0.78        75



In [15]:
# Same report, but as a nested dict so it can be loaded into a DataFrame.
report = classification_report(
    y_true=list_y_test,
    y_pred=predictions,
    output_dict=True,
)

In [16]:
# The dict is keyed by class/average first, so flip it to get one row per
# class/average and one column per metric.
report_df = pd.DataFrame(data=report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.759259,0.931818,0.836735,44.0
1,0.857143,0.580645,0.692308,31.0
accuracy,0.786667,0.786667,0.786667,0.786667
macro avg,0.808201,0.756232,0.764521,75.0
weighted avg,0.799718,0.786667,0.777038,75.0


In [17]:
# Replace the class labels '0'/'1' in the index with readable names; the
# remaining index labels are left as they are.
report_df = report_df.rename({'0': 'living', '1': 'deceased'}, axis='index')
report_df

Unnamed: 0,precision,recall,f1-score,support
living,0.759259,0.931818,0.836735,44.0
deceased,0.857143,0.580645,0.692308,31.0
accuracy,0.786667,0.786667,0.786667,0.786667
macro avg,0.808201,0.756232,0.764521,75.0
weighted avg,0.799718,0.786667,0.777038,75.0


In [18]:
# Round every metric to two decimal places for the exported report.
report_df = report_df.round(decimals=2)

In [26]:
# Persist the full-feature report; the index is written under an "Index" header.
report_df.to_csv(
    path_or_buf="./Data/Report.csv",
    index_label="Index",
    index=True,
)

#### Using only patient-known factors

In [20]:
# Per the section heading, keep only patient-known factors by dropping the six
# columns listed below from the full frame.
patient_known_data = heart_df.drop(
    columns=[
        "time",
        "creatinine_phosphokinase",
        "ejection_fraction",
        "platelets",
        "serum_creatinine",
        "serum_sodium",
    ]
)

# NOTE: DEATH_EVENT is not dropped above, so it is still among these labels.
feature_names = patient_known_data.columns
patient_known_data.head(n=5)

Unnamed: 0,age,anaemia,diabetes,high_blood_pressure,sex,smoking,DEATH_EVENT
0,75.0,0,0,1,1,0,1
1,55.0,0,0,0,1,0,1
2,65.0,0,0,0,1,1,1
3,50.0,1,0,0,1,0,1
4,65.0,1,1,0,0,0,1


In [21]:
# Rebuild the feature matrix from the six reduced columns; the target is unchanged.
X = heart_df.loc[:, [
    'age',
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]]
y = heart_df.loc[:, 'DEATH_EVENT']

# Quick shape check: both must have the same number of rows.
print(X.shape, y.shape)

(299, 6) (299,)


In [22]:
# Re-split using the reduced feature set. This rebinds X_train/X_test/y_train/
# y_test from the earlier full-feature split; random_state matches that split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [23]:
# Refit the existing estimator in place on the reduced features.
# NOTE(review): this replaces the earlier 12-feature fit held by `classifier`;
# use a separate estimator if both models need to stay available.
classifier.fit(X_train, y_train)

LogisticRegression()

In [24]:
# Predictions of the reduced-feature model on its held-out rows.
predictions2 = classifier.predict(X=X_test)

In [28]:
# Plain list of the held-out labels, in test-set order.
list_y_test_2 = [label for label in y_test]

In [29]:
# Per-class metrics for the reduced-feature model, as a nested dict.
report2 = classification_report(
    y_true=list_y_test_2,
    y_pred=predictions2,
    output_dict=True,
)

In [30]:
# Flip the dict-derived frame so each class/average is a row and each metric a column.
report2_df = pd.DataFrame(data=report2).T
report2_df

Unnamed: 0,precision,recall,f1-score,support
0,0.605634,0.977273,0.747826,44.0
1,0.75,0.096774,0.171429,31.0
accuracy,0.613333,0.613333,0.613333,0.613333
macro avg,0.677817,0.537023,0.459627,75.0
weighted avg,0.665305,0.613333,0.509582,75.0


In [31]:
# Replace the class labels '0'/'1' in the index with readable names.
report2_df = report2_df.rename({'0': 'living', '1': 'deceased'}, axis='index')

In [32]:
# Round every metric to two decimal places for the exported report.
report2_df = report2_df.round(decimals=2)

In [33]:
# Persist the reduced-feature report; the index is written under an "Index" header.
report2_df.to_csv(
    path_or_buf="./Data/Report2.csv",
    index_label="Index",
    index=True,
)