In [1]:
import pandas as pd
import numpy as np
import torch as th
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [2]:
df = pd.read_csv('train_values.csv')
df_label = pd.read_csv('train_labels.csv')
df_test = pd.read_csv('test_values.csv')

## Data Assessment

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null int64
thal                                    180 non-null object
resting_blood_pressure                  180 non-null int64
chest_pain_type                         180 non-null int64
num_major_vessels                       180 non-null int64
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null int64
resting_ekg_results                     180 non-null int64
serum_cholesterol_mg_per_dl             180 non-null int64
oldpeak_eq_st_depression                180 non-null float64
sex                                     180 non-null int64
age                                     180 non-null int64
max_heart_rate_achieved                 180 non-null int64
exercise_induced_angina                 180 non-null int64
dtypes: float64(1), int64(11), object(2)
memory usage: 19.8+ KB


In [4]:
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 2 columns):
patient_id               180 non-null object
heart_disease_present    180 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.9+ KB


In [5]:
df['thal'].value_counts()

normal               98
reversible_defect    74
fixed_defect          8
Name: thal, dtype: int64

In [6]:
df.describe()

Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,1.55,131.311111,3.155556,0.694444,0.161111,1.05,249.211111,1.01,0.688889,54.811111,149.483333,0.316667
std,0.618838,17.010443,0.938454,0.969347,0.368659,0.998742,52.717969,1.121357,0.464239,9.334737,22.063513,0.466474
min,1.0,94.0,1.0,0.0,0.0,0.0,126.0,0.0,0.0,29.0,96.0,0.0
25%,1.0,120.0,3.0,0.0,0.0,0.0,213.75,0.0,0.0,48.0,132.0,0.0
50%,1.0,130.0,3.0,0.0,0.0,2.0,245.5,0.8,1.0,55.0,152.0,0.0
75%,2.0,140.0,4.0,1.0,0.0,2.0,281.25,1.6,1.0,62.0,166.25,1.0
max,3.0,180.0,4.0,3.0,1.0,2.0,564.0,6.2,1.0,77.0,202.0,1.0


In [7]:
df['slope_of_peak_exercise_st_segment'].value_counts()

1    93
2    75
3    12
Name: slope_of_peak_exercise_st_segment, dtype: int64

In [8]:
df['resting_blood_pressure'].value_counts()

130    26
140    22
120    21
110    11
150    10
160     9
112     8
128     8
125     7
108     5
124     5
135     4
118     4
145     4
132     3
180     3
138     3
134     2
178     2
126     2
142     2
100     2
152     2
136     2
115     2
172     1
102     1
105     1
106     1
117     1
170     1
156     1
155     1
122     1
144     1
94      1
Name: resting_blood_pressure, dtype: int64

In [9]:
df['chest_pain_type'].value_counts()

4    82
3    57
2    28
1    13
Name: chest_pain_type, dtype: int64

In [10]:
df['num_major_vessels'].value_counts()

0    106
1     37
2     23
3     14
Name: num_major_vessels, dtype: int64

In [11]:
df['fasting_blood_sugar_gt_120_mg_per_dl'].value_counts()

0    151
1     29
Name: fasting_blood_sugar_gt_120_mg_per_dl, dtype: int64

In [12]:
df['resting_ekg_results'].value_counts()

2    94
0    85
1     1
Name: resting_ekg_results, dtype: int64

In [13]:
df['chest_pain_type'].value_counts()

4    82
3    57
2    28
1    13
Name: chest_pain_type, dtype: int64

In [14]:
df['serum_cholesterol_mg_per_dl'].value_counts()

204    4
239    4
219    3
254    3
309    3
226    3
303    3
211    3
282    3
233    3
234    3
256    3
263    3
277    2
265    2
203    2
199    2
197    2
188    2
177    2
258    2
149    2
330    2
308    2
283    2
266    2
304    2
267    2
302    2
299    2
      ..
215    1
218    1
200    1
201    1
213    1
205    1
206    1
207    1
209    1
180    1
175    1
321    1
126    1
322    1
325    1
326    1
327    1
335    1
353    1
360    1
141    1
174    1
222    1
160    1
417    1
164    1
167    1
168    1
220    1
172    1
Name: serum_cholesterol_mg_per_dl, Length: 118, dtype: int64

In [15]:
df['oldpeak_eq_st_depression'].value_counts()

0.0    56
1.4    11
1.6    10
1.2     9
1.0     7
0.2     7
0.6     7
0.4     7
0.8     7
1.8     6
2.0     5
0.1     5
0.5     4
2.2     4
2.4     3
1.5     3
1.9     3
2.6     3
2.8     2
3.0     2
2.3     2
0.9     2
4.2     2
0.3     2
3.4     2
1.1     1
2.5     1
1.3     1
3.8     1
3.2     1
5.6     1
3.1     1
0.7     1
6.2     1
Name: oldpeak_eq_st_depression, dtype: int64

In [16]:
df['sex'].value_counts()

1    124
0     56
Name: sex, dtype: int64

In [17]:
df['age'].value_counts()

54    12
58    10
64     9
51     9
44     9
57     8
60     7
62     7
59     6
55     6
65     6
48     6
67     6
41     6
52     5
45     5
63     5
49     4
42     4
50     4
53     4
56     4
66     4
61     3
46     3
68     3
70     3
71     3
40     3
47     2
69     2
43     2
39     2
35     2
34     1
37     1
77     1
38     1
74     1
29     1
Name: age, dtype: int64

In [18]:
df['max_heart_rate_achieved'].value_counts()

162    10
132     6
142     6
150     5
170     5
172     5
173     5
152     5
125     5
165     4
145     4
122     4
158     4
147     4
151     4
154     3
155     3
131     3
160     3
163     3
179     3
130     3
168     3
144     3
140     3
111     3
175     3
126     3
169     3
138     2
       ..
181     2
182     2
103     2
161     2
115     1
202     1
113     1
112     1
120     1
108     1
106     1
105     1
99      1
97      1
117     1
149     1
121     1
123     1
133     1
134     1
137     1
141     1
192     1
167     1
177     1
180     1
184     1
186     1
188     1
96      1
Name: max_heart_rate_achieved, Length: 73, dtype: int64

In [19]:
df['exercise_induced_angina'].value_counts()

0    123
1     57
Name: exercise_induced_angina, dtype: int64

### Quality

* the column `thal` is a string and needs to be encoded
* the column `fasting_blood_sugar_gt_120_mg_per_dl` is a boolean, convert it to bool datatype to avoid confusion
* the column `sex` is a boolean, convert it to bool datatype to avoid confusion
* the column `exercise_induced_angina` is a boolean, convert it to bool datatype to avoid confusion


### Tidyness

the data doesn't have structural issue

In [20]:
## Categorical column

category_cols = ['chest_pain_type', 'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results', 'sex',
                'exercise_induced_angina']

## Numeric columns

numerical_cols = ['slope_of_peak_exercise_st_segment', 'resting_blood_pressure', 'num_major_vessels',
                 'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'age', 'max_heart_rate_achieved']

## Cleaning

In [21]:
df_clean = df.copy()
df_test_clean = df_test.copy()

In [22]:
# encode strings in thal column

In [23]:
lbl_encoder = LabelEncoder()

In [24]:
df['thal'].value_counts()

normal               98
reversible_defect    74
fixed_defect          8
Name: thal, dtype: int64

In [25]:
df_test['thal'].value_counts()

normal               54
reversible_defect    30
fixed_defect          6
Name: thal, dtype: int64

In [26]:
lbl_encoder.fit(df_clean['thal'])
df_clean['thal'] = lbl_encoder.transform(df_clean['thal'])

In [27]:
df_test_clean['thal'] = lbl_encoder.transform(df_test_clean['thal'])

In [28]:
# fix data type in blood sugar, sex and exercise induced

In [29]:
df_clean['fasting_blood_sugar_gt_120_mg_per_dl'] = df_clean['fasting_blood_sugar_gt_120_mg_per_dl'].astype(bool)

In [30]:
df_test_clean['fasting_blood_sugar_gt_120_mg_per_dl'] = df_test_clean['fasting_blood_sugar_gt_120_mg_per_dl'].astype(bool)

In [31]:
df_clean['sex'] = df_clean['sex'].astype(bool)

In [32]:
df_test_clean['sex'] = df_test_clean['sex'].astype(bool)

In [33]:
df_clean['exercise_induced_angina'] = df_clean['exercise_induced_angina'].astype(bool)

In [34]:
df_test_clean['exercise_induced_angina'] = df_test_clean['exercise_induced_angina'].astype(bool)

## Try fitting a simple model

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [36]:
df_train = df_clean.merge(df_label, on='patient_id', how='left')

In [37]:
df_train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present
0,0z64un,1,1,128,2,0,False,2,308,0.0,True,45,170,False,0
1,ryoo3j,2,1,110,3,0,False,0,214,1.6,False,54,158,False,0
2,yt1s1x,1,1,125,4,3,False,2,304,0.0,True,77,162,True,1
3,l2xjde,1,2,152,4,0,False,0,223,0.0,True,40,181,False,1
4,oyt4ek,3,2,178,1,0,False,2,270,4.2,True,59,145,False,0


In [38]:
X = df_train.drop(['patient_id', 'heart_disease_present'], axis=1)
y = df_train['heart_disease_present']

In [82]:
y.value_counts()

0    100
1     80
Name: heart_disease_present, dtype: int64

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [80]:
tree_clf = DecisionTreeClassifier(max_depth=9)

In [83]:
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [87]:
tree_clf.score(X_test, y_test)

0.8888888888888888

In [104]:
y_preds = tree_clf.predict_proba(X_test)[:,1]

In [105]:
# metrics used in competition
log_loss(y_test, y_preds)

3.837664032696471

In [107]:
# Use GridSearchCV for parameter tuning

In [108]:
tree = DecisionTreeClassifier()
params = {
    "max_depth":[3,6,9,12,15],
    "min_samples_leaf":[2,3,4,5,6,7,8]
}

In [109]:
grid_cv = GridSearchCV(tree, params, cv=5)

In [110]:
grid_cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 6, 9, 12, 15],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8]},
          

In [111]:
grid_cv.best_params_

{'max_depth': 6, 'min_samples_leaf': 3}

In [119]:
y_preds = grid_cv.predict_proba(X_test)
print(log_loss(y_test, y_preds))

3.954645366042257


## Random Forest

In [115]:
from sklearn.ensemble import RandomForestClassifier

In [117]:
rf = RandomForestClassifier(n_estimators = 100, max_depth=9)

In [118]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [120]:
y_preds = rf.predict_proba(X_test)
print(log_loss(y_test, y_preds))

0.38415713516996086


In [121]:
rf = RandomForestClassifier(n_estimators=1000)
params = {
    "max_depth":[3,6,9,12,15],
    "min_samples_leaf":[2,3,4,5,6,7,8]
}
grid_rf = GridSearchCV(rf, params, cv=5)

In [122]:
grid_rf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=1000, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [124]:
grid_rf.best_params_

{'max_depth': 3, 'min_samples_leaf': 8}

In [123]:
y_preds = grid_rf.predict_proba(X_test)
print(log_loss(y_test, y_preds))

0.4255896998055528


## XGBoost

In [125]:
import xgboost as xgb

In [None]:
xg = xgb.XGB

In [126]:
params = {"objective":"binary:logistic", "max_depth":9}

In [90]:
X_predict = df_test_clean.drop(['patient_id'], axis=1)

In [92]:
preds = tree_clf.predict(X_predict)

array([0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0])

In [None]:
df_test = pd.concat([df_test.drop(['thal'],axis=1), pd.get_dummies(df_test['thal'], prefix='thal')], axis=1)
df_test = pd.concat([df_test.drop(['chest_pain_type'],axis=1), pd.get_dummies(df_test['chest_pain_type'], prefix='chest')], axis=1)
df_test = pd.concat([df_test.drop(['sex'],axis=1), pd.get_dummies(df_test['sex'], prefix='sex')], axis=1)
df_test = pd.concat([df_test.drop(['exercise_induced_angina'],axis=1), pd.get_dummies(df_test['exercise_induced_angina'], prefix='exercise')], axis=1)
df_test = pd.concat([df_test.drop(['fasting_blood_sugar_gt_120_mg_per_dl'],axis=1), pd.get_dummies(df_test['fasting_blood_sugar_gt_120_mg_per_dl'], prefix='blood')], axis=1)
df_test = pd.concat([df_test.drop(['slope_of_peak_exercise_st_segment'],axis=1), pd.get_dummies(df_test['slope_of_peak_exercise_st_segment'], prefix='slope')], axis=1)

In [337]:
df['patient_id'] = df['patient_id'].astype(str)
df_label['patient_id'] = df_label['patient_id'].astype(str)

In [338]:
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 2 columns):
patient_id               180 non-null object
heart_disease_present    180 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.9+ KB


In [339]:
df_join.columns

Index(['patient_id', 'resting_blood_pressure', 'num_major_vessels',
       'resting_ekg_results', 'serum_cholesterol_mg_per_dl',
       'oldpeak_eq_st_depression', 'age', 'max_heart_rate_achieved',
       'heart_disease_present', 'thal_fixed_defect', 'thal_normal',
       'thal_reversible_defect', 'chest_1', 'chest_2', 'chest_3', 'chest_4',
       'sex_0', 'sex_1', 'exercise_0', 'exercise_1', 'blood_0', 'blood_1',
       'slope_1', 'slope_2', 'slope_3'],
      dtype='object')

In [340]:
df_join = pd.merge(df,df_label, on='patient_id')

In [70]:
df_join.columns

Index(['patient_id', 'resting_blood_pressure', 'num_major_vessels',
       'resting_ekg_results', 'serum_cholesterol_mg_per_dl',
       'oldpeak_eq_st_depression', 'age', 'max_heart_rate_achieved',
       'heart_disease_present', 'thal_fixed_defect', 'thal_normal',
       'thal_reversible_defect', 'chest_1', 'chest_2', 'chest_3', 'chest_4',
       'sex_0', 'sex_1', 'exercise_0', 'exercise_1', 'blood_0', 'blood_1',
       'slope_1', 'slope_2', 'slope_3'],
      dtype='object')

In [341]:
df_join = pd.concat([df_join.drop(['thal'],axis=1), pd.get_dummies(df_join['thal'], prefix='thal')], axis=1)
df_join = pd.concat([df_join.drop(['chest_pain_type'],axis=1), pd.get_dummies(df_join['chest_pain_type'], prefix='chest')], axis=1)
df_join = pd.concat([df_join.drop(['sex'],axis=1), pd.get_dummies(df_join['sex'], prefix='sex')], axis=1)
df_join = pd.concat([df_join.drop(['exercise_induced_angina'],axis=1), pd.get_dummies(df_join['exercise_induced_angina'], prefix='exercise')], axis=1)
df_join = pd.concat([df_join.drop(['fasting_blood_sugar_gt_120_mg_per_dl'],axis=1), pd.get_dummies(df_join['fasting_blood_sugar_gt_120_mg_per_dl'], prefix='blood')], axis=1)
df_join = pd.concat([df_join.drop(['slope_of_peak_exercise_st_segment'],axis=1), pd.get_dummies(df_join['slope_of_peak_exercise_st_segment'], prefix='slope')], axis=1)

In [426]:
X = df_join.drop(['patient_id','heart_disease_present'],axis=1)
y = df_join['heart_disease_present']

In [633]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42, stratify=y)

In [634]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [635]:
features = th.tensor(X_train)
labels = th.tensor(y_train.values)
dataset = TensorDataset(features, labels)
trainloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [636]:
features_test = th.tensor(X_test)
labels_test = th.tensor(y_test.values)
dataset_test = TensorDataset(features_test, labels_test)
testloader = DataLoader(dataset_test)

In [343]:
features = th.tensor(X.drop(['patient_id'], axis=1).values)
labels = th.tensor(y.values)
dataset = TensorDataset(features, labels)
trainloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [699]:
from torch import nn, optim
import torch.nn.functional as F

def MinMax(x):
    scaler = MinMaxScaler()
    scaled_x = scaler.fit_transform(x.detach().numpy())
    return th.tensor(x)
    

class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(23, 512)
        self.fc2 = nn.Linear(512, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout1 = nn.Dropout(p=0.5)
        self.dropout2 = nn.Dropout(p=0.25)
    
    def forward(self, x):
        x = x.float()
        x = self.fc1(x)
        x = F.leaky_relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = F.leaky_relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = th.sigmoid(x)
        return x

In [700]:
model = Classifier()

In [701]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [702]:
import warnings
warnings.filterwarnings("ignore")

In [703]:
epochs = 100

for e in range(epochs):
    running_loss = 0
    for features, labels in trainloader:
        features = features.float()
        labels = labels.float()
        output = model.forward(features)
        loss = criterion(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        test_loss = 0
        accuracy = 0
        with th.no_grad():
            for features, labels in testloader:
                features = features.float()
                labels = labels.float()
                output = model.forward(features)
            
                accuracy += th.mean((convertToProb(output.reshape(1)).float() == labels).float())
                test_loss += criterion(output, labels)

        print(f"Training loss: {running_loss/len(trainloader)}")
        print(f"Validation loss: {test_loss/len(testloader)}")
        print(f"Accuracy: {accuracy/len(testloader)}")

Training loss: 0.5973690390586853
Validation loss: 0.38778427243232727
Accuracy: 0.8888888955116272
Training loss: 0.45341333746910095
Validation loss: 0.3275359570980072
Accuracy: 0.8333333134651184
Training loss: 0.38610807061195374
Validation loss: 0.31084132194519043
Accuracy: 0.8611111044883728
Training loss: 0.3759234338998795
Validation loss: 0.3644234240055084
Accuracy: 0.8333333134651184
Training loss: 0.373495489358902
Validation loss: 0.3436714708805084
Accuracy: 0.8611111044883728
Training loss: 0.3899455785751343
Validation loss: 0.45251432061195374
Accuracy: 0.7777777910232544
Training loss: 0.3044384211301804
Validation loss: 0.3349950611591339
Accuracy: 0.8611111044883728
Training loss: 0.321978497505188
Validation loss: 0.37746962904930115
Accuracy: 0.8333333134651184
Training loss: 0.31181941032409666
Validation loss: 0.34159979224205017
Accuracy: 0.8611111044883728
Training loss: 0.2791659295558929
Validation loss: 0.4511624872684479
Accuracy: 0.8055555820465088
Trai

In [704]:
model.eval()

Classifier(
  (fc1): Linear(in_features=23, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout1): Dropout(p=0.5)
  (dropout2): Dropout(p=0.25)
)

In [705]:
preds = []
for idx, row in df_test.iterrows():
    test_feature = row.drop(['patient_id']).values
    test_feature = scaler.transform(test_feature.reshape(1,-1))
    test_feature = test_feature.astype(float)
    test_tensor = th.tensor(test_feature)
    with th.no_grad():
        output = model.forward(test_tensor)
        preds.append(output)

In [706]:
preds

[tensor([[0.4779]]),
 tensor([[2.5147e-05]]),
 tensor([[1.0000]]),
 tensor([[0.0006]]),
 tensor([[1.0000]]),
 tensor([[3.9547e-06]]),
 tensor([[0.0001]]),
 tensor([[1.]]),
 tensor([[0.8109]]),
 tensor([[0.0221]]),
 tensor([[0.0024]]),
 tensor([[0.9974]]),
 tensor([[0.0961]]),
 tensor([[1.]]),
 tensor([[4.6887e-06]]),
 tensor([[1.3515e-07]]),
 tensor([[1.8679e-08]]),
 tensor([[2.2648e-07]]),
 tensor([[1.0000]]),
 tensor([[0.1702]]),
 tensor([[1.]]),
 tensor([[0.0005]]),
 tensor([[0.0002]]),
 tensor([[4.9976e-05]]),
 tensor([[0.1669]]),
 tensor([[1.0000]]),
 tensor([[1.9294e-08]]),
 tensor([[0.9520]]),
 tensor([[0.9986]]),
 tensor([[1.3542e-06]]),
 tensor([[1.]]),
 tensor([[0.9668]]),
 tensor([[0.0016]]),
 tensor([[0.9364]]),
 tensor([[0.5422]]),
 tensor([[0.0017]]),
 tensor([[0.7506]]),
 tensor([[0.0020]]),
 tensor([[0.0009]]),
 tensor([[7.4486e-06]]),
 tensor([[1.]]),
 tensor([[2.4595e-05]]),
 tensor([[0.9998]]),
 tensor([[0.0032]]),
 tensor([[1.0000]]),
 tensor([[1.6059e-06]]),
 tenso

In [558]:
import csv
with open('test.csv', 'w') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(["patient_id","heart_disease_present"])
    for i in range(len(preds)):
        patient_id = df_test.loc[i, "patient_id"]
        pred = preds[i].numpy()[0][0]
        writer.writerow([patient_id, pred])

In [396]:
df_test.loc[0, 'patient_id']

'olalu7'

In [584]:
def convertToProb(tensors):
    return tensors >= 0.5
