In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from sklearn.svm import SVC
from lightgbm import LGBMClassifier

from sklearn.cluster import KMeans

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

In [3]:
# Read the raw table plus the pre-split train/test features and targets.
# The first CSV column of every file is used as the row index (index_col=0).
raw, train_X, test_X, train_y, test_y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../Data/dat_raw.csv',
        '../Data/X_train.csv',
        '../Data/X_test.csv',
        '../Data/y_train.csv',
        '../Data/y_test.csv',
    )
)

In [4]:
# Summarise the training features: column names, non-null counts and dtypes.
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37216 entries, 0 to 37215
Data columns (total 84 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   HeartRate_Mean                                             37216 non-null  float64
 1   SysBP_Mean                                                 37216 non-null  float64
 2   DiasBP_Mean                                                37216 non-null  float64
 3   TempC_Max                                                  37216 non-null  float64
 4   RespRate_Mean                                              37216 non-null  float64
 5   Glucose_Mean                                               37216 non-null  float64
 6   ICU_LOS                                                    37216 non-null  float64
 7   age                                                        37216 non-null  float64
 8   ANIONG

In [5]:
# (rows, columns) of the training and test feature tables.
train_X.shape, test_X.shape

((37216, 84), (9304, 84))

In [6]:
# Distinct values found in the `target` column of the training labels.
train_y.target.unique()

array([1, 3, 2, 4])

Perceptron linear-separability check:
- training f1: 0.5966
- training accuracy: 0.6409
- accuracy per class: class 1 very high, over 90\%

In [7]:
# Take NumPy copies of the feature frames, and flatten the target frames to 1-D
# label vectors with ravel().
X_training, X_testing = (frame.copy().values for frame in (train_X, test_X))
y_training, y_testing = (frame.copy().values.ravel() for frame in (train_y, test_y))

Cluster elbow check: n_clusters = 4

In [8]:
# Disabled elbow-method check (kept for reference only, nothing here executes):
# fits KMeans for k = 1..10 on X_training, collects each model's inertia_ (WCSS)
# and plots it against the number of clusters.
# # clustering checking
# wcss = []
# for i in range(1,11):
#     kmeans = KMeans(n_clusters=i, init='k-means++', random_state=21)
#     kmeans.fit(X_training)
#     wcss.append(kmeans.inertia_)
    
# plt.plot(range(1,11), wcss)
# plt.title('The Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

add a clustering feature

In [10]:
n_clusters = 4

# Cluster on the original feature columns only. Dropping any existing
# 'km_cluster' column first makes this cell idempotent: re-running it never
# clusters on a previously appended cluster label, and it no longer depends on
# whichever cell last assigned X_training / X_testing.
cluster_inputs_train = train_X.drop(columns='km_cluster', errors='ignore').values
cluster_inputs_test = test_X.drop(columns='km_cluster', errors='ignore').values

# kmeans clustering
# n_init is pinned explicitly so the fit does not change with scikit-learn's
# changed default for this parameter.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=21)
kmeans.fit(cluster_inputs_train)
clf = kmeans  # alias kept because this cell originally bound the model to `clf`

# The test rows are only assigned to centroids learned from the training rows.
y_labels_train = kmeans.predict(cluster_inputs_train)
y_labels_test = kmeans.predict(cluster_inputs_test)

# add as feature
train_X['km_cluster'] = y_labels_train
test_X['km_cluster'] = y_labels_test

### SVM model

In [12]:
# Re-extract the NumPy arrays so the feature matrices include the 'km_cluster'
# column that was added to train_X / test_X; targets are flattened to 1-D.
X_training, y_training = train_X.copy().values, train_y.copy().values.ravel()
X_testing, y_testing = test_X.copy().values, test_y.copy().values.ravel()

LightGBM check (5-fold cross-validation)

In [15]:
# 5 fold cv Structure
# Cross-validates a one-vs-one LightGBM classifier on the training arrays and
# reports weighted F1 / accuracy per fold plus their means.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Fixed label order so every fold's confusion matrix has the same rows/columns,
# even if a class happens to be missing from a held-out fold.
class_labels = np.unique(y_training)

acc = []  # held-out accuracy, one entry per fold
f1 = []   # held-out weighted F1, one entry per fold
for i, (train_index, test_index) in enumerate(kf.split(X_training), start=1):
    print('CV Fold {}'.format(i))

    # train_test_split
    X_train, X_test = X_training[train_index, :], X_training[test_index, :]
    y_train, y_test = y_training[train_index], y_training[test_index]

    # scaling, minmax (fitted on the training fold only, then applied to the held-out fold)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # model, ovo
    clf = OneVsOneClassifier(LGBMClassifier(random_state=0))
    clf.fit(X_train, y_train)

    # make predictions
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # metrics
    # training
    print('-'*50)
    print('Training f1: {0:.4f}'.format(f1_score(y_train, y_train_pred, average='weighted')))
    print('Training Accuracy: {0:.4f}'.format(accuracy_score(y_train, y_train_pred)))
    # testing
    print('-'*50)
    fold_acc = accuracy_score(y_test, y_test_pred)
    # The target is multiclass, so f1_score needs an explicit `average`;
    # the default average='binary' raises ValueError here.
    fold_f1 = f1_score(y_test, y_test_pred, average='weighted')
    acc.append(fold_acc)
    f1.append(fold_f1)
    print('Testing f1: {0:.4f}'.format(fold_f1))
    print('Testing Accuracy: {0:.4f}'.format(fold_acc))
    print('Testing Confusion Matrix:')
    # Build the row-normalised matrix from the predictions already computed
    # (no second predict pass). ConfusionMatrixDisplay is used because
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
    fold_cm = confusion_matrix(y_test, y_test_pred,
                               labels=class_labels,
                               normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=fold_cm,
                                  display_labels=class_labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.show()

    print('='*75)

# Average over the folds actually recorded instead of a hard-coded divisor.
print('Mean CV f1: {0:.4f}'.format(np.mean(f1)))
print('Mean CV Accuracy: {0:.4f}'.format(np.mean(acc)))

CV Fold 1


KeyboardInterrupt: 

In [14]:
# Disabled SVC variant of the 5-fold CV loop (kept for reference only, nothing
# here executes): same fold split, MinMax scaling and metrics, but with
# OneVsOneClassifier(SVC()) as the model.
# NOTE(review): the commented f1.append(f1_score(y_test, y_test_pred)) call has
# no `average` argument; with a multiclass target scikit-learn raises ValueError
# for the default average='binary'. Pass average='weighted' before re-enabling.
# # 5 fold cv Structure
# kf = KFold(n_splits=5, shuffle=True, random_state=123)

# i=0
# acc = []
# f1 = []
# for train_index, test_index in kf.split(X_training):
#     i += 1
#     print('CV Fold {}'.format(i))
    
#     # train_test_split
#     X_train, X_test = X_training[train_index, :], X_training[test_index, :]
#     y_train, y_test = y_training[train_index], y_training[test_index]
    
#     # scaling, minmax
#     scaler = MinMaxScaler()
#     X_train = scaler.fit_transform(X_train)
#     X_test = scaler.transform(X_test)
    
#     # model, ovo
#     clf = OneVsOneClassifier(SVC())
#     clf.fit(X_train, y_train)

#     # make predictions
#     y_train_pred = clf.predict(X_train)
#     y_test_pred = clf.predict(X_test) 
    
#     # metrics
#     # training
#     print('-'*50)
#     print('Training f1: {0:.4f}'.format(f1_score(y_train, y_train_pred, average='weighted')))
#     print('Training Accuracy: {0:.4f}'.format(accuracy_score(y_train, y_train_pred)))
#     # testing
#     print('-'*50)
#     acc.append(accuracy_score(y_test, y_test_pred))
#     f1.append(f1_score(y_test, y_test_pred))
#     print('Testing f1: {0:.4f}'.format(f1_score(y_test, y_test_pred, average='weighted')))
#     print('Testing Accuracy: {0:.4f}'.format(accuracy_score(y_test, y_test_pred)))
#     print('Testing Confusion Matrix:')
#     disp = plot_confusion_matrix(clf, X_test, y_test,
#                                      cmap=plt.cm.Blues,
#                                      normalize='true')
#     plt.show()
    
#     print('='*75)

# print('Mean CV f1: {0:.4f}'.format(np.sum(f1)/5))
# print('Mean CV Accuracy: {0:.4f}'.format(np.sum(acc)/5))

CV Fold 1


KeyboardInterrupt: 