In [1]:
%matplotlib inline

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC, SVC, OneClassSVM

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

In [3]:
diabetes_data = pd.read_csv('data/dataset_diabetes/diabetic_data.csv')

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [6]:
diabetes_data.shape

(101766, 50)

In [7]:
diabetes_target = diabetes_data['readmitted']
diabetes_attributes = diabetes_data.drop('readmitted', axis = 1)

In [8]:
diabetes_attributes = pd.get_dummies(diabetes_attributes)

In [9]:
diabetes_attributes.shape

(101766, 2472)

In [10]:
diabetes_attributes_scaled = MinMaxScaler().fit_transform(diabetes_attributes)

In [16]:
all_data, _, all_targets,_ = train_test_split(diabetes_attributes_scaled, diabetes_target, train_size = 0.1)

In [17]:
len(all_data)

10176

In [20]:
attributes_train, attributes_test, targets_train, targets_test = train_test_split(all_data, all_targets, test_size = 0.2, stratify = all_targets)

In [21]:
attributes_train.shape

(8140, 2472)

In [30]:
svm = LinearSVC(C = 10, max_iter = 1e3)

In [31]:
svm.fit(attributes_train, targets_train)



LinearSVC(C=10, max_iter=1000.0)

In [32]:
svm.coef_

array([[-0.12077796,  0.18431837, -0.13196259, ..., -0.07238149,
        -0.1071799 , -0.10382116],
       [-0.58566571,  0.50874991,  0.19021258, ..., -0.01720119,
        -0.02017774,  0.08987642],
       [ 0.77117301, -0.60816938, -0.08323564, ..., -0.00487429,
        -0.00192602, -0.00730364]])

In [36]:
gaussian_svm = SVC(kernel = 'rbf', C = 10)

In [37]:
gaussian_svm.fit(attributes_train, targets_train)

SVC(C=10)

In [40]:
print(classification_report(targets_train, svm.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.42      0.44      0.43       911
         >30       0.66      0.49      0.56      2873
          NO       0.70      0.81      0.75      4356

    accuracy                           0.66      8140
   macro avg       0.59      0.58      0.58      8140
weighted avg       0.65      0.66      0.65      8140



In [41]:
print(classification_report(targets_train, gaussian_svm.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.97      0.43      0.59       911
         >30       0.86      0.78      0.82      2873
          NO       0.81      0.96      0.88      4356

    accuracy                           0.84      8140
   macro avg       0.88      0.72      0.76      8140
weighted avg       0.85      0.84      0.83      8140



In [42]:
print(classification_report(targets_test, svm.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.15      0.17      0.16       228
         >30       0.43      0.32      0.37       718
          NO       0.60      0.68      0.64      1090

    accuracy                           0.50      2036
   macro avg       0.39      0.39      0.39      2036
weighted avg       0.49      0.50      0.49      2036



In [43]:
print(classification_report(targets_test, gaussian_svm.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.26      0.05      0.08       228
         >30       0.47      0.39      0.43       718
          NO       0.60      0.77      0.68      1090

    accuracy                           0.56      2036
   macro avg       0.44      0.40      0.39      2036
weighted avg       0.52      0.56      0.52      2036



In [50]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [51]:
knn.fit(attributes_train, targets_train)

KNeighborsClassifier()

In [52]:
knn.predict(attributes_train)

array(['NO', 'NO', 'NO', ..., 'NO', 'NO', 'NO'], dtype=object)

In [53]:
print(classification_report(targets_train, knn.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.44      0.32      0.37       911
         >30       0.58      0.64      0.61      2873
          NO       0.73      0.73      0.73      4356

    accuracy                           0.65      8140
   macro avg       0.59      0.56      0.57      8140
weighted avg       0.65      0.65      0.65      8140



In [54]:
print(classification_report(targets_test, knn.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.14      0.10      0.11       228
         >30       0.39      0.44      0.41       718
          NO       0.59      0.58      0.59      1090

    accuracy                           0.48      2036
   macro avg       0.37      0.37      0.37      2036
weighted avg       0.47      0.48      0.47      2036



In [57]:
anomaly_detector = OneClassSVM(nu = 0.02)

In [58]:
anomaly_detector.fit(attributes_train)

OneClassSVM(nu=0.02)

In [61]:
predictions = anomaly_detector.predict(attributes_train)

In [62]:
predictions.sum() / len(predictions)

0.9606879606879607