In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import copy

from itertools import product
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.naive_bayes import GaussianNB # Import Naive Bayes Classifier
from sklearn.svm import SVC, LinearSVC # Import SVM Classifier
from sklearn.neighbors import KNeighborsClassifier # Import KNN Classifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import tree
from pprint import pprint
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from prettytable import PrettyTable
from scipy import stats
from matplotlib.pyplot import pie, axis, show
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif, SelectFromModel, RFE
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

from helpers import read_and_process_data

# Seed every global RNG this notebook can touch. scikit-learn and
# imbalanced-learn fall back to NumPy's global generator whenever a
# `random_state` is left as None, so seeding only Python's `random`
# module is not enough for a reproducible Restart & Run All.
random.seed(0)
np.random.seed(0)

In [2]:
# Load the diabetes encounter table through the project helper and preview
# the first five rows.
# NOTE(review): read_and_process_data is imported from helpers (not visible
# here); which cleaning/encoding steps it applies is not shown — confirm.
data = read_and_process_data('dataset_diabetes/diabetic_data.csv')
data.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [15]:
# Every column except the last is treated as a predictor; the last column
# is the label and is cast to int.
features = data.columns[:-1]
X = np.array(data.loc[:, features])  # Features
y = np.array(data.loc[:, data.columns[-1]]).astype(int)  # Target variable
print("Input data shape", X.shape, "Output data shape", y.shape)

Input data shape (98053, 45) Output data shape (98053,)


In [16]:
# Ten stratified, shuffled folds with a fixed integer seed. The previous
# `random_state=False` only worked because bool is a subtype of int and
# silently meant seed 0; spelling it as 0 keeps the same splits and makes
# the intent explicit.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Materialise the (train_indices, test_indices) pairs so the folds can be
# indexed and reused by later cells (split() itself returns a generator).
folds = list(kfold.split(X, y))

In [17]:
# Recursive feature elimination down to 15 features, dropping one feature
# per iteration, ranked by a linear-kernel SVR.
# The selector is fitted only on the rows in folds[0][1] (the test indices
# of the first fold), not on the full dataset.
# NOTE(review): SVR is a regressor applied to integer class labels here,
# and the subset used for selection is later part of the resampled data —
# confirm both are intentional.
estimator = SVR(kernel="linear")
selector = RFE(estimator=estimator, n_features_to_select=15, step=1)
selected = selector.fit(X[folds[0][1]], y[folds[0][1]])
# Boolean keep-mask first, then the elimination ranking (1 = kept).
print(selected.support_, selected.ranking_, sep="\n")

[False False  True False  True False False False False False False  True
  True  True False False False  True  True False False  True  True  True
 False False False False False  True False  True  True False  True False
 False False False  True False False False False False]
[24  9  1  5  1 10 12 13 15  2  6  1  1  1 18 17  3  1  1 11 21  1  1  1
  4 26  8 20 19  1 16  1  1 25  1 29 30 14 22  1 28 31 27 23  7]


In [18]:
# Walk the predictor columns alongside the RFE keep-mask, printing and
# collecting (column name, column position) for every retained feature.
selected_feat = []
for i, (col, tak) in enumerate(zip(data.columns[:-1], selected.support_)):
    if not tak:
        continue
    print(col, i)
    selected_feat.append((col, i))

age 2
discharge_disposition_id 4
number_outpatient 11
number_emergency 12
number_inpatient 13
number_diagnoses 17
max_glu_serum 18
repaglinide 21
nateglinide 22
chlorpropamide 23
pioglitazone 29
acarbose 31
miglitol 32
tolazamide 34
glipizide-metformin 39


In [19]:
# Apply the fitted RFE selector to the full feature matrix, keeping only the
# columns it retained, then display the resulting shape as the cell output.
X_rfe = selector.transform(X)
X_rfe.shape

(98053, 15)

In [20]:
# Alternative combined sampler that was tried and left disabled:
# X_cs, y_cs = SMOTEENN(random_state=0).fit_resample(X_rfe, y)

# Two-stage resampling of the RFE-reduced data.
# The keyword is `sampling_strategy`; the previous spelling
# `sampling_strateg` is not a parameter of either sampler and raises
# TypeError when the cell is executed.
# Stage 1: ADASYN over-samples every class except the majority, using
#          7 nearest neighbours to synthesise new samples.
# NOTE(review): `n_jobs` is deprecated/removed for these samplers in newer
# imbalanced-learn releases — confirm against the installed version.
X_cs, y_cs = ADASYN(sampling_strategy='not majority',
                    n_neighbors=7, n_jobs=4,
                    random_state=0).fit_resample(X_rfe, y)
# Stage 2: ClusterCentroids under-samples every class except the minority;
#          voting='soft' keeps the cluster centroids themselves as samples.
X_cs, y_cs = ClusterCentroids(sampling_strategy='not minority',
                              voting='soft', n_jobs=4,
                              random_state=0).fit_resample(X_cs, y_cs)

print(X_cs.shape, y_cs.shape)

(27209, 15) (27209,)


In [22]:
# Write the resampled features/labels and the selected feature list to disk.
# np.save does not create missing directories, so 'npy_data_2/' must already
# exist or these calls raise FileNotFoundError.
np.save('npy_data_2/X_combinedSampling_15Features_task1.npy', X_cs)
np.save('npy_data_2/y_combinedSampling_15Features_task1.npy', y_cs)
# selected_feat is a list of (column name, column position) tuples.
# NOTE(review): NumPy presumably coerces this mixed list to a string array,
# so the positions would load back as text — confirm when reading the file.
np.save('npy_data_2/selected_features_15_task1.npy', selected_feat)