In [1]:
import copy
import os
import random
from itertools import product
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids, AllKNN
from matplotlib.pyplot import pie, axis, show
from prettytable import PrettyTable
from scipy import stats
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import tree
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif, SelectFromModel, RFE
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB # Import Naive Bayes Classifier
from sklearn.neighbors import KNeighborsClassifier # Import KNN Classifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC, LinearSVC # Import SVM Classifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

from helpers import read_and_process_data, plot_pie_chart

# Seed every global RNG this notebook can touch. Seeding only the stdlib
# `random` module leaves NumPy's global generator unseeded, and NumPy's
# generator is the one numeric libraries fall back to when no explicit
# random_state is supplied.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the encounter table through the project helper; the helper also prints
# the record/feature summary shown in this cell's output.
data = read_and_process_data('dataset_diabetes/diabetic_data.csv')

# Preview the first five rows (head() defaults to n=5).
data.head()

---Retreived data from dataset_diabetes/diabetic_data.csv
Number of records: 101766 Number of features: 50
+--------------------------+---------------------------+
|         Feature          | Number of Unique Elements |
+--------------------------+---------------------------+
|       encounter_id       |           101766          |
|       patient_nbr        |           71518           |
|           race           |             6             |
|          gender          |             3             |
|           age            |             10            |
|          weight          |             10            |
|    admission_type_id     |             8             |
| discharge_disposition_id |             26            |
|   admission_source_id    |             17            |
|     time_in_hospital     |             14            |
|        payer_code        |             18            |
|    medical_specialty     |             73            |
|    num_lab_procedures    |          

In [None]:
# The target is the column at position 1; every other column is a feature.
target_col = data.columns[1]
features = [*data.columns[:1], *data.columns[2:]]

# Feature matrix and integer-cast target vector as plain NumPy arrays.
X = np.array(data.loc[:, features])
y = np.array(data[target_col]).astype(int)

print("Input data shape", X.shape, "Output data shape", y.shape)

In [None]:
# Stratified, shuffled 10-fold split with a fixed seed.
# The seed is written as the integer 0: the previous `random_state=False`
# was interpreted as seed 0 anyway, but read as if seeding were disabled.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Materialise the (train_indices, test_indices) pairs once so that the folds
# can be indexed and reused by later cells.
folds = list(kfold.split(X, y))

In [None]:
# Recursive feature elimination down to 10 features, dropping one per step,
# ranked by a linear-kernel SVR.
# NOTE(review): SVR is a regressor, so the integer-cast labels are treated as
# a numeric target here - confirm this is intended.
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=10, step=1)

# Fit only on the held-out indices of the first fold (one of the 10 splits),
# presumably to keep the RFE run tractable - TODO confirm.
subset_idx = folds[0][1]
selected = selector.fit(X[subset_idx], y[subset_idx])

# Show the boolean keep-mask and then the per-feature elimination ranking.
for fitted_attr in (selected.support_, selected.ranking_):
    print(fitted_attr)

In [None]:
# Pair every kept entry of the RFE mask with the column it refers to in X.
# X was built from `features` (the target at column position 1 removed, the
# last column kept), so names and indices must be taken from `features`.
# Enumerating data.columns[:-1] instead keeps the target and drops the last
# column, which shifts every label from index 1 onward.
assert len(features) == len(selected.support_), "mask/feature-name length mismatch"

selected_feat = [
    (col, i)
    for i, (col, keep) in enumerate(zip(features, selected.support_))
    if keep
]
for col, i in selected_feat:
    print(col, i)

# NOTE(review): the listing below appears to be pasted from an earlier run.
# It has 25 entries although the selector is configured for 10 features, and
# it was produced with the old data.columns[:-1] labelling. Re-run this cell
# and refresh or remove it.
# gender 1
# age 2
# admission_type_id 3
# discharge_disposition_id 4
# admission_source_id 5
# num_procedures 9
# num_medications 10
# number_outpatient 11
# number_emergency 12
# number_inpatient 13
# diag_3 16
# number_diagnoses 17
# max_glu_serum 18
# A1Cresult 19
# repaglinide 21
# nateglinide 22
# chlorpropamide 23
# glimepiride 24
# glipizide 26
# pioglitazone 29
# acarbose 31
# miglitol 32
# tolazamide 34
# glipizide-metformin 39
# diabetesMed 44

In [None]:
# Project the full feature matrix onto the columns kept by the fitted RFE
# selector (configured above with n_features_to_select=10).
X_rfe = selector.transform(X)
# Shown as the cell result to confirm the reduced column count.
X_rfe.shape

In [None]:
# Over-sample every class except the majority with ADASYN.
# The input is the RFE-reduced matrix X_rfe: resampling the full X left X_rfe
# unused and contradicted the "10Features" artefacts written at the end of
# the notebook.
# (The original cell also kept a commented-out SMOTEENN(random_state=0)
# variant applied to X_rfe as an alternative combined sampler.)
over_sampler = ADASYN(sampling_strategy='not majority',
                      n_neighbors=7, random_state=0)
X_cs, y_cs = over_sampler.fit_resample(X_rfe, y)

print('After over-sampling', X_cs.shape, y_cs.shape)

In [None]:
# Second stage of the combined sampling: AllKNN under-sampling applied to the
# over-sampled data, targeting all classes.
# Caution: this rebinds X_cs / y_cs, so re-running this cell without first
# re-running the over-sampling cell applies the cleaning a second time.
under_sampler = AllKNN(sampling_strategy='all', n_neighbors=7)
X_cs, y_cs = under_sampler.fit_resample(X_cs, y_cs)

print('After combined-sampling', X_cs.shape, y_cs.shape)

In [None]:
# Persist the resampled arrays and the selected-feature listing.
# Create the output directory first: np.save does not create missing parent
# directories, and failing here would discard the expensive work above.
os.makedirs('npy_data_2', exist_ok=True)

np.save('npy_data_2/X_combinedSampling_10Features_task2.npy', X_cs)
np.save('npy_data_2/y_combinedSampling_10Features_task2.npy', y_cs)
# NOTE(review): selected_feat is a list of (name, index) tuples; NumPy is
# expected to coerce it to a single string-typed array, so the indices are
# presumably stored as text - confirm the loader expects that.
np.save('npy_data_2/selected_features_10_task2.npy', selected_feat)