# Machine Learning Modeling Using SMOTE and Bootstrapping

In [1]:
%%capture
%matplotlib inline
import pycaret

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from Exploratory_Data_Analysis import *
from pycaret.classification import *

### The Dataset

In [2]:
X = ds.drop("class", axis=1)
y = ds["class"]


### Fit SMOTE to training data

In [3]:
### Instantiating SMOTE
sm = SMOTE(random_state=42)

### Fit SMOTE to training data
X_resampled, y_resampled = sm.fit_resample(X, y)

### Printing class disribution of original and resampled data


In [4]:
print("Class Distribution before Resampleing: ", y.value_counts())

print("\nClass Distribution afer Resampling", y_resampled.value_counts())

Class Distribution before Resampleing:  class
1    123
0     32
Name: count, dtype: int64

Class Distribution afer Resampling class
1    123
0    123
Name: count, dtype: int64


####  Using bootstrapping to further increase dataset size

In [5]:
X_resampled, y_resampled = resample(X_resampled, y_resampled, n_samples=10000, random_state=42)

In [6]:
y_resampled.value_counts()

class
1    5047
0    4953
Name: count, dtype: int64

In [7]:
X_resampled.head()

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
102,27,0,0,0,1,0,0,1,1,0,0,0,0,2.4,168.0,227.0,3.0,66.0,1
179,43,0,0,0,1,1,1,1,0,0,1,0,0,4.547882,105.325397,54.270345,3.227035,63.398219,0
92,33,0,1,0,0,0,0,2,2,0,0,0,0,1.0,105.325397,60.0,4.0,62.280899,1
14,47,0,0,1,0,0,0,1,0,0,0,0,0,1.427517,105.325397,60.0,3.817266,62.280899,0
106,35,0,0,0,1,0,0,2,2,1,1,1,0,1.5,138.0,58.0,2.6,62.280899,1


#### combine "X_resampled" and "y_resampled"

In [8]:
train_data = pd.concat([X_resampled, y_resampled], axis=1)

s = setup(data=train_data, target='class', session_id=123, normalize=True, index=False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,class
2,Target type,Binary
3,Original data shape,"(417530, 20)"
4,Transformed data shape,"(417530, 20)"
5,Transformed train set shape,"(292271, 20)"
6,Transformed test set shape,"(125259, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


In [9]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,73.348
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.598
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.238
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,38.798
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.58
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.633
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,203.735
ada,Ada Boost Classifier,0.9933,0.9998,0.9924,0.9945,0.9935,0.9867,0.9867,8.147
svm,SVM - Linear Kernel,0.9005,0.9631,0.8967,0.907,0.9018,0.801,0.8012,0.664
lr,Logistic Regression,0.8968,0.965,0.897,0.9,0.8985,0.7935,0.7935,1.779
