# [How to Handle SMOTE Data in Imbalanced Classification Problems](https://towardsdatascience.com/how-to-handle-smote-data-in-imbalanced-classification-problems-cf4b86e8c6a1)

In [1]:
import pandas as pd

In [2]:
# Load the breast-cancer dataset that ships with scikit-learn. Later cells
# read the feature matrix from `cancer.data` and the labels from
# `cancer.target`.
from sklearn import datasets

cancer = datasets.load_breast_cancer()

In [3]:
# Pull the features (X) and the class labels (y) out of the loaded dataset.
X, y = cancer.data, cancer.target

In [4]:
from sklearn.model_selection import train_test_split

# Hold out a test set. `stratify=y` keeps the class proportions the same in
# both splits, and the fixed seed makes the split reproducible. No test size
# is passed, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

In [5]:
from sklearn.linear_model import LogisticRegression

# Final estimator of the pipeline: logistic regression using the newton-cg
# solver, with a raised iteration cap and a fixed seed.
clf = LogisticRegression(
    solver='newton-cg',
    max_iter=1000,
    random_state=0,
)

In [6]:
# `make_pipeline` is taken from imblearn (not sklearn) because the pipeline
# built later contains a sampler step (SMOTE).
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler, used as the first pipeline step.
sm = SMOTE(random_state=0)

In [7]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): `scaler` is created here but none of the visible cells use
# it; the pipeline is built from `sm` and `clf` only. Confirm whether
# leaving feature scaling out of the pipeline is intentional.
scaler = StandardScaler()

In [13]:
# Chain the SMOTE oversampler and the logistic-regression classifier into a
# single estimator.
# NOTE(review): the `scaler` instance defined in an earlier cell is not one
# of the steps -- confirm that is intended.
pipeline_steps = [sm, clf]
pipe = make_pipeline(*pipeline_steps)

In [14]:
# Fit the whole pipeline on the training split. The call is the cell's last
# expression, so the fitted pipeline's repr is displayed as the cell output.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('smote', SMOTE(random_state=0)),
                ('logisticregression',
                 LogisticRegression(max_iter=1000, random_state=0,
                                    solver='newton-cg'))])

In [15]:
# Class predictions for the held-out test split; used later for the
# confusion matrix.
preds = pipe.predict(X=X_test)

In [16]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the pipeline on the training split.
scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring="accuracy",
    cv=3,
)

# Gap between the mean CV score and the score of the already-fitted pipeline
# on the held-out test split.
cv_mean, cv_std = scores.mean(), scores.std()
held_out_score = pipe.score(X_test, y_test)
diff = cv_mean - held_out_score

# The same gap expressed in units of the CV standard deviation.
# NOTE(review): if every fold scores identically, `cv_std` is 0 and this
# ratio is not finite -- the division is unguarded.
SD = diff / cv_std

In [17]:
from sklearn.metrics import confusion_matrix

# Summary report: train / cross-validation / test scores, the CV-vs-test gap
# (raw and in CV standard deviations), then the test-set confusion matrix.
report_lines = [
    f"Training Score:{pipe.score(X_train, y_train)}",
    f"Cross V Score: {scores.mean()} +/- {scores.std()}",
    f"Testing Score: {pipe.score(X_test, y_test)}",
    f"Cross & Test Diff: {diff}",
    f"Standard Deviations Away: {SD}",
]
for report_line in report_lines:
    print(report_line)

# True labels come first and predictions second, as in the original call.
print(confusion_matrix(y_test, preds))

Training Score:0.9624413145539906
Cross V Score: 0.9507042253521126 +/- 0.01149995184405251
Testing Score: 0.9440559440559441
Cross & Test Diff: 0.006648281296168568
Standard Deviations Away: 0.5781138378946251
[[50  3]
 [ 5 85]]
