In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
%matplotlib inline

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, FunctionTransformer, RobustScaler, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier, BalancedBaggingClassifier, BalancedRandomForestClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier,HistGradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [3]:
# Tokens that both CSV readers should interpret as missing values.
na_values = ['nan', 'na','#VALUE!','missing']

# Read the training file first, then the test file, with identical
# missing-value handling.
train, test = (
    pd.read_csv(csv_path, na_values=na_values)
    for csv_path in ('data/TrainingData.csv', 'data/testX.csv')
)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer codes for `mvar47` from the training frame only, then apply
# that single mapping to both frames so the codes agree between them.
# NOTE(review): LabelEncoder.transform raises on labels it did not see during
# fit -- confirm the test file has no `mvar47` values absent from training.
le = LabelEncoder()
le.fit(train['mvar47'])
train['mvar47'] = le.transform(train['mvar47'])
test['mvar47'] = le.transform(test['mvar47'])

In [5]:
# Features: every training column except `application_key` (presumably a row
# identifier -- TODO confirm) and the target `default_ind`.
X = train.drop(columns=['application_key', 'default_ind'])

# Target, kept as a one-column DataFrame.
y = train[['default_ind']]

# Keep only the columns holding at least 20% non-missing values
# (author's note: "0.20 for no dropping").
X = X.dropna(axis='columns', thresh=len(X)*0.20)

In [6]:
# Columns with fewer than 30 distinct non-null values are treated as
# categorical; every other column is treated as numerical. Both lists keep
# the column order of X.
n_unique = X.nunique()
categorical_cols = n_unique.index[n_unique < 30].tolist()
numerical_cols = n_unique.index[n_unique >= 30].tolist()

In [7]:
# Logistic regression with balanced class weights, evaluated on a stratified
# 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,
    max_iter=100,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)

# Numerical transform: log1p is active. Alternatives kept for experiments:
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = FunctionTransformer(func=np.log1p)
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Resampling is disabled for this run. To enable, pick one sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
#   X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))

Accuracy: 70.06%
F1 Score: 58.03%


Logistic Regression (Unbalanced) — F1 score on the hold-out split:

16.97% (No transformation, no scaling, no oversampling)

47.84% (No transformation, standard scaling, no oversampling)

48.60% (Log transformation, no scaling, no oversampling)

48.88% (Log transformation, standard scaling, no oversampling)

52.41% / 53.62% (No transformation, no scaling, with SMOTEENN/SMOTETomek)

55.20% / 57.62% (No transformation, standard scaling, with SMOTEENN/SMOTETomek)

54.78% / 57.75% (Log transformation, no scaling, with SMOTEENN/SMOTETomek)

54.84% / 57.75% (Log transformation, standard scaling, with SMOTEENN/SMOTETomek)

Logistic Regression (Balanced) — F1 score on the hold-out split:

53.83% (With balance, no transformation, no scaling, no oversampling)

57.88% (With balance, no transformation, standard scaling, no oversampling)

58.05% (With balance, log transformation, no scaling, no oversampling)

57.97% (With balance, log transformation, standard scaling, no oversampling)

53.61% / 53.62% (With balance, no transformation, no scaling, with SMOTEENN/SMOTETomek)

56.89% / 57.62% (With balance, no transformation, standard scaling, with SMOTEENN/SMOTETomek)

57.30% / 57.75% (With balance, log transformation, no scaling, with SMOTEENN/SMOTETomek)

57.25% / 57.75% (With balance, log transformation, standard scaling, with SMOTEENN/SMOTETomek)

In [8]:
# Gaussian naive Bayes, evaluated on a stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = GaussianNB()

# Numerical transform: log1p is active. Alternatives kept for experiments:
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = FunctionTransformer(func=np.log1p)
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns, most frequent value for the
# categorical columns.
imputer = SimpleImputer(strategy='median')
X_train[numerical_cols] = imputer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = imputer.transform(X_test[numerical_cols])

imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = imputer.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = imputer.transform(X_test[categorical_cols])

# Rebalance the training split only; the hold-out split is left untouched.
# Exactly one sampler may be assigned: SMOTETomek is the active one, and
# SMOTEENN stays commented out so that it is not constructed and then
# silently overwritten.
# over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
print("Accuracy: %.2f%%" % (accuracy_score(y_test.values.ravel(), y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_test.values.ravel(), y_pred)*100))



Accuracy: 70.05%
F1 Score: 56.93%


Gaussian Naive Bayes — F1 score on the hold-out split:

52.31% (No transformation, no scaling, no oversampling)

55.78% (No transformation, standard scaling, no oversampling)

55.63% (Log transformation, no scaling, no oversampling)

55.63% (Log transformation, standard scaling, no oversampling)

55.63% / 51.45% (No transformation, no scaling, with SMOTEENN/SMOTETomek)

56.19% / 56.52% (No transformation, standard scaling, with SMOTEENN/SMOTETomek)

56.65% / 56.93% (Log transformation, no scaling, with SMOTEENN/SMOTETomek)

56.60% / 56.78% (Log transformation, standard scaling, with SMOTEENN/SMOTETomek)

In [9]:
# Linear discriminant analysis, evaluated on a stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = LinearDiscriminantAnalysis()

# Numerical transform: log1p is active. Alternatives kept for experiments:
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = FunctionTransformer(func=np.log1p)
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Rebalance the training split only; the hold-out split is left untouched.
# SMOTETomek is active. Alternative sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))



Accuracy: 69.45%
F1 Score: 57.70%


Linear Discriminant Analysis — F1 score on the hold-out split:

48.25% (No transformation, no oversampling)

48.90% (Log transformation, no oversampling)

54.51% / 57.72% (No transformation, with SMOTEENN/SMOTETomek)

54.10% / 57.70% (Log transformation, with SMOTEENN/SMOTETomek)

In [10]:
# Quadratic discriminant analysis, evaluated on a stratified 80/20 hold-out
# split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = QuadraticDiscriminantAnalysis()

# Numerical transform: log1p is active. Alternatives kept for experiments:
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = FunctionTransformer(func=np.log1p)
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Rebalance the training split only; the hold-out split is left untouched.
# SMOTEENN is active. Alternative sampler:
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))



Accuracy: 67.60%
F1 Score: 55.74%


Quadratic Discriminant Analysis — F1 score on the hold-out split:

54.71% (No transformation, no oversampling)

54.30% (Log transformation, no oversampling)

54.83% / 55.11% (No transformation, with SMOTEENN/SMOTETomek)

55.74% / 55.66% (Log transformation, with SMOTEENN/SMOTETomek)

In [11]:
# Linear SVM (LinearSVC) with balanced class weights, evaluated on a
# stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = LinearSVC(
    C=1.0,
    dual=False,
    class_weight='balanced',
    random_state=0,
)

# Numerical transform: log1p is active. Alternatives kept for experiments:
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = FunctionTransformer(func=np.log1p)
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Resampling is disabled for this run. To enable, pick one sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
#   X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))

Accuracy: 69.80%
F1 Score: 57.94%


Support Vector Machine (Unbalanced) — F1 score on the hold-out split:

19.18% (No transformation, no scaling, no oversampling)

45.72% (No transformation, standard scaling, no oversampling)

46.54% (Log transformation, no scaling, no oversampling)

46.36% (Log transformation, standard scaling, no oversampling)

52.57% / 54.02% (No transformation, no scaling, with SMOTEENN/SMOTETomek)

54.91% / 57.54% (No transformation, standard scaling, with SMOTEENN/SMOTETomek)

54.53% / 57.73% (Log transformation, no scaling, with SMOTEENN/SMOTETomek)

54.60% / 57.63% (Log transformation, standard scaling, with SMOTEENN/SMOTETomek)

Support Vector Machine (Balanced) — F1 score on the hold-out split:

54.66% (No transformation, no scaling, no oversampling)

57.73% (No transformation, standard scaling, no oversampling)

57.97% (Log transformation, no scaling, no oversampling)

58.01% (Log transformation, standard scaling, no oversampling)

53.63% / 54.53% (No transformation, no scaling, with SMOTEENN/SMOTETomek)

56.87% / 57.54% (No transformation, standard scaling, with SMOTEENN/SMOTETomek)

57.04% / 57.67% (Log transformation, no scaling, with SMOTEENN/SMOTETomek)

57.16% / 57.63% (Log transformation, standard scaling, with SMOTEENN/SMOTETomek)

In [12]:
# Ridge classifier (no intercept, balanced class weights), evaluated on a
# stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = RidgeClassifier(
    alpha=1.0,
    class_weight='balanced',
    fit_intercept=False,
    random_state=0,
)

# Numerical transform: log1p is active. Alternatives kept for experiments:
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = FunctionTransformer(func=np.log1p)
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standardise every column (categorical ones included). This runs before the
# imputation step below.
scaler = StandardScaler()
X_train[:] = scaler.fit_transform(X_train[:])
X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Resampling is disabled for this run. To enable, pick one sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
#   X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))

Accuracy: 68.11%
F1 Score: 57.50%


In [13]:
# SGD classifier (hinge loss, L1 penalty, no intercept, balanced class
# weights), evaluated on a stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = SGDClassifier(
    loss='hinge',
    penalty='l1',
    alpha=0.0001,
    fit_intercept=False,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)

# Numerical transform is disabled for this run. To enable, pick one transformer:
#   transformer = FunctionTransformer(func=np.log1p)
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
#   X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
#   X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standardise every column (categorical ones included). This runs before the
# imputation step below.
scaler = StandardScaler()
X_train[:] = scaler.fit_transform(X_train[:])
X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Resampling is disabled for this run. To enable, pick one sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
#   X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))

Accuracy: 68.73%
F1 Score: 57.33%


In [14]:
# AdaBoost (50 estimators), trained on a SMOTEENN-resampled training split and
# evaluated on a stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

# NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
# releases -- confirm against the version this notebook is pinned to.
classifier = AdaBoostClassifier(
    algorithm='SAMME.R',
    n_estimators=50,
    learning_rate=1.0,
    random_state=0,
)

# Numerical transform is disabled for this run. To enable, pick one transformer:
#   transformer = FunctionTransformer(func=np.log1p)
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
#   X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
#   X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Rebalance the training split only; the hold-out split is left untouched.
# SMOTEENN is active. Alternative sampler:
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))



Accuracy: 66.20%
F1 Score: 56.31%


In [15]:
# Bagging ensemble (125 estimators; no base estimator is passed, so the
# library default is used), trained on a SMOTEENN-resampled training split and
# evaluated on a stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = BaggingClassifier(
    n_estimators=125,
    # Each estimator is fitted on 90% of the rows and 90% of the columns,
    # both drawn with replacement.
    max_samples=0.9,
    bootstrap=True,
    max_features=0.9,
    bootstrap_features=True,
    oob_score=False,
    random_state=0,
    n_jobs=-1,
)

# Numerical transform is disabled for this run. To enable, pick one transformer:
#   transformer = FunctionTransformer(func=np.log1p)
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
#   X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
#   X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Rebalance the training split only; the hold-out split is left untouched.
# SMOTEENN is active. Alternative sampler:
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))



Accuracy: 68.50%
F1 Score: 57.14%


In [16]:
# XGBoost classifier with a positive-class weight, evaluated on a stratified
# 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = XGBClassifier(
    n_estimators=100,
    learning_rate=0.3,
    # NOTE(review): 59145/23855 is presumably the negative/positive class
    # count ratio of the full training data -- confirm against
    # y['default_ind'].value_counts().
    scale_pos_weight=59145/23855,
    random_state=0,
    n_jobs=-1,
)

# Numerical transform is disabled for this run. To enable, pick one transformer:
#   transformer = FunctionTransformer(func=np.log1p)
#   transformer = PowerTransformer()
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
#   X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
#   X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standard scaling is disabled for this run. To enable:
#   scaler = StandardScaler()
#   X_train[:] = scaler.fit_transform(X_train[:])
#   X_test[:] = scaler.transform(X_test[:])

# Imputation, fitted on the training split and re-applied to the hold-out
# split: median for the numerical columns first, then the most frequent value
# for the categorical columns.
for imputer, cols in (
    (SimpleImputer(strategy='median'), numerical_cols),
    (SimpleImputer(strategy='most_frequent'), categorical_cols),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Resampling is disabled for this run. To enable, pick one sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
#   X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Accuracy: 73.07%
F1 Score: 58.80%


In [17]:
# LightGBM classifier with a positive-class weight, evaluated on a stratified
# 80/20 hold-out split. No imputation step runs in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=0,
)

classifier = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    # NOTE(review): 59145/23855 is presumably the negative/positive class
    # count ratio of the full training data -- confirm against
    # y['default_ind'].value_counts().
    scale_pos_weight=59145/23855,
    random_state=0,
    n_jobs=-1,
)

# Numerical transform: PowerTransformer is active. Alternatives kept for
# experiments:
#   transformer = FunctionTransformer(func=np.log1p)
#   transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = PowerTransformer()
X_train[numerical_cols] = transformer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = transformer.transform(X_test[numerical_cols])

# Standardise every column (categorical ones included).
scaler = StandardScaler()
X_train[:] = scaler.fit_transform(X_train[:])
X_test[:] = scaler.transform(X_test[:])

# Imputation is disabled for this run. To enable:
#   imputer = SimpleImputer(strategy='median')
#   X_train[numerical_cols] = imputer.fit_transform(X_train[numerical_cols])
#   X_test[numerical_cols] = imputer.transform(X_test[numerical_cols])
#
#   imputer = SimpleImputer(strategy='most_frequent')
#   X_train[categorical_cols] = imputer.fit_transform(X_train[categorical_cols])
#   X_test[categorical_cols] = imputer.transform(X_test[categorical_cols])

# Resampling is disabled for this run. To enable, pick one sampler:
#   over_under_sampler = SMOTEENN(random_state=0, n_jobs=-1)
#   over_under_sampler = SMOTETomek(random_state=0, n_jobs=-1)
#   X_train, y_train = over_under_sampler.fit_resample(X_train, y_train)

classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# Score the hold-out predictions.
y_true = y_test.values.ravel()
print("Accuracy: %.2f%%" % (accuracy_score(y_true, y_pred)*100))
print("F1 Score: %.2f%%" % (f1_score(y_true, y_pred)*100))

Accuracy: 72.26%
F1 Score: 59.20%
