In [19]:
import numpy as np
import pandas as pd
import pickle
import pydotplus
import seaborn as sns
import sys
import time
import warnings
import zipfile

from collections import (Counter,)
from imblearn.under_sampling \
    import (RandomUnderSampler,
            ClusterCentroids,
            TomekLinks,)
from imblearn.over_sampling \
    import (RandomOverSampler,
            SMOTE,)
from io import (StringIO,)
from IPython.display import (Image,)
from matplotlib import (pyplot as plt,)
from mlxtend.classifier import (StackingClassifier,)
from scipy import (stats,)
from scipy.cluster import (hierarchy as sch,)
from sklearn.cluster \
    import (KMeans,
            AgglomerativeClustering,)
from sklearn.datasets import (make_moons,)
from sklearn.decomposition import (PCA,) 
from sklearn.ensemble \
    import (AdaBoostClassifier,
            BaggingClassifier,
            ExtraTreesClassifier,
            GradientBoostingClassifier,
            RandomForestClassifier,
            VotingClassifier,)
from sklearn.feature_selection \
    import (chi2,
            f_regression,
            f_classif,
            RFE,
            SelectFromModel,
            SelectKBest,
            SelectPercentile,)
from sklearn.linear_model \
    import (Lasso,
            LogisticRegression,
            LinearRegression,
            Ridge,)
from sklearn.metrics \
    import (accuracy_score,
            auc,
            classification_report,
            confusion_matrix,
            f1_score,
            precision_score,
            precision_recall_fscore_support as error_metric,
            r2_score,
            recall_score,
            roc_auc_score,
            roc_curve,)
from sklearn.model_selection \
    import (GridSearchCV,
            RandomizedSearchCV,
            train_test_split,)
from sklearn.multiclass import (OneVsRestClassifier,)
from sklearn.pipeline import (make_pipeline,)
from sklearn.preprocessing \
    import (LabelEncoder,
            MinMaxScaler,
            StandardScaler,)
from sklearn.svm \
    import (LinearSVC,
            LinearSVC,
            SVC,)
from sklearn.tree \
    import (DecisionTreeClassifier,
            DecisionTreeRegressor,)
from xgboost import (XGBClassifier,)

warnings.filterwarnings("ignore")
np.set_printoptions(threshold=sys.maxsize)

In [20]:
def get_df_name(df):
    """Return the name of a module-level variable that is bound to ``df``.

    The module's ``globals()`` are scanned in insertion order and compared by
    identity (``is``), so only a variable referring to this exact object
    matches; an equal-valued copy does not.

    Names that start with an underscore are only used as a last resort. In an
    IPython/Jupyter kernel the output-history aliases (``_``, ``__``, ``_12``,
    ...) can point at the same object as the real variable, and returning one
    of those would produce a meaningless name.

    Caveat: when several public names refer to the same object (for example a
    loop variable that currently holds the frame), the name that was created
    first wins. Prefer passing names explicitly where that matters.

    Parameters
    ----------
    df : object
        The object (typically a DataFrame or Series) to look up.

    Returns
    -------
    str
        The first public global name bound to ``df``; if there is none, the
        first underscore-prefixed name bound to it.

    Raises
    ------
    IndexError
        If no global variable refers to ``df``. The exception type matches
        what the previous ``[...][0]`` implementation raised.
    """
    private_match = None
    # Snapshot the namespace so the scan works on a stable list of bindings.
    for candidate, value in list(globals().items()):
        if value is not df:
            continue
        if not candidate.startswith("_"):
            return candidate
        if private_match is None:
            private_match = candidate

    if private_match is not None:
        return private_match
    raise IndexError("get_df_name: no global variable is bound to the given object")

In [21]:
# Load the cleaned features and target. The first CSV column is dropped by
# position (presumably the index column written by an earlier `to_csv` --
# TODO confirm against the cleaning notebook).
X = pd.read_csv("X_cleaned.csv").iloc[:, 1:]
y = pd.read_csv("y_cleaned.csv").iloc[:, 1:]

# Preview the first record of each frame, transposed so that every column
# appears on its own line.
print(X.head(1).transpose())
print(y.head(1).transpose())

                          0
Agency                 6.00
Agency Type            1.00
Product Name          16.00
Duration               7.00
Destination           56.00
Net Sales              0.00
Commision (in value)  17.82
Age                   31.00
       0
Claim  0


## Undersampling

In [22]:
# Random undersampling of the majority class, applied to the training split only.

# Stratified 70/30 split so both splits keep the original Claim proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4, stratify=y,
)

# Re-attach the target to the training features and shuffle the rows.
shuffled_data = pd.concat([X_train, y_train], axis=1).sample(frac=1, random_state=4)

# Keep every positive row and draw an equally sized random sample of negatives.
# Assumes Claim == 1 is the minority class -- TODO confirm.
positive_class = shuffled_data[shuffled_data['Claim'] == 1]
negative_class = shuffled_data[shuffled_data['Claim'] == 0].sample(
    n=len(positive_class), random_state=4,
)

# Shuffle again after concatenation; otherwise the frame is all negatives
# followed by all positives, which biases any order-sensitive consumer
# (unshuffled CV folds, mini-batch / SGD-style training).
undersampled_df = pd.concat([negative_class, positive_class]).sample(frac=1, random_state=4)

X_train = undersampled_df.drop('Claim', axis=1)
y_train = undersampled_df['Claim'].copy()

# Persist the splits under explicit names. Recovering the name from the kernel
# namespace is fragile here because the loop variable is itself a global bound
# to the same object.
undersampled_splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in undersampled_splits.items():
    split_data.to_csv(f'{split_name}_undersampled.csv')

## Oversampling

In [23]:
# Random oversampling of the minority class, applied to the training split only.

# Stratify on the target so this split matches the one used for undersampling:
# same random_state and same stratification give the same test rows, and the
# rare Claim class keeps its proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4, stratify=y,
)

# 'not majority' resamples every class except the majority one.
oversampler = RandomOverSampler(sampling_strategy='not majority', random_state=4)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Persist the splits under explicit names. Recovering the name from the kernel
# namespace is fragile here because the loop variable is itself a global bound
# to the same object.
oversampled_splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in oversampled_splits.items():
    split_data.to_csv(f'{split_name}_oversampled.csv')

## SMOTE (Synthetic Minority Over-sampling Technique)

In [24]:
# SMOTE oversampling of the minority class, applied to the training split only.

# Stratify on the target so this split matches the one used for undersampling:
# same random_state and same stratification give the same test rows, and the
# rare Claim class keeps its proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4, stratify=y,
)

# NOTE(review): columns such as Agency, Product Name and Destination look like
# label-encoded categories in the preview output. Plain SMOTE interpolates
# between neighbours and can create fractional codes for them; SMOTENC may be
# the better fit -- confirm.
smote = SMOTE(sampling_strategy='not majority', random_state=4)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Persist the splits under explicit names. Recovering the name from the kernel
# namespace is fragile here because the loop variable is itself a global bound
# to the same object.
smote_splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in smote_splits.items():
    split_data.to_csv(f'{split_name}_smote.csv')