# Table of Content

- [1.0 Preprocessing](1.0)

  - [1.1 Handling Missing values](#1.1)
  - [1.2 Handling Outlies](#1.2)
  - [1.3 Dublicated rows](#1.3)
  - [1.4 Dublicated columns](#1.4)
  - [1.5 unneeded columns](#1.5)

- [3.0 Feature constuction](#3.0)

- [4.0 Encoding](#3.0)
  - [4.1 OneHotEnconding](#4.1)


In [71]:
# algebra
import numpy as np

# dataframes
import pandas as pd

# EDA & visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# log
import math

# spliting and optimizing models
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

# linear model
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier,
    Lasso,
    Ridge,
    ElasticNet,
)

# Trees
from sklearn.tree import plot_tree, DecisionTreeClassifier, ExtraTreeClassifier


# Ensembles
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
)

# SVMs
from sklearn.svm import SVC

# Feature preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Model scoring
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    confusion_matrix,
    classification_report,
)
import warnings

# model pipelining
from sklearn.pipeline import Pipeline

# Selection
from sklearn.feature_selection import (
    chi2,
    VarianceThreshold,
    f_classif,
    SelectKBest,
    SelectPercentile,
    SequentialFeatureSelector,
)

pd.options.display.max_columns = None
warnings.filterwarnings("ignore")

In [72]:
bank_df_train = pd.read_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/rename_train.csv"
)
bank_df_test = pd.read_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/rename_test.csv",
)
bank_df_test.drop(columns=["Unnamed: 0"], axis="columns", inplace=True)
bank_df_train.drop(columns=["Unnamed: 0"], axis="columns", inplace=True)
combine = [bank_df_test, bank_df_train]

In [73]:
bank_df_test

Unnamed: 0,id,name,score,country,gender,age,number_of_years,balance,used_products,has_card,activeness,salary
0,15773898,Lucchese,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
1,15782418,Nott,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
2,15807120,K?,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
3,15808905,O'Donnell,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
4,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...
110018,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
110021,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


# [1.0 Preprocessing](1.0)


- [1.1 Handling Missing values](#1.1)


In [74]:
for feature in bank_df_train.columns:
    if bank_df_train[feature].isnull().sum() > 0:
        print(feature)

In [75]:
for feature in bank_df_test.columns:
    if bank_df_test[feature].isnull().sum() > 0:
        print(feature)

There is no null values


[1.2 Handling Outlies](#1.2)


In [76]:
def get_outliers(feature):
    Q1 = bank_df_train[feature].quantile(0.25)
    Q2 = bank_df_train[feature].quantile(0.75)
    iqr = Q2 - Q1
    lower_limit = Q1 - 1.5 * iqr
    upper_limit = Q2 + 1.5 * iqr
    IQR_df = bank_df_train[
        (bank_df_train[feature] < lower_limit) | (bank_df_train[feature] > upper_limit)
    ]
    return (len(IQR_df)), lower_limit, upper_limit

In [77]:
for feature in bank_df_train.columns:
    if (
        bank_df_train[feature].dtype == "O"
        or feature == "target"
        or len(bank_df_train[feature].value_counts()) < 11
    ):
        continue
    outs, low, uper = get_outliers(feature)
    if outs:
        bank_df_train = bank_df_train[
            (bank_df_train[feature] >= low) & (bank_df_train[feature] <= uper)
        ]

[1.3 Dublicated rows](#1.3)


In [78]:
Dublicated_rows_train = bank_df_train[bank_df_train.duplicated()]
Dublicated_rows_test = bank_df_test[bank_df_test.duplicated()]
len(Dublicated_rows_train), len(Dublicated_rows_test)

(0, 0)

There is no Dublicated rows


[1.4 Dublicated columns](#1.4)


In [79]:
def get_duplicate_columns(df=pd.DataFrame):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns


duplicate_columns_train = get_duplicate_columns(bank_df_train)
duplicate_columns_test = get_duplicate_columns(bank_df_test)

duplicate_columns_train, duplicate_columns_test

({}, {})

There is no Dublicated columns


[1.5 unneeded columns](#1.5)


In [80]:
bank_df_test.drop(columns=["name"], axis="columns", inplace=True)
bank_df_train.drop(columns=["name", "id"], axis="columns", inplace=True)
combine = [bank_df_test, bank_df_train]

# [3.0 Feature constuction](#3.0)


In [81]:
bank_df_train

Unnamed: 0,score,country,gender,age,number_of_years,balance,used_products,has_card,activeness,salary,target
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


The columns i will constuct

    hasCard_and_use_products = has_card == 1 and used_products > 0

    new_customer = number_of_years > 0

    products_by_year = used_products / number_of_years

    balance_salary = balance + salary

    salary_range = dividing salary to ranges

    score_range = dividing score to ranges

    age_ranges = dividing age to ranges

    number_of_years_range = dividing number_of_years to ranges


In [82]:
for dataset in combine:

    dataset["hasCard_and_use_products"] = (dataset["has_card"] == 1) & (
        dataset["used_products"] > 0
    )
    dataset["new_customer"] = dataset["number_of_years"] > 1
    dataset["products_by_year"] = dataset["used_products"] / dataset["number_of_years"]
    dataset["products_by_year"] = np.ceil(dataset["products_by_year"])
    dataset["products_by_year"] = dataset["products_by_year"].replace(np.inf, 0)
    dataset["balance_salary"] = dataset["balance"] + dataset["salary"]
    dataset["hasCard_and_use_products"] = dataset["hasCard_and_use_products"].astype(
        "int32"
    )
    dataset["new_customer"] = dataset["new_customer"].astype("int32")

In [83]:
def salary_range(salary):
    if salary < 20000:
        return 0
    if salary < 50000:
        return 1
    if salary < 100000:
        return 2
    if salary < 150000:
        return 3
    if salary < 180000:
        return 4
    return 5


def score_range(score):
    if score < 300:
        return 0
    if score < 450:
        return 1
    if score < 550:
        return 2
    if score < 650:
        return 3
    return 4


def age_ranges(age):
    if age < 35:
        return 0
    if age < 50:
        return 1
    if age < 60:
        return 2
    if age < 75:
        return 3
    return 4


def number_of_years_range(years):
    if years < 2:
        return 0
    if years < 5:
        return 1
    if years < 8:
        return 3
    return 4


for dataset in combine:
    dataset["salary_range"] = dataset["salary"].apply(salary_range)
    dataset["score_range"] = dataset["score"].apply(score_range)
    dataset["age_ranges"] = dataset["age"].apply(age_ranges)
    dataset["number_of_years_range"] = dataset["number_of_years"].apply(
        number_of_years_range
    )

In [84]:
bank_df_train

Unnamed: 0,score,country,gender,age,number_of_years,balance,used_products,has_card,activeness,salary,target,hasCard_and_use_products,new_customer,products_by_year,balance_salary,salary_range,score_range,age_ranges,number_of_years_range
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0,1,1,1.0,181449.97,5,4,0,1
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0,1,0,2.0,49503.50,1,3,0,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0,1,1,1.0,184866.69,5,4,1,4
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,1,1.0,233443.42,2,3,0,1
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0,1,1,1.0,15068.83,0,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0,1,1,1.0,131834.75,3,4,0,1
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0,0,1,1.0,131834.45,3,4,1,1
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0,1,1,1.0,127429.56,3,3,0,3
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0,0,1,1.0,232706.03,2,3,0,3


# [4.0 Encoding](#3.0)


[4.1 OneHotEnconding](#4.1)


In [85]:
bank_df_test = pd.get_dummies(bank_df_test, columns=["country", "gender"])
bank_df_train = pd.get_dummies(bank_df_train, columns=["country", "gender"])
combine = [bank_df_test, bank_df_train]


for dataset in combine:
    for feature in dataset:
        if dataset[feature].dtype == "bool":
            dataset[feature] = dataset[feature].astype("int32")

In [86]:
bank_df_train.to_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/train_processed.csv"
)
bank_df_test.to_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/test_processed.csv"
)