# Table of Content

- [1.0 Preprocessing](1.0)

  - [1.1 Handling Missing values](#1.1)
  - [1.2 Handling Outlies](#1.2)
  - [1.3 Dublicated rows](#1.3)
  - [1.4 Dublicated columns](#1.4)
  - [1.5 unneeded columns](#1.5)

- [3.0 Feature constuction](#3.0)

- [4.0 Encoding](#3.0)
  - [4.2 OneHotEnconding](#4.2)


In [200]:
# algebra
import numpy as np

# dataframes
import pandas as pd

# EDA & visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# log
import math

# spliting and optimizing models
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

# linear model
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier,
    Lasso,
    Ridge,
    ElasticNet,
)

# Trees
from sklearn.tree import plot_tree, DecisionTreeClassifier, ExtraTreeClassifier


# Ensembles
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
)

# SVMs
from sklearn.svm import SVC

# Feature preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Model scoring
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    confusion_matrix,
    classification_report,
)
import warnings

# model pipelining
from sklearn.pipeline import Pipeline

# Selection
from sklearn.feature_selection import (
    chi2,
    VarianceThreshold,
    f_classif,
    SelectKBest,
    SelectPercentile,
    SequentialFeatureSelector,
)

pd.options.display.max_columns = None
warnings.filterwarnings("ignore")

In [201]:
bank_df_train = pd.read_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/rename_train.csv"
)
bank_df_test = pd.read_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/rename_test.csv",
)
bank_df_test.drop(columns=["Unnamed: 0"], axis="columns", inplace=True)
bank_df_train.drop(columns=["Unnamed: 0"], axis="columns", inplace=True)
combine = [bank_df_test, bank_df_train]

In [202]:
bank_df_train

Unnamed: 0,id,name,score,country,gender,age,number_of_years,balance,used_products,has_card,activeness,salary,target
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


# [1.0 Preprocessing](1.0)


- [1.1 Handling Missing values](#1.1)


In [203]:
for feature in bank_df_train.columns:
    if bank_df_train[feature].isnull().sum() > 0:
        print(feature)

In [204]:
for feature in bank_df_test.columns:
    if bank_df_test[feature].isnull().sum() > 0:
        print(feature)

There is no null values


- [1.3 Dublicated rows](#1.3)


In [205]:
Dublicated_rows_train = bank_df_train[bank_df_train.duplicated()]
Dublicated_rows_test = bank_df_test[bank_df_test.duplicated()]
len(Dublicated_rows_train), len(Dublicated_rows_test)

(0, 0)

There is no Dublicated rows


- [1.4 Dublicated columns](#1.4)


In [206]:
def get_duplicate_columns(df=pd.DataFrame):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns


duplicate_columns_train = get_duplicate_columns(bank_df_train)
duplicate_columns_test = get_duplicate_columns(bank_df_test)

duplicate_columns_train, duplicate_columns_test

({}, {})

There is no Dublicated columns


- [1.5 unneeded columns](#1.5)


In [207]:
for dataset in combine:
    dataset.drop(columns=["id", "name"], axis="columns", inplace=True)

# [3.0 Feature constuction](#3.0)


In [208]:
bank_df_train

Unnamed: 0,score,country,gender,age,number_of_years,balance,used_products,has_card,activeness,salary,target
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


The columns i will constuct

    hasCard_and_use_products = has_card == 1 and used_products > 0

    new_customer = number_of_years > 0

    products_by_year = used_products / number_of_years

    balance_salary = balance + salary

    salary_range = dividing salary to ranges

    score_range = dividing score to ranges

    age_ranges = dividing age to ranges

    number_of_years_range = dividing number_of_years to ranges


In [209]:
for dataset in combine:

    dataset["hasCard_and_use_products"] = (dataset["has_card"] == 1) & (
        dataset["used_products"] > 0
    )
    dataset["new_customer"] = dataset["number_of_years"] > 1
    dataset["products_by_year"] = dataset["used_products"] / dataset["number_of_years"]
    dataset["products_by_year"] = np.ceil(dataset["products_by_year"])
    dataset["balance_salary"] = dataset["balance"] + dataset["salary"]
    dataset["hasCard_and_use_products"] = dataset["hasCard_and_use_products"].astype(
        "int32"
    )
    dataset["new_customer"] = dataset["new_customer"].astype("int32")

In [210]:
def salary_range(salary):
    if salary < 20000:
        return 0
    if salary < 50000:
        return 1
    if salary < 100000:
        return 2
    if salary < 150000:
        return 3
    if salary < 180000:
        return 4
    return 5


def score_range(score):
    if score < 300:
        return 0
    if score < 450:
        return 1
    if score < 550:
        return 2
    if score < 650:
        return 3
    return 4


def age_ranges(age):
    if age < 35:
        return 0
    if age < 50:
        return 1
    if age < 60:
        return 2
    if age < 75:
        return 3
    return 4


def number_of_years_range(years):
    if years < 2:
        return 0
    if years < 5:
        return 1
    if years < 8:
        return 3
    return 4


for dataset in combine:
    dataset["salary_range"] = dataset["salary"].apply(salary_range)
    dataset["score_range"] = dataset["score"].apply(score_range)
    dataset["age_ranges"] = dataset["age"].apply(age_ranges)
    dataset["number_of_years_range"] = dataset["number_of_years"].apply(
        number_of_years_range
    )

In [211]:
bank_df_train

Unnamed: 0,score,country,gender,age,number_of_years,balance,used_products,has_card,activeness,salary,target,hasCard_and_use_products,new_customer,products_by_year,balance_salary,salary_range,score_range,age_ranges,number_of_years_range
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0,1,1,1.0,181449.97,5,4,0,1
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0,1,0,2.0,49503.50,1,3,0,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0,1,1,1.0,184866.69,5,4,1,4
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,1,1.0,233443.42,2,3,0,1
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0,1,1,1.0,15068.83,0,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0,1,1,1.0,131834.75,3,4,0,1
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0,0,1,1.0,131834.45,3,4,1,1
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0,1,1,1.0,127429.56,3,3,0,3
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0,0,1,1.0,232706.03,2,3,0,3


# [4.0 Encoding](#3.0)


- [4.2 OneHotEnconding](#4.2)


In [212]:
bank_df_test = pd.get_dummies(bank_df_test, columns=["country", "gender"])
bank_df_train = pd.get_dummies(bank_df_train, columns=["country", "gender"])

In [214]:
bank_df_train.to_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/train_processed.csv"
)
bank_df_test.to_csv(
    r"/home/ahmed/Ai/Kaggle-Competitions-Notebooks/Binary Classification with a Bank Churn Dataset/Preprocesse datasets/test_processed.csv"
)