# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import seaborn as sns
import plotly.express as px
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

# Import Dataset

In [2]:
# Training set: keep only the third '_'-separated token of `area_code`.
# assign() on an existing column keeps that column's position in the frame.
df = pd.read_csv('../data/customer_churn_train.csv').assign(
    area_code=lambda frame: frame['area_code'].str.split('_').str[2]
)

# Second dataset, loaded as-is (no area_code transformation is applied here).
df_predict = pd.read_csv('../data/customer_churn_test.csv')

In [3]:
# Partition the training frame by dtype: int64/float64 columns on one side,
# object-dtype columns on the other.
numeric_dtypes = ['int64', 'float64']
df_numerical = df.select_dtypes(include=numeric_dtypes)
df_category = df.select_dtypes(include=['object'])

# Data Cleansing

## Remove Outliers

In [4]:
print(f'Jumlah baris sebelum memfilter outlier: {len(df)}')

# IQR rule: a row is kept only if, for every numeric column, its value lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of that column. Comparisons against NaN
# evaluate to False, so rows with a missing numeric value are dropped as well.
numerical_cols = df_numerical.columns.tolist()

# Per-column quartiles computed in one pass (Series indexed by column name).
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)

# The limit Series align on column names; .all(axis=1) requires every numeric
# column of a row to be inside its limits.
within_limits = (df[numerical_cols] >= low_limit) & (df[numerical_cols] <= high_limit)
filtered_entries = within_limits.all(axis=1)

# .copy() makes the result an independent frame, so later column assignments
# on `df` do not raise SettingWithCopyWarning.
df = df[filtered_entries].copy()

print(f'Number of rows after filtering outliers: {len(df)}')

Jumlah baris sebelum memfilter outlier: 4250
Number of rows after filtering outliers: 3515


# Data Transformation

## Feature Encoding

In [5]:
print(f'Number of rows and columns before feature encoding process: {df.shape}')

# Binary "no"/"yes" columns are encoded as 0/1.
yes_no_to_int = {"no": 0, "yes": 1}
binary_columns = ['international_plan', 'voice_mail_plan', 'churn']
encoded_binary = {
    cat: df[cat].astype(str).map(yes_no_to_int) for cat in binary_columns
}

# .map() turns every value missing from the mapping into NaN; fail loudly
# instead of silently writing NaN into the flag/label columns.
unmapped_columns = [cat for cat, values in encoded_binary.items() if values.isna().any()]
if unmapped_columns:
    raise ValueError(
        f'Columns contain values other than "no"/"yes": {unmapped_columns}'
    )

# assign() returns a new frame, so no column is written onto a filtered
# selection of another frame.
df = df.assign(**encoded_binary)

# One-hot encode the multi-valued categoricals. With `columns=`, get_dummies
# prefixes each indicator with its source column name, removes the source
# columns, and appends the indicators after the remaining columns.
df = pd.get_dummies(df, columns=['state', 'area_code'])

print(f'Number of rows and columns after feature encodign process: {df.shape}')

Number of rows and columns before feature encoding process: (3515, 20)
Number of rows and columns after feature encodign process: (3515, 72)


## Normalization

In [6]:
# MinMaxScaler is imported in the notebook's import cell, so it is not
# re-imported here.
numerical_cols = df_numerical.columns.tolist()

# MinMaxScaler scales each feature (column) independently, so a single fit
# over all numeric columns yields the same values as fitting one scaler per
# column. The fitted `scaler` is kept so the identical transformation can be
# applied to other data later with scaler.transform(...).
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Export the cleaned data

In [7]:
# Write the preprocessed training frame to CSV, omitting the index column.
df.to_csv(
    path_or_buf='../data/customer_churn_train_preprocessed.csv',
    index=False,
)