# EXPLORATORY DATA ANALYSIS

## Import

In [2]:
import io
import pandas as pd
from google.colab import files

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

In [3]:
# Upload a local file through google.colab; the result is indexed by file name
# in the next cell to obtain the raw bytes of the CSV.
uploaded = files.upload()

Saving Data Train.csv to Data Train.csv


In [4]:
# Parse the uploaded 'Data Train.csv' bytes into the DataFrame used for exploration.
df = pd.read_csv(io.BytesIO(uploaded['Data Train.csv']))

## Preprocessing

In [5]:
# Encode the yes/no columns as 1/0 so they can take part in numeric analysis.
#
# The frame is rebuilt and reassigned instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place call operates on
# a temporary Series, which does not update `df` under pandas Copy-on-Write
# and emits warnings on pandas 2.x.
# `replace` leaves values that are already 1/0 untouched, so re-running this
# cell is harmless. The explicit cast guarantees an integer dtype regardless of
# whether `replace` downcasts; it raises if a column holds anything other than
# 'yes'/'no'/1/0.
binary_columns = ['international_plan', 'voice_mail_plan', 'churn']
yes_no_codes = {'yes': 1, 'no': 0}
df = (
    df
    .replace({column: yes_no_codes for column in binary_columns})
    .astype({column: 'int64' for column in binary_columns})
)

## Dataset Information

In [None]:
# Print the column names, dtypes and non-null counts of the exploration frame.
df.info()

In [None]:
# Preview the first 10 rows.
df.head(10)

## Correlation using Heatmap

In [None]:
# Correlate only the numeric columns: `df` still contains text columns
# (e.g. `state`, `area_code`), and `DataFrame.corr()` raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.
correlation_matrix = df.select_dtypes(include='number').corr()

# Explicit figure/axes so the heatmap is drawn on a known axis and carries a title.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=False, ax=ax)
ax.set_title('Correlation between numeric features')
plt.show()

# MACHINE LEARNING MODEL

## Import

In [10]:
# Importing relevant libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# For loading Dataset
import io
from google.colab import files

# For Normalization 
from sklearn.preprocessing import MinMaxScaler

# For Splitting Dataset
from sklearn.model_selection import train_test_split

## Load Dataset

In [None]:
# Upload the training CSV for the modelling section; rebinding `uploaded`
# replaces the result of the earlier upload.
uploaded = files.upload()

Saving Data Train.csv to Data Train.csv


In [27]:
# Parse the uploaded 'Data Train.csv' bytes into `train_data`, the frame that
# the preprocessing cells below transform step by step.
train_data = pd.read_csv(io.BytesIO(uploaded['Data Train.csv']))

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4250 non-null   object 
 1   account_length                 4250 non-null   int64  
 2   area_code                      4250 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4250 non-null   object 
 5   number_vmail_messages          4250 non-null   int64  
 6   total_day_minutes              4250 non-null   float64
 7   total_day_calls                4250 non-null   int64  
 8   total_day_charge               4250 non-null   float64
 9   total_eve_minutes              4250 non-null   float64
 10  total_eve_calls                4250 non-null   int64  
 11  total_eve_charge               4250 non-null   float64
 12  total_night_minutes            4250 non-null   f

In [None]:
# Preview the first 10 training rows before any preprocessing.
train_data.head(10)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no
5,MO,147,area_code_415,yes,no,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,no
6,LA,117,area_code_408,no,no,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,no
7,WV,141,area_code_415,yes,yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,no
8,IN,65,area_code_415,no,no,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,yes
9,RI,74,area_code_415,no,no,0,187.7,127,31.91,163.4,148,13.89,196.0,94,8.82,9.1,5,2.46,0,no


## Preprocessing

### Data Cleansing

#### Drop unwanted columns

In [28]:
# Remove `state`, `area_code` and the four `*_minutes` columns, then preview
# what is left. The frame is rebound to the result rather than modified in place.
unwanted_columns = [
    'state', 'area_code',
    'total_day_minutes', 'total_eve_minutes',
    'total_night_minutes', 'total_intl_minutes',
]
train_data = train_data.drop(columns=unwanted_columns)

train_data.head(10)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_calls,total_day_charge,total_eve_calls,total_eve_charge,total_night_calls,total_night_charge,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,107,no,yes,26,123,27.47,103,16.62,103,11.45,3,3.7,1,no
1,137,no,no,0,114,41.38,110,10.3,104,7.32,5,3.29,0,no
2,84,yes,no,0,71,50.9,88,5.26,89,8.86,7,1.78,2,no
3,75,yes,no,0,113,28.34,122,12.61,121,8.41,3,2.73,3,no
4,121,no,yes,24,88,37.09,108,29.62,118,9.57,7,2.03,3,no
5,147,yes,no,0,79,26.69,94,8.76,96,9.53,6,1.92,0,no
6,117,no,no,0,97,31.37,80,29.89,90,9.71,4,2.35,1,no
7,141,yes,yes,37,84,43.96,111,18.87,97,14.69,5,3.02,0,no
8,65,no,no,0,137,21.95,83,19.42,111,9.4,6,3.43,4,yes
9,74,no,no,0,127,31.91,148,13.89,94,8.82,5,2.46,0,no


#### Check for missing values

In [29]:
# Report the number of missing entries in each column.
print("Null data count :", train_data.isnull().sum(), sep="\n")

Null data count :
account_length                   0
international_plan               0
voice_mail_plan                  0
number_vmail_messages            0
total_day_calls                  0
total_day_charge                 0
total_eve_calls                  0
total_eve_charge                 0
total_night_calls                0
total_night_charge               0
total_intl_calls                 0
total_intl_charge                0
number_customer_service_calls    0
churn                            0
dtype: int64


In [30]:
# `isna` is an alias of `isnull`, so this prints the same per-column
# missing-value counts under the NaN label.
print("NaN (Not a Number) data count :", train_data.isna().sum(), sep="\n")

NaN (Not a Number) data count :
account_length                   0
international_plan               0
voice_mail_plan                  0
number_vmail_messages            0
total_day_calls                  0
total_day_charge                 0
total_eve_calls                  0
total_eve_charge                 0
total_night_calls                0
total_night_charge               0
total_intl_calls                 0
total_intl_charge                0
number_customer_service_calls    0
churn                            0
dtype: int64


#### Check for duplicated values

In [None]:
# Flag rows that are exact repeats of an earlier row, then report how many there are.
duplicate_mask = train_data.duplicated()
print(duplicate_mask, "\n")
print("Duplicated values : ", duplicate_mask.sum())

0       False
1       False
2       False
3       False
4       False
        ...  
4245    False
4246    False
4247    False
4248    False
4249    False
Length: 4250, dtype: bool 

Duplicated values :  0


### Detect and Remove Outliers

In [13]:
def _iqr_inlier_mask(frame, columns, whisker=1.5):
    """Return a boolean Series that is True for rows with no outlier in `columns`.

    A value is treated as an inlier when it lies inside
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` of its own column, where
    Q1 and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect.
    columns : list of str
        Numeric columns to test.
    whisker : float, default 1.5
        Multiple of the IQR allowed beyond the quartiles.

    Returns
    -------
    pandas.Series
        Boolean mask indexed like `frame`.
    """
    values = frame[columns]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    low_limit = q1 - (iqr * whisker)
    high_limit = q3 + (iqr * whisker)
    # The limits are Series keyed by column name; axis='columns' lines each
    # limit up with its own column.
    within_limits = (
        values.ge(low_limit, axis='columns') & values.le(high_limit, axis='columns')
    )
    return within_limits.all(axis=1)


# Message text: "row count before filtering outliers".
print(f'Jumlah baris sebelum memfilter outlier: {len(train_data)}')

# Numeric columns checked for outliers (the yes/no columns are left out).
colummNames = ['account_length',
       'number_vmail_messages', 'total_day_calls',
       'total_day_charge', 'total_eve_calls',
       'total_eve_charge', 'total_night_calls',
       'total_night_charge', 'total_intl_calls',
       'total_intl_charge', 'number_customer_service_calls']
filtered_entries = _iqr_inlier_mask(train_data, colummNames)

# `.copy()` makes the filtered frame independent of the original, so later
# column assignments do not raise SettingWithCopyWarning.
train_data = train_data[filtered_entries].copy()

# Message text: "row count after filtering outliers".
print(f'Jumlah baris setelah memfilter outlier: {len(train_data)}')

Jumlah baris sebelum memfilter outlier: 4250
Jumlah baris setelah memfilter outlier: 3515


### Feature Encoding

In [14]:
# Encode the yes/no columns (two plan flags and the churn target) as 1/0.
#
# The frame is rebuilt and reassigned instead of calling
# `train_data[col].replace(..., inplace=True)`: that chained in-place call
# operates on a temporary Series, which does not update `train_data` under
# pandas Copy-on-Write and raises SettingWithCopyWarning on a filtered frame.
# `replace` leaves values that are already 1/0 untouched, so re-running this
# cell is harmless. The explicit cast guarantees an integer dtype; it raises
# if a column holds anything other than 'yes'/'no'/1/0.
binary_columns = ['international_plan', 'voice_mail_plan', 'churn']
yes_no_codes = {'yes': 1, 'no': 0}
train_data = (
    train_data
    .replace({column: yes_no_codes for column in binary_columns})
    .astype({column: 'int64' for column in binary_columns})
)

train_data.head(10)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_calls,total_day_charge,total_eve_calls,total_eve_charge,total_night_calls,total_night_charge,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,107,0,1,26,123,27.47,103,16.62,103,11.45,3,3.7,1,0
1,137,0,0,0,114,41.38,110,10.3,104,7.32,5,3.29,0,0
3,75,1,0,0,113,28.34,122,12.61,121,8.41,3,2.73,3,0
5,147,1,0,0,79,26.69,94,8.76,96,9.53,6,1.92,0,0
7,141,1,1,37,84,43.96,111,18.87,97,14.69,5,3.02,0,0
9,74,0,0,0,127,31.91,148,13.89,94,8.82,5,2.46,0,0
10,168,0,0,0,96,21.9,71,8.92,128,6.35,2,3.02,1,0
11,95,0,0,0,88,26.62,75,21.05,115,8.65,5,3.32,3,0
13,85,0,1,27,139,33.39,90,23.88,75,4.02,4,3.73,1,0
14,76,0,1,33,66,32.25,65,18.09,108,7.46,5,2.7,1,0


### Normalization

In [15]:
# Scale every predictor to the [0, 1] range.
#
# The scaler is fitted on the predictor columns only and kept in
# `feature_scaler`, so exactly the same min/max transform can be applied to
# other data later via `feature_scaler.transform(...)`; previously the fitted
# scaler was thrown away as soon as the training data had been transformed.
# `churn` is already encoded as 0/1, so it is appended unscaled as the last
# column. `train_data` therefore remains a float array with the 13 predictors
# in columns 0-12 and the target in column 13.
# NOTE(review): the scaler is still fitted before the train/test split below,
# so the held-out rows influence the min/max — confirm whether that is acceptable.
feature_columns = [column for column in train_data.columns if column != 'churn']
feature_scaler = MinMaxScaler().fit(train_data[feature_columns])
train_data = np.column_stack([
    feature_scaler.transform(train_data[feature_columns]),
    train_data['churn'].to_numpy(dtype=float),
])

train_data[:10]

array([[0.52216749, 0.        , 1.        , 0.65      , 0.72115385,
        0.43680485, 0.52336449, 0.48457193, 0.53271028, 0.70234114,
        0.22222222, 0.74933333, 0.33333333, 0.        ],
       [0.66995074, 0.        , 0.        , 0.        , 0.63461538,
        0.71809909, 0.58878505, 0.20990874, 0.54205607, 0.35702341,
        0.44444444, 0.64      , 0.        , 0.        ],
       [0.36453202, 1.        , 0.        , 0.        , 0.625     ,
        0.45439838, 0.70093458, 0.31029987, 0.70093458, 0.44816054,
        0.22222222, 0.49066667, 1.        , 0.        ],
       [0.71921182, 1.        , 0.        , 0.        , 0.29807692,
        0.42103134, 0.43925234, 0.14298131, 0.46728972, 0.54180602,
        0.55555556, 0.27466667, 0.        , 0.        ],
       [0.68965517, 1.        , 1.        , 0.925     , 0.34615385,
        0.770273  , 0.59813084, 0.5823555 , 0.47663551, 0.97324415,
        0.44444444, 0.568     , 0.        , 0.        ],
       [0.35960591, 0.        , 0. 

## Model

### Make Predictor and Target Variable (X and y)

In [16]:
# Target variable: column 13 of the scaled array holds the churn label.
y = train_data[:, 13]

# Predictor variables: a copy of the array with the churn column (index 13) removed.
X = np.delete(train_data, obj=13, axis=1)

### Split Dataset into Train and Test

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=64,
)

### Model LightGBM

In [18]:
import lightgbm as lgb

# LightGBM classifier — the first of the two final models that were chosen.
# `fit` returns the estimator itself, so construction and training are chained.
model_lgbm = lgb.LGBMClassifier(verbose=0).fit(X_train, y_train)
model_lgbm

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


### Model Support Vector Machines (SVM)

In [19]:
from sklearn import svm

# Support Vector Machine classifier — the second of the two final models that were chosen.
# `fit` returns the estimator itself, so construction and training are chained.
model_svm = svm.SVC().fit(X_train, y_train)
model_svm

### Model Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest classifier — tried for comparison only, not one of the final models.
# `fit` returns the estimator itself, so construction and training are chained.
model_rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
model_rf

### Model Gaussian Naive Bayes

In [21]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier — tried for comparison only, not one of the final models.
# `fit` returns the estimator itself, so construction and training are chained.
model_gnb = GaussianNB().fit(X_train, y_train)
model_gnb

### Model Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree classifier — tried for comparison only, not one of the final models.
# `fit` returns the estimator itself, so construction and training are chained.
model_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
model_dt

### Predict X_test using all the Models

In [23]:
# The two final models that were chosen
y_test_predicted_lgbm, y_test_predicted_svm = (
    estimator.predict(X_test) for estimator in (model_lgbm, model_svm)
)

# Models tried for comparison only
y_test_predicted_rf, y_test_predicted_gnb, y_test_predicted_dt = (
    estimator.predict(X_test) for estimator in (model_rf, model_gnb, model_dt)
)

### Check Model Accuracy Using the Prediction Results (y_test_predicted)

In [24]:
from sklearn import metrics


def _evaluate(label, y_predicted):
    """Score one model's held-out predictions against `y_test`.

    Prints `label` followed by the accuracy as a percentage rounded to two
    decimals, and returns ``(accuracy, confusion_matrix)``.
    """
    accuracy = metrics.accuracy_score(y_test, y_predicted)
    matrix = metrics.confusion_matrix(y_test, y_predicted)
    print(label, round(accuracy*100, 2), '%')
    return accuracy, matrix


# LightGBM (first final model)
accuracy_lgbm, confusion_matrix_lgbm = _evaluate(
    'Accuracy LGBM :', y_test_predicted_lgbm)

# Support Vector Machines (second final model)
accuracy_svm, confusion_matrix_svm = _evaluate(
    'Accuracy Support Vector Machines : ', y_test_predicted_svm)

# Random Forest (tried for comparison only)
accuracy_rf, confusion_matrix_rf = _evaluate(
    'Accuracy Random Forest : ', y_test_predicted_rf)

# Gaussian Naive Bayes (tried for comparison only)
accuracy_gnb, confusion_matrix_gnb = _evaluate(
    'Accuracy Gaussian Naive Bayes : ', y_test_predicted_gnb)

# Decision Tree (tried for comparison only)
accuracy_dt, confusion_matrix_dt = _evaluate(
    'Accuracy Decision Tree : ', y_test_predicted_dt)

Accuracy LGBM : 97.3 %
Accuracy Support Vector Machines :  94.17 %
Accuracy Random Forest :  90.75 %
Accuracy Gaussian Naive Bayes :  90.61 %
Accuracy Decision Tree :  92.46 %


## Model Testing

### Loading Dataset

In [None]:
# Upload the test CSV; it is stored under its own name so `uploaded` is left untouched.
uploaded_test = files.upload()

Saving Data Test.csv to Data Test.csv


In [None]:
# Parse the uploaded 'Data Test.csv' bytes into the frame to be scored.
test_data = pd.read_csv(io.BytesIO(uploaded_test['Data Test.csv']))

### Preprocessing

In [None]:
# Remove the same six columns that were dropped from the training data, plus
# the test-only `id` column, in a single non-inplace drop.
test_data = test_data.drop(columns=[
    'state', 'area_code',
    'total_day_minutes', 'total_eve_minutes',
    'total_night_minutes', 'total_intl_minutes',
    'id',
])

In [None]:
# Encode the two yes/no plan columns as 1/0, matching the training encoding.
#
# The frame is rebuilt and reassigned instead of calling
# `test_data[col].replace(..., inplace=True)`: that chained in-place call
# operates on a temporary Series, which does not update `test_data` under
# pandas Copy-on-Write and emits warnings on pandas 2.x.
# `replace` leaves values that are already 1/0 untouched, so re-running this
# cell is harmless. The explicit cast guarantees an integer dtype; it raises
# if a column holds anything other than 'yes'/'no'/1/0.
binary_columns = ['international_plan', 'voice_mail_plan']
yes_no_codes = {'yes': 1, 'no': 0}
test_data = (
    test_data
    .replace({column: yes_no_codes for column in binary_columns})
    .astype({column: 'int64' for column in binary_columns})
)

test_data.head(10)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,118,1,0,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0
2,62,0,0,0,120.7,70,20.52,307.2,76,26.11,203.0,99,9.14,13.1,6,3.54,4
3,93,0,0,0,190.7,114,32.42,218.2,111,18.55,129.6,121,5.83,8.1,3,2.19,3
4,174,0,0,0,124.3,76,21.13,277.1,112,23.55,250.7,115,11.28,15.5,5,4.19,3
5,142,0,0,0,84.8,95,14.42,136.7,63,11.62,250.5,148,11.27,14.2,6,3.83,2
6,172,0,0,0,212.0,121,36.04,31.2,115,2.65,293.3,78,13.2,12.6,10,3.4,3
7,72,0,1,37,220.0,80,37.4,217.3,102,18.47,152.8,71,6.88,14.7,6,3.97,3
8,149,0,0,0,140.4,94,23.87,271.8,92,23.1,188.3,108,8.47,11.1,9,3.0,1
9,119,0,0,0,159.1,114,27.05,231.3,117,19.66,143.2,91,6.44,8.8,3,2.38,5


In [None]:
# Scale the test columns to [0, 1] and rebind `test_data` to the resulting array.
# NOTE(review): this fits a brand-new MinMaxScaler on the test set, so the
# min/max used here come from the test rows rather than from the training data
# the models were fitted on. Presumably the training-time scaler should be
# reused instead — confirm, and fix together with the training normalization cell.
# NOTE(review): the stored output below shows 17 values per row, whereas the
# training matrix has 13 predictors; verify that the column-drop cell above
# was run before this one.
test_data = MinMaxScaler().fit_transform(test_data)

test_data[:10]

array([[0.53586498, 0.        , 1.        , 0.49019608, 0.74667455,
        0.57258065, 0.74665276, 0.49984962, 0.48818898, 0.5       ,
        0.61837907, 0.50641026, 0.61843972, 0.52910053, 0.15789474,
        0.52941176, 0.16666667],
       [0.49367089, 1.        , 0.        , 0.        , 0.62341117,
        0.47580645, 0.62336985, 0.56962406, 0.50393701, 0.56970984,
        0.488194  , 0.67948718, 0.48865248, 0.33333333, 0.31578947,
        0.33333333, 0.        ],
       [0.25738397, 0.        , 0.        , 0.        , 0.31983447,
        0.25      , 0.31977047, 0.83007519, 0.30708661, 0.83014862,
        0.48532227, 0.55769231, 0.4858156 , 0.69312169, 0.31578947,
        0.69411765, 0.66666667],
       [0.38818565, 0.        , 0.        , 0.        , 0.5267514 ,
        0.60483871, 0.52669101, 0.56240602, 0.58267717, 0.5626327 ,
        0.25111678, 0.69871795, 0.25106383, 0.42857143, 0.15789474,
        0.42941176, 0.5       ],
       [0.72995781, 0.        , 0.        , 0.      

### Using LightGBM Model

In [None]:
# Predict churn labels for the preprocessed test set with the LightGBM model.
y_final_predicted = model_lgbm.predict(test_data)

In [None]:
# Show the first 15 predictions as "<row number> Churn : Yes/No" (label 1 = churn).
# The loop variable is deliberately not called `y`: that name holds the
# training target vector, and reusing it here would overwrite it and break
# any re-run of the train/test split.
for row_number, predicted_label in enumerate(y_final_predicted[:15], start=1):
    print(row_number, "Churn : Yes" if predicted_label == 1 else "Churn : No")

1 Churn : No
2 Churn : No
3 Churn : No
4 Churn : No
5 Churn : No
6 Churn : No
7 Churn : No
8 Churn : No
9 Churn : No
10 Churn : No
11 Churn : No
12 Churn : No
13 Churn : No
14 Churn : No
15 Churn : No


### Using Support Vector Machines (SVM)

In [None]:
# Predict churn labels with the SVM model; this overwrites the LightGBM
# predictions previously held in `y_final_predicted`.
y_final_predicted = model_svm.predict(test_data)

In [None]:
# Show the first 15 predictions as "<row number> Churn : Yes/No" (label 1 = churn).
# The loop variable is deliberately not called `y`: that name holds the
# training target vector, and reusing it here would overwrite it and break
# any re-run of the train/test split.
for row_number, predicted_label in enumerate(y_final_predicted[:15], start=1):
    print(row_number, "Churn : Yes" if predicted_label == 1 else "Churn : No")

1 Churn : No
2 Churn : No
3 Churn : No
4 Churn : No
5 Churn : No
6 Churn : No
7 Churn : No
8 Churn : No
9 Churn : No
10 Churn : No
11 Churn : No
12 Churn : No
13 Churn : No
14 Churn : No
15 Churn : No
