# US Churn &mdash; EDA

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so a hard-coded "seaborn-darkgrid"
# fails on current installs. Use whichever darkgrid variant is available and
# fall back to the default style if neither is present.
_STYLE_CANDIDATES = ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
plt.style.use(next((s for s in _STYLE_CANDIDATES if s in plt.style.available), "default"))

# Do not truncate columns when displaying wide DataFrames.
pd.set_option('display.max_columns', None)

import sys, os, yaml

# Name of the dataset; also used to build the Google Drive folder name.
DATASET = "US_Churn"

# True when the notebook is running inside Google Colab.
COLAB = 'google.colab' in sys.modules

# Base directory for project files: the Drive folder on Colab, the current
# working directory otherwise. Note that both variants end with "/".
ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/" if COLAB else "./"

# Global switches shared by the rest of the notebook.
DEBUG = False
SEED = 1612

In [2]:
# On Colab the data lives on Google Drive; mount it once per session.
if COLAB:
    from google.colab import drive

    gdrive_mount_point = "/content/gdrive"
    # Skip mounting when the mount point already exists.
    if not os.path.isdir(gdrive_mount_point):
        drive.mount(gdrive_mount_point)

Mounted at /content/gdrive


## Datasets

In [3]:
# ROOT already ends with "/", so interpolating it into f"{ROOT}/data/..."
# yields a doubled separator; os.path.join builds the path cleanly.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
df = pd.read_pickle(os.path.join(ROOT, "data", "churn.pkl"))
print(df.shape)
df.head()

(3333, 20)


Unnamed: 0,State,Account_Length,Area_Code,Intl_Plan,VMail_Plan,VMail_Message,Day_Mins,Day_Calls,Day_Charge,Eve_Mins,Eve_Calls,Eve_Charge,Night_Mins,Night_Calls,Night_Charge,Intl_Mins,Intl_Calls,Intl_Charge,CustServ_Calls,Churn
0,KS,128,0,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,0,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,0,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,2,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,0,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [4]:
# Print a summary of the frame: each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   State           3333 non-null   category
 1   Account_Length  3333 non-null   int64   
 2   Area_Code       3333 non-null   category
 3   Intl_Plan       3333 non-null   category
 4   VMail_Plan      3333 non-null   category
 5   VMail_Message   3333 non-null   int64   
 6   Day_Mins        3333 non-null   float64 
 7   Day_Calls       3333 non-null   int64   
 8   Day_Charge      3333 non-null   float64 
 9   Eve_Mins        3333 non-null   float64 
 10  Eve_Calls       3333 non-null   int64   
 11  Eve_Charge      3333 non-null   float64 
 12  Night_Mins      3333 non-null   float64 
 13  Night_Calls     3333 non-null   int64   
 14  Night_Charge    3333 non-null   float64 
 15  Intl_Mins       3333 non-null   float64 
 16  Intl_Calls      3333 non-null   int64   
 17  Intl_Charge   

## Target and Features

In [5]:
# Name of the label column.
target = "Churn"

# Split the remaining columns by dtype. The target is excluded with `!=`:
# because `target` is a str, `c not in target` would be a *substring* test
# and would silently drop any column whose name is contained in "Churn".
cat_features = [c for c in df.select_dtypes("category").columns if c != target]
num_features = [c for c in df.select_dtypes(["int","float"]).columns if c != target]

print(f"Traget: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")

Traget: Churn
Categorical Features: ['State', 'Area_Code', 'Intl_Plan', 'VMail_Plan']
Numerical Features: ['Account_Length', 'VMail_Message', 'Day_Mins', 'Day_Calls', 'Day_Charge', 'Eve_Mins', 'Eve_Calls', 'Eve_Charge', 'Night_Mins', 'Night_Calls', 'Night_Charge', 'Intl_Mins', 'Intl_Calls', 'Intl_Charge', 'CustServ_Calls']


### Target

In [6]:
# Relative frequency of each target class (normalize=True gives fractions, not counts).
df[target].value_counts(normalize=True)

0    0.855086
1    0.144914
Name: Churn, dtype: float64

**Comments**

 * Binary classification problem
 * Unbalanced - a dumb classifier (always predicting 0) will have an accuracy of 85.5%