# Chapter 03 - Exploratory Data Analysis (EDA)

12/20/2018 - Jeff Smith

Based on material from Larose and Larose, 2015

In [None]:
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.__version__, pd.__version__

## Using the Churn dataset

In [None]:
# Load the churn data and report its dimensions (rows = records, columns = variables).
churn = pd.read_csv("../data/churn.txt")
n_records, n_variables = churn.shape
size_message = "The churn dataset has {:,d} records with {:,d} variables each."
print(size_message.format(n_records, n_variables))
# Preview the first few rows.
churn.head()

In [None]:
# Overview of the numeric values: count, mean, std, min, quartiles and max per column.
# Note that the AreaCode column is not useful here (presumably because it is a
# code stored as a number rather than a measured quantity -- verify against the data).
churn.describe()

In [None]:
# Have a look at the target variable: bar chart of the number of records per Churn value.
# Title and axis labels are set so the figure stands on its own, like the other
# figures in this notebook. The trailing semicolon suppresses the text repr.
churn['Churn'].value_counts().plot(kind='bar').set(
    xlabel='Churn', ylabel='Frequency', title='Churn');

In [None]:
# Same information as the bar chart above, as a table: record count per Churn value.
churn.groupby(by='Churn')[['Churn']].agg(['count'])

In [None]:
# Categorical variables: one bar chart of value counts per plan flag, side by side.
fig, ax = plt.subplots(1, 2, figsize=(10,5))
plan_panels = [('IntPlan', "Int. Plan"), ('VMailPlan', "VMail Plan")]
for panel, (column, heading) in zip(ax, plan_panels):
    churn[column].value_counts().plot(kind='bar', ax=panel)
    panel.set(title=heading)

In [None]:
# Contingency table (initial version): raw counts of Churn (rows) by IntPlan (columns).
pd.crosstab(index=churn['Churn'], columns=churn['IntPlan'])

In [None]:
# Proportions instead of counts: use apply with a lambda to divide every row
# by its own total, so each row sums to 1.
plan_counts = pd.crosstab(churn.Churn, churn.IntPlan)
plan_counts.apply(lambda row: row / row.sum(), axis=1)

In [None]:
# For later versions of Pandas: crosstab can compute the row proportions itself
# via normalize="index", with no apply/lambda step.
pd.crosstab(index=churn['Churn'], columns=churn['IntPlan'], normalize="index")

In [None]:
# Look at day mins and night mins: one histogram per column, side by side.
fig, ax = plt.subplots(1, 2, figsize=(10,5))
minute_panels = [('DayMins', 'Day Minutes'), ('NightMins', 'Night Minutes')]
for panel, (column, label) in zip(ax, minute_panels):
    panel.hist(churn[column])
    # The same text is used for the x-axis label and the panel title.
    panel.set(xlabel=label, ylabel='Frequency', title=label)


In [None]:
# Look at International Calls and Calls to Customer Service, side by side.
fig, ax = plt.subplots(1, 2, figsize=(10,5))
call_panels = [
    ('IntlCalls', 'Intl Calls', 'International Calls'),
    ('CustServCalls', 'Cust Serv Calls', 'Customer Service Calls'),
]
for panel, (column, x_label, heading) in zip(ax, call_panels):
    panel.hist(churn[column])
    panel.set(xlabel=x_label, ylabel='Frequency', title=heading)
# Need to normalize if we're going to create an "average" metric.

In [None]:
# Correlation between Day/Evening Calls? Scatter EveCalls (x) against DayCalls (y).
ax = plt.axes()
# Draw through the Axes object created above rather than the implicit current axes.
ax.scatter(churn['EveCalls'], churn['DayCalls'])
ax.set(
    xlabel='Evening Calls',
    ylabel='Day Calls',
    title='Day Calls vs. Evening Calls',
);

In [None]:
# Default pair plot over the whole dataset.
# Approach from https://towardsdatascience.com/visualizing-data-with-pair-plots-in-python-f228cf529166
# This takes a while to generate, but is very useful.
sns.pairplot(data=churn);

In [None]:
# Pair plot on a subset of the columns: day and night minutes and calls.
day_night_columns = ['DayMins', 'DayCalls', 'NightMins', 'NightCalls']
sns.pairplot(data=churn[day_night_columns]);

In [None]:
# Pair plot of account length, customer service calls and day calls.
account_columns = ['AccountLength', 'CustServCalls', 'DayCalls']
sns.pairplot(data=churn[account_columns]);

In [None]:
# Correlation among the predictors: pair plot of the three day-usage columns.
day_columns = ['DayMins', 'DayCalls', 'DayCharge']
sns.pairplot(data=churn[day_columns]);

In [None]:
# Pandas native version of the pair plot (using Matplotlib internally).
# The semicolon at the end suppresses the printing of the returned object's text.
day_predictors = churn[['DayMins', 'DayCalls', 'DayCharge']]
pd.plotting.scatter_matrix(day_predictors, figsize=(12, 8));