In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Read 'churn.csv' (path is relative to the current working directory) into
# the DataFrame `df`, which every later cell operates on.
df = pd.read_csv('churn.csv')

In [None]:
### 1. Know Your Data ###
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()
# Summary statistics of the numeric columns; as the last expression of the
# cell, the result is what the notebook renders.
df.describe()

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Boolean mask of missing cells, collapsed to one flag per row.
is_NaN = df.isna()
row_has_NaN = is_NaN.any(axis="columns")

# Keep a copy of the incomplete rows for inspection before discarding them.
rows_with_NaN = df.loc[row_has_NaN]

# Drop every row that has at least one missing value, then re-check the frame.
df = df.dropna(axis=0, how="any")
df.info()

In [None]:
# Missing-value report: per-column count and percentage of missing entries.
# NOTE(review): this runs after the rows with NaN were dropped from df, so the
# report is expected to show zeros at this point.
null_mask = df.isna()
missing_count = null_mask.sum()    # number of missing values per column
value_count = null_mask.count()    # number of rows per column
missing_percentage = (missing_count / value_count * 100).round(2)
missing_df = pd.DataFrame(
    {
        'count': missing_count,
        'percentage': missing_percentage,
    }
)
# missing_df is built but not displayed; evaluate it in a cell to inspect it.

In [None]:
# Remove the customerID column from the frame.
df = df.drop(columns=['customerID'])

In [None]:
### Create the categorical variable "tenuregroup" by bucketing `tenure`.
# Buckets and their labels:
#   tenure <  2        -> '0-1'
#   2  <= tenure < 13  -> '2-12'
#   13 <= tenure < 25  -> '13-24'
#   25 <= tenure < 49  -> '25-48'
#   tenure >= 49       -> '49_'
# The bins are contiguous half-open intervals, so every non-missing tenure
# gets a label. The previous hand-written conditions left values strictly
# between 1 and 2 without a group (None).
tenure_edges = [-np.inf, 2, 13, 25, 49, np.inf]
tenure_labels = ['0-1', '2-12', '13-24', '25-48', '49_']
df['tenuregroup'] = (
    pd.cut(df['tenure'], bins=tenure_edges, labels=tenure_labels, right=False)
    # Store plain Python strings (object dtype) rather than a Categorical so
    # the is_string_dtype checks used later still pick this column up.
    .astype(object)
)


In [None]:
# Reorganize columns: fix the column order, placing 'tenuregroup' just before
# the 'Churn' column at the end.
column_order = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
    'MonthlyCharges', 'TotalCharges', 'tenuregroup', 'Churn',
]
df = df.reindex(columns=column_order)

In [None]:
# Split the column names into numeric attributes and string (categorical)
# attributes. A column that is neither numeric nor string lands in no list.
num_list = [name for name in df if is_numeric_dtype(df[name])]
cat_list = [
    name
    for name in df
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print(num_list)
print(cat_list)

In [None]:
# Pairwise correlation of the numeric columns.
# df still holds string columns here; pandas >= 2.0 no longer skips them
# silently in DataFrame.corr() and raises instead, so select the numeric
# (and boolean) columns explicitly. This works on older pandas as well.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
### 3. Univariate Analysis ###

# One figure per column: histogram for numeric columns, bar chart of value
# counts for string columns.
for col_name in df.columns:
    series = df[col_name]
    plt.figure(col_name, figsize=(15, 4.9))
    plt.title(col_name)
    plt.yticks(size=12)
    if is_numeric_dtype(series):
        series.plot(kind='hist')
    elif is_string_dtype(series):
        # show only the TOP 10 most frequent values of each categorical column
        top_counts = series.value_counts().iloc[:10]
        top_counts.plot(kind='bar')

In [None]:
### 4. Multivariate Analysis ###

# Correlation matrix and heatmap.
# Only numeric/boolean columns are correlated: pandas >= 2.0 raises on the
# string columns instead of silently skipping them.
correlation = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlation, cmap = "GnBu", annot = True)

# Pairplot of the numeric columns.
sns.pairplot(df, height = 2.5)

# Grouped bar charts: one chart for every ordered pair of distinct categorical
# columns (x-axis = primary, hue = secondary).
# NOTE: this is len(cat_list) * (len(cat_list) - 1) figures, so each one is
# shown as soon as it is drawn rather than left to accumulate.
for primary_cat in cat_list:
    # Show only the TOP 10 most frequent levels of the x-axis column.
    top_levels = df[primary_cat].value_counts(normalize=True).iloc[:10].index
    for secondary_cat in cat_list:
        # This check must run for every secondary column. Previously it sat
        # outside the inner loop, so only the last categorical column was
        # ever used as hue.
        if secondary_cat == primary_cat:
            continue
        plt.figure(figsize = (5, 5))
        chart = sns.countplot(
            data = df,
            x = primary_cat,
            hue = secondary_cat,
            palette = 'GnBu',
            order = top_levels,
        )
        plt.show()

In [None]:
# Pairplot of the numeric columns, coloured by each categorical column in turn.
for hue_cat in cat_list:
    sns.pairplot(data=df, hue=hue_cat)

# Box plot of every numeric column against every categorical column,
# each on its own 15x15 figure.
for cat in cat_list:
    for num in num_list:
        plt.figure(figsize=(15, 15))
        sns.boxplot(data=df, x=cat, y=num, palette="GnBu")

In [None]:
# Share of each InternetService value within each Churn group, as grouped bars.
x, y, hue = "InternetService", "proportion", "Churn"
#hue_order = ["Male", "Female"]

# Normalised value counts of `x` inside every `hue` group, flattened to a
# long-form frame with the columns hue, x and y.
proportions = (
    df[x]
    .groupby(df[hue])
    .value_counts(normalize=True)
    .rename(y)
    .reset_index()
)
sns.barplot(data=proportions, x=x, y=y, hue=hue)

In [None]:
# Show the column labels currently present in df.
df.columns 

In [None]:
# For every column except Churn, plot how many rows fall into each Churn value,
# with one bar colour per level of that column.
# The y column holds raw counts (value_counts without normalize), so it is
# labelled "count" rather than "proportion".
x, y = "Churn", "count"

# Iterate over the hue columns directly. The loop bound used to come from a
# different column selection than the one being indexed, which only worked
# because both selections happened to have the same length.
for hue in df.columns.drop([x]):
    counts = (
        df[x]
        .groupby(df[hue])
        .value_counts()
        .rename(y)
        .reset_index()
    )
    sns.barplot(data=counts, x=x, y=y, hue=hue)
    plt.show()

In [None]:
# For every column except Churn, plot the share of each of its values within
# the two Churn groups (one figure per column).
y, hue = "proportion", "Churn"
for x in df.columns.drop(['Churn']):
    share_by_churn = (
        df[x]
        .groupby(df[hue])
        .value_counts(normalize=True)
        .rename(y)
        .reset_index()
    )
    sns.barplot(data=share_by_churn, x=x, y=y, hue=hue, palette=["m", "g"])
    plt.show()

In [None]:
# Pairplot of the numeric columns, once per categorical column used as hue.
for hue_cat in cat_list:
    sns.pairplot(data=df, hue=hue_cat)

In [None]:
# Pairplot of the numeric columns, coloured by the Churn column.
sns.pairplot(data=df, hue='Churn')