In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the BankChurners CSV from GitHub and keep only the columns at
# positions 1 through 20 (the first column and anything past position 20 are dropped).
df = pd.read_csv(
    "https://raw.githubusercontent.com/benvictoria17/DataVisualization/master/dataset/Credit%20Card%20customers/BankChurners.csv"
).iloc[:, 1:21]
df.head()

In [None]:
def export_png(xlabel, ylabel, image_name):
    """
    Labels the current matplotlib figure and writes it to disk.

    INPUTS:
        xlabel — text for the x axis label
        ylabel — text for the y axis label
        image_name — output file name (expected to end in .png)

    The figure is saved at 600 dpi with a tight bounding box and an opaque
    background, then a confirmation message is printed.
    """
    label_size, tick_size = 20, 16

    # Axis titles first, then tick labels, all on the current axes.
    for set_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_label(text, fontsize=label_size)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=tick_size)

    plt.savefig(image_name, dpi=600, transparent=False, bbox_inches="tight")
    print("Export successful")
def categorical_probability(column, data=None):
    """
    Calculates, for every sub-category of a categorical variable, the share of
    attrited and existing customers.

    INPUTS:
        column — name of the categorical column to break down
        data — optional DataFrame holding `column` and "attrition_flag";
               when omitted, the notebook-level `df` is used

    OUTPUT:
        DataFrame indexed by sub-category with the columns p_attrited
        (rounded to 4 decimals) and p_existing, sorted by p_attrited in
        descending order and limited to the first 15 rows

    NOTE: assumes "attrition_flag" holds exactly two values whose sorted order
    is attrited first, existing second (e.g. "Attrited Customer" <
    "Existing Customer"); any other number of flag values raises a length
    mismatch when the count columns are renamed.
    """
    frame = df if data is None else data

    # Keep the labels that crosstab produces. They are sorted, whereas
    # Series.unique() returns values in order of appearance, so re-labelling
    # the rows from unique() would attach counts to the wrong categories.
    counts = pd.crosstab(
        index=frame[column],
        columns=frame["attrition_flag"],
        margins=True,
        margins_name="col_total",
    )
    counts.columns = ["attrited", "existing", "row_total"]

    probabilities = pd.DataFrame({
        "p_attrited": (counts["attrited"] / counts["row_total"]).round(4),
        "p_existing": counts["existing"] / counts["row_total"],
    })

    # The margin row describes the whole population, not a sub-category.
    probabilities = probabilities.drop(index="col_total")
    probabilities = probabilities.sort_values(by="p_attrited", ascending=False)
    probabilities.index.name = None
    return probabilities.head(15)

In [None]:
# (rows, columns) of the trimmed frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Replace the loaded headers with 20 snake_case names, in column order.
df.columns = [
    'attrition_flag',
    'customer_age',
    'gender',
    'dependent_count',
    'education_level',
    'marital_status',
    'income_category',
    'card_category',
    'months_on_book',
    'total_relationship_count',
    'months_inactive_12_month',
    'contacts_count_12_month',
    'credit_limit',
    'total_revolving_bal',
    'avg_open_to_buy',
    'total_amt_change_q4_q1',
    'total_trans_amt',
    'total_trans_count',
    'total_count_change_q4_q1',
    'avg_utilization_ratio',
]

# Columns treated as categorical features (the target flag included).
categorical = [
    'attrition_flag',
    'gender',
    'education_level',
    'marital_status',
    'income_category',
    'card_category',
]

# Every remaining column, kept in frame order, is treated as numerical.
numerical = [name for name in df.columns if name not in categorical]

In [None]:
# 15-bin histogram of every numerical feature, arranged in a 7x2 grid.
df[numerical].hist(bins=15, figsize=(15, 20), layout=(7,2))

In [None]:
# Collect and print the numerical features whose skewness lies outside [-1, 1].
#
# Skewness is computed once, over the numerical columns only:
#   * calling df.skew() on the whole frame raises on non-numeric columns in
#     recent pandas releases (numeric_only no longer defaults to True);
#   * pairing each value with its own label avoids looking the name up by
#     value equality, which returns the wrong column when two features share
#     the same skewness.
skewed = []
for column_name, skew_val in df[numerical].skew().items():
    if skew_val > 1 or skew_val < -1:
        skewed.append(column_name)
        print(column_name)
        print(round(skew_val, 3))

In [None]:
# Kernel density estimate of each numerical feature on a 4x4 subplot grid.
# sns.set updates the global rc parameters, so the 30x30 default figure size
# also applies to later figures that do not pass their own figsize.
# NOTE(review): newer seaborn releases deprecate `shade=` in favour of
# `fill=` — confirm the installed version before switching.
sns.set(rc={'figure.figsize': (30, 30)})
for i, feature in enumerate(numerical):
    plt.subplot(4, 4, i + 1)
    sns.kdeplot(df[feature], shade=True)

In [None]:
# Annotated heatmap of the pairwise correlations between numerical features.
# The matrix is built from df[numerical] rather than the whole frame because
# DataFrame.corr() raises on non-numeric columns in recent pandas releases.
fig, ax = plt.subplots(figsize=(20, 22))
res = sns.heatmap(
    df[numerical].corr(),
    cmap='coolwarm',
    annot=True,
    ax=ax,
    annot_kws={"size": 15.5},
    cbar=False,
)
res.set_xticklabels(res.get_xmajorticklabels(), fontsize=18, rotation=90)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize=18)

In [None]:
# Keep only correlations above 0.2, masking entries exactly equal to 1
# (intended to hide each feature's correlation with itself); everything else
# becomes NaN. The matrix is computed once, on the numerical columns only,
# since DataFrame.corr() raises on non-numeric columns in recent pandas.
corr_matrix = df[numerical].corr()
results = corr_matrix[(corr_matrix > 0.2) & (corr_matrix != 1)]

results

In [None]:
# Scatter plot of avg_open_to_buy against credit_limit.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='credit_limit', y='avg_open_to_buy', ax=ax)

In [None]:
# Scatter plot of total_trans_count against total_trans_amt.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='total_trans_amt', y='total_trans_count', ax=ax)

In [None]:
# Scatter plot of months_on_book against customer_age.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='customer_age', y='months_on_book', ax=ax)

In [None]:
# Scatter plot of avg_utilization_ratio against total_revolving_bal.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='total_revolving_bal', y='avg_utilization_ratio', ax=ax)

In [None]:
# Scatter plot of avg_utilization_ratio against avg_open_to_buy.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='avg_open_to_buy', y='avg_utilization_ratio', ax=ax)

In [None]:
# Scatter plot of avg_utilization_ratio against credit_limit.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='credit_limit', y='avg_utilization_ratio', ax=ax)

In [None]:
# Scatter plot of total_count_change_q4_q1 against total_amt_change_q4_q1.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='total_amt_change_q4_q1', y='total_count_change_q4_q1', ax=ax)

In [None]:
# For each categorical feature, print its name, its number of distinct
# values and the frequency of each value, followed by a blank line.
for cat in categorical:
    values = df[cat]
    print('Variable:', cat)
    print('Unique values:', values.nunique())
    print(values.value_counts(), end='\n\n')

In [None]:
# Count plot of each categorical feature on a 2x3 grid of axes, with the
# x tick labels rotated to vertical.
fig, ax = plt.subplots(2, 3, figsize=(20, 15))
for variable, subplot in zip(categorical, ax.flat):
    sns.countplot(x=df[variable], ax=subplot)
    plt.setp(subplot.get_xticklabels(), rotation=90)

In [None]:
# Box plot of every numerical feature split by attrition_flag, on a 4x4 grid.
# The loop covers the whole `numerical` list; stopping at len(numerical) - 1
# silently left out the last feature.
plt.figure(figsize=(28, 25))
for i, feature in enumerate(numerical):
    plt.subplot(4, 4, i + 1)
    sns.boxplot(x='attrition_flag', y=feature, data=df)
    plt.xlabel('attrition_flag', fontsize=18)
    plt.ylabel(feature, fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

plt.show()

In [None]:
# Box plot of months_inactive_12_month for each attrition_flag value.
x = df['attrition_flag']
y = df['months_inactive_12_month']

fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=x, y=y, ax=ax)