In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the customer spending CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/customer-spending-dataset/customer_data.csv")

In [None]:
# Preview the first rows to see which columns are available.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

<h1 style='text-align:center;font-size:50px;'>Checking data for NaN values</h1>

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Pie chart with one slice per distinct value of the 'gender' column.
px.pie(df, names='gender',title='Genders ratio')

In [None]:
# Pie chart with one slice per distinct value of the 'education' column.
px.pie(df, names='education', title='Education ratios')

In [None]:
def plots(df, x, y, title):
    """Draw a three-panel summary of column ``x`` split by column ``y``.

    Panels, left to right:
      1. histogram of ``x`` with a KDE overlay, one hue per value of ``y``;
      2. bar chart of the mean of ``x`` for each value of ``y``, with value labels;
      3. box plot of ``x`` for each value of ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    x : str
        Name of the column whose values are plotted.
    y : str
        Name of the column used to group the data.
    title : str
        Figure-level title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
    hist_ax, mean_ax, box_ax = axes

    # Compute the per-group mean once; it supplies both the bar positions and heights.
    group_means = df.groupby(y)[x].mean()
    # Reuse the groupby ordering everywhere so all three panels list groups identically.
    group_order = list(group_means.index)

    sns.histplot(data=df, x=x, hue=y, hue_order=group_order, kde=True, ax=hist_ax)

    sns.barplot(x=group_means.index, y=group_means, ax=mean_ax)
    for container in mean_ax.containers:
        mean_ax.bar_label(container, size=12, color='black')

    sns.boxplot(x=df[y], y=df[x], order=group_order, ax=box_ax)

    hist_ax.set_title("{} distributed by {}".format(x, y), size=15)
    mean_ax.set_title("Mean {} by {} levels".format(x, y), size=15)
    box_ax.set_title("Boxplots of {} grouped by {}".format(x, y), size=15)

    # Act on this figure explicitly rather than on pyplot's "current figure".
    fig.suptitle(title, size=25)
    fig.tight_layout()
    plt.show()

In [None]:
# One three-panel figure for every (numeric column, grouping column) pair.
titles = ['Income data for each ', 'How much spendings for each ']
for title_prefix, numeric_column in zip(titles, ('income', 'spending')):
    for group_column in ('gender', 'education'):
        figure_title = f"{title_prefix}{group_column} group"
        plots(df, numeric_column, group_column, figure_title)

In [None]:
def modified_pairplot(df, _vars, hues, figsize=(15, 7)):
    """Pairplot-style grid whose two triangles use two different hue columns.

    The grid has ``len(_vars)`` rows and ``len(_vars) + 1`` columns. For the
    panel in row ``i`` and column ``j``:

    * ``j == i``     -> KDE of ``_vars[i]`` with hue ``hues[0]``
    * ``j == i + 1`` -> KDE of ``_vars[i]`` with hue ``hues[1]``
    * ``j <  i``     -> scatter of ``_vars[j]`` (x) vs ``_vars[i]`` (y), hue ``hues[0]``
    * ``j >  i + 1`` -> scatter of ``_vars[j - 1]`` (x) vs ``_vars[i]`` (y), hue ``hues[1]``

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column named in ``_vars`` and ``hues``.
    _vars : sequence of str
        Columns to plot against each other.
    hues : sequence of str
        Two column names: ``hues[0]`` for the lower-left half, ``hues[1]`` for
        the upper-right half. Only the first two entries are used.
    figsize : tuple of float, optional
        Figure size passed to ``plt.subplots``. Defaults to ``(15, 7)``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    IndexError
        If ``hues`` has fewer than two entries.
    """
    # Read both hues up front so a short `hues` fails before any figure is created.
    lower_hue, upper_hue = hues[0], hues[1]

    n_vars = len(_vars)
    n_cols = n_vars + 1
    # squeeze=False keeps `axes` two-dimensional even when there is a single row,
    # so axes[row][col] also works for a one-variable call.
    fig, axes = plt.subplots(nrows=n_vars, ncols=n_cols, figsize=figsize, squeeze=False)

    for row in range(n_vars):
        for col in range(n_cols):
            ax = axes[row][col]
            # `data=` is passed by keyword: older seaborn releases do not accept
            # the DataFrame as the first positional argument of these functions.
            if col == row:
                sns.kdeplot(data=df, x=_vars[row], hue=lower_hue, fill=True, ax=ax)
            elif col == row + 1:
                sns.kdeplot(data=df, x=_vars[row], hue=upper_hue, fill=True, ax=ax)
            elif col > row + 1:
                # Columns right of the doubled diagonal are shifted by one.
                sns.scatterplot(data=df, x=_vars[col - 1], y=_vars[row], hue=upper_hue, ax=ax)
            else:
                sns.scatterplot(data=df, x=_vars[col], y=_vars[row], hue=lower_hue, ax=ax)

    fig.tight_layout()
    plt.show()

<h1 style='text-align:center;font-size:50px;'>Modified pairplot</h1>
<div style='text-align:center;
            font-size:180%;'>
    <p style='display:inline-block;'>I took the basic pairplot idea with the parameter corner=False.
        <br>
        The difference is that each corner holds the scatterplots and KDE plots for a different categorical column.
        <br>
        In this case, the data in the bottom-left corner is grouped by education.
        <br>
        In the top-right corner, the data is labelled by gender.
        <br>
        The motivation was to save time and space by portraying the data distribution for two categorical variables in one plot.
    </p>
</div>

In [None]:
# hues[0] ('education') colours the lower-left panels, hues[1] ('gender') the upper-right ones.
modified_pairplot(df, ['age', 'income', 'spending'], ['education', 'gender'])

<h1></h1>

<h1 style='text-align:center;font-size:50px;'>Pairplot and pie chart with classes combined</h1>

In [None]:
# Work on a copy of the data that carries one combined "<gender> <education>" label,
# then colour a standard pairplot by that label.
temp_df = df.assign(
    gender_education=df['gender'] + ' ' + df['education'],
)
sns.pairplot(
    temp_df,
    vars=['age', 'income', 'spending'],
    hue='gender_education',
)

In [None]:
# Pie chart with one slice per combined gender/education label; titled like the other pies.
px.pie(temp_df, names='gender_education', title='Gender and education ratios')

In [None]:
def top_by(df, x, top):
    """Plot the countries with the highest and lowest mean of column ``x``.

    The data is grouped by the ``'country'`` column, the mean of ``x`` is taken
    per country, and two bar charts are drawn side by side: the ``top`` highest
    means on the left and the ``top`` lowest means on the right. Both panels
    list countries from highest to lowest mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'country'`` column and the column ``x``.
    x : str
        Name of the column to average.
    top : int
        Number of countries shown in each panel.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Mean of `x` per country, highest first.
    country_means = df.groupby('country')[x].mean().sort_values(ascending=False)

    # head/tail rather than [:top] / [-top:]: a `[-0:]` slice would select every row.
    highest = country_means.head(top)
    lowest = country_means.tail(top)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
    panels = (
        (axes[0], highest, "{} Countries with highest average {} rates"),
        (axes[1], lowest, "{} Countries with lowest average {} rates"),
    )
    for ax, means, title_template in panels:
        sns.barplot(x=means.index, y=means, ax=ax)
        # Label each axis's own bars; pairing the two axes' containers with zip
        # would silently drop labels if the container counts differed.
        for container in ax.containers:
            ax.bar_label(container, size=15, color='black')
        ax.set_title(title_template.format(top, x), size=20)
        # Rotate the existing tick labels without re-setting their text.
        ax.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    plt.show()

<h1 style='font-size:50px;
           text-align:center;'>
    Top countries with highest and lowest income and spending rates</h1>

In [None]:
# Ten highest and ten lowest countries, first by income and then by spending.
for metric in ('income', 'spending'):
    top_by(df, metric, 10)