In [1]:
import dtale
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Helper functions for visualizing the dataset

In [2]:
# Box plot of a single column, annotated with its mean, min, max and IQR
def plot_boxplot(df, column_name):
    """Draw a box plot of one column and annotate it with summary statistics.

    The mean, minimum, maximum and interquartile range of ``df[column_name]``
    are computed with pandas and attached to the plot as annotations, after
    which the figure is displayed with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; the column is read as ``df[column_name]``.
    column_name : str
        Column to plot. It is also used for the x-axis label, the title and
        the annotation texts.

    Returns
    -------
    None
    """
    values = df[column_name]

    ax = sns.boxplot(x=values)
    ax.set_xlabel(column_name)
    ax.set_title(f"Boxplot of {column_name}")

    # Summary statistics shown in the annotations.
    mean_value = values.mean()
    min_value = values.min()
    max_value = values.max()
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Keyword arguments shared by every annotation. Anchors are given in data
    # coordinates; seaborn draws the single box at position 0 on the
    # categorical y-axis, whose limits are roughly -0.5 .. 0.5.
    # annotation_clip=False stops matplotlib from silently dropping an
    # annotation whose anchor lies on (or beyond) the axes boundary, e.g. the
    # IQR anchor at y=0.5.
    label_style = dict(xycoords='data', textcoords='offset points', fontsize=12,
                       color='black', ha='center', va='center',
                       annotation_clip=False)

    # (text, anchor in data coordinates, text offset in points, arrow style)
    annotations = [
        # The mean is anchored on the box's centre line (y=0). The previous
        # anchor (y=0.95) was outside the axes, so the label was never drawn.
        (f"Mean {column_name}: {mean_value:.1f}", (mean_value, 0), (50, 20), '->'),
        (f"Min {column_name}: {min_value}", (min_value, 0.05), (50, -20), '->'),
        (f"Max {column_name}: {max_value}", (max_value, 0.05), (-50, -20), '->'),
        (f"IQR: {q1:.1f} - {q3:.1f} = {iqr:.1f}", ((q1 + q3) / 2, 0.5), (0, 20), '-'),
    ]
    for text, anchor, offset, arrow in annotations:
        ax.annotate(text, xy=anchor, xytext=offset,
                    arrowprops=dict(arrowstyle=arrow, lw=1.5, color='black'),
                    **label_style)
    plt.show()

In [3]:
# Histogram of a single column with the count shown above each bin
def plot_distribution(df, column_name, bin_count=20):
    """Plot a histogram of one column with the count written above each bin.

    Missing values are dropped before binning: ``np.histogram_bin_edges``
    raises ``ValueError`` when the data contains NaN, because the
    auto-detected range is then not finite.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; the column is read as ``df[column_name]``.
    column_name : str
        Column to plot. It is also used for the x-axis label and the title.
    bin_count : int, optional
        Passed as ``bins`` to ``np.histogram_bin_edges`` (default 20).

    Returns
    -------
    None
    """
    values = df[column_name].dropna()

    plt.xlabel(column_name)
    plt.ylabel("Customers count")
    plt.title(f"{column_name} distribution", fontsize=15)

    # Compute the edges once so the drawn bars and the labels use the same bins.
    bin_edges = np.histogram_bin_edges(values, bins=bin_count)
    bin_counts, _, _ = plt.hist(values, bins=bin_edges)

    # Label each bar at its horizontal centre, just above its top.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    for center, count in zip(bin_centers, bin_counts):
        plt.annotate(f"{count:.0f}", xy=(center, count),
                     xytext=(0, 5), textcoords='offset points', ha='center', va='bottom')
    plt.show()

In [None]:
# Box plots of a column, one box per value of the target
def plot_target_column_distribution(df, target, column_name):
    """Show box plots of ``column_name`` split by the values of ``target``.

    ``target`` is used both as the x-axis grouping and as the hue, and the
    plot is titled ``"<column_name> vs <target>"`` before being displayed
    with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed to seaborn as ``data``.
    target : str
        Name of the grouping column (x-axis and hue).
    column_name : str
        Name of the column whose distribution is drawn (y-axis).

    Returns
    -------
    None
    """
    ax = sns.boxplot(x=target, y=column_name, hue=target, data=df)
    ax.set_title(f"{column_name} vs {target}", fontsize=15)
    plt.show()