<h3>Marketing Data Streamlit App Functions</h3>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import platform

In [4]:
def get_data(path:str)->pd.DataFrame:
    """
    Read the CSV file at `path` (e.g. 'superstore_data.csv') into a dataframe.

    Parameters:
    path (str): Location of the CSV file to load.

    Returns:
    pd.DataFrame: The file contents as parsed by pandas with default options.
    """
    return pd.read_csv(path)

In [5]:
# Load the superstore CSV (path is relative to the notebook's working directory)
# and preview the first rows.
data = get_data('superstore_data.csv')
data.head()

Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Complain
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,...,111,189,218,1,4,4,6,1,1,0
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,...,7,0,37,1,7,3,7,5,1,0
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,...,15,2,30,1,3,2,5,2,0,0
3,1386,1967,Graduation,Together,32474.0,1,1,11/05/2014,0,10,...,0,0,0,1,1,0,2,7,0,0
4,5371,1989,Graduation,Single,21474.0,1,0,08/04/2014,0,6,...,11,0,34,2,3,1,2,7,1,0


In [23]:
def include_ts_bs(data: pd.DataFrame) -> pd.DataFrame:
    """
    Adds total monetary spent and total number of purchases columns to the dataframe (ts).

    Also adds the brackets/bins columns (bs) for the income and birthyear columns.

    Birthyear bins are as follows: '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999'

    Income bins are as follows: '0-9999', '10000-19999', '20000-29999', '30000-39999', '40000-49999', '50000-59999', '60000-69999', '70000-79999', '80000-89999', '90000-99999', '100000+'

    Every bin is closed on the left and open on the right, so e.g. an income of
    9999 (or 9999.5) falls in '0-9999' and an income of 10000 falls in
    '10000-19999'. Values outside the bin ranges (negative or missing income,
    birth years before 1940 or from 2000 onwards) get a missing (NaN) bin.

    Note: the new columns are written onto the dataframe that is passed in; the
    returned object is that same dataframe.

    Parameters:
    data (pd.DataFrame): The input dataframe containing the 'Mnt*' spend columns, the 'Num*Purchases' columns,
                         'Income' and 'Year_Birth'.

    Returns:
    pd.DataFrame: The dataframe with additional columns: 'TotalMntSpent', 'TotalNumPurchases', 'Birthyear_Bin', and 'Income_Bin'.
    """

    # Totals columns:
    data['TotalMntSpent'] = (
        data['MntWines']
        + data['MntFruits']
        + data['MntMeatProducts']
        + data['MntFishProducts']
        + data['MntSweetProducts']
        + data['MntGoldProds']
    )
    data['TotalNumPurchases'] = (
        data['NumDealsPurchases']
        + data['NumWebPurchases']
        + data['NumCatalogPurchases']
        + data['NumStorePurchases']
    )

    # Bin columns:
    # With right=False each interval is [lower, upper), so the edges must sit on
    # the first value of the *next* bracket (10000, 20000, ...) for the labels
    # '0-9999', '10000-19999', ... to be accurate.
    income_bins = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, np.inf]
    income_labels = ['0-9999', '10000-19999', '20000-29999', '30000-39999', '40000-49999', '50000-59999', '60000-69999', '70000-79999', '80000-89999', '90000-99999', '100000+']
    data['Income_Bin'] = pd.cut(data['Income'], bins=income_bins, labels=income_labels, right=False)

    birthyear_bins = [1940, 1950, 1960, 1970, 1980, 1990, 2000]
    birthyear_labels = ['1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999']
    data['Birthyear_Bin'] = pd.cut(data['Year_Birth'], bins=birthyear_bins, labels=birthyear_labels, right=False)

    return data

In [24]:
# Add the totals and bin columns, rebinding `data` to the returned frame,
# then preview the result.
data = include_ts_bs(data)
data.head()

Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Complain,TotalMntSpent,TotalNumPurchases,Income_Bin,Birthyear_Bin
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,...,4,4,6,1,1,0,1190,15,80000-89999,1970-1979
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,...,7,3,7,5,1,0,577,18,50000-59999,1960-1969
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,...,3,2,5,2,0,0,251,11,60000-69999,1950-1959
3,1386,1967,Graduation,Together,32474.0,1,1,11/05/2014,0,10,...,1,0,2,7,0,0,11,4,30000-39999,1960-1969
4,5371,1989,Graduation,Single,21474.0,1,0,08/04/2014,0,6,...,3,1,2,7,1,0,91,8,20000-29999,1980-1989


In [19]:
def filter_level(data:pd.DataFrame,metric:str)->dict:
    """
    Splits the dataframe by the unique levels of a given metric (e.g. the different education levels)
    and returns a dictionary holding one filtered dataframe per level.

    Levels appear in the dictionary in order of first appearance in the column.
    If the column contains missing values, the missing level (NaN/None) is kept as a key
    and maps to the rows whose value is missing.

    Parameters:
    data (pd.DataFrame): The input dataframe, typically the result of get_data() followed by include_ts_bs().

    metric (str): The column name in the dataframe to filter by. Can be 'Education','Marital_Status','Birthyear_Bin' or 'Income_Bin'.

    Returns:
    dict: A dictionary where the keys are the unique levels and the values are dataframes filtered by each unique level.
    """
    column = data[metric]

    data_by_metric = {}
    for level in column.unique():
        if pd.isna(level):
            # A missing value never compares equal to anything (including itself),
            # so an equality test would select no rows; match on missingness instead.
            mask = column.isna()
        else:
            mask = column == level
        data_by_metric[level] = data[mask]

    return data_by_metric

In [20]:
# Split the customers by education level and preview the 'Graduation' subset.
# NOTE(review): the saved preview below has no Income_Bin/Birthyear_Bin columns and this
# cell's execution count (20) is lower than the binning cell's (23) — the output looks
# stale; confirm with Restart Kernel -> Run All.
data_by_education = filter_level(data,'Education')
data_by_education['Graduation'].head()

Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Complain,TotalMntSpent,TotalNumPurchases
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,...,218,1,4,4,6,1,1,0,1190,15
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,...,37,1,7,3,7,5,1,0,577,18
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,...,30,1,3,2,5,2,0,0,251,11
3,1386,1967,Graduation,Together,32474.0,1,1,11/05/2014,0,10,...,0,1,1,0,2,7,0,0,11,4
4,5371,1989,Graduation,Single,21474.0,1,0,08/04/2014,0,6,...,34,2,3,1,2,7,1,0,91,8


In [17]:
def calculate_metrics(data:dict)->dict:
    """
    Calculates various metrics for each unique level in the provided data.

    Can be used for education level, marital status, income bracket and birthyear (EMIB).

    Parameters:
    data (dict): A dictionary where the keys are different levels and the values are dataframes filtered by those unique levels.
                 Each dataframe must contain the 'TotalMntSpent', 'TotalNumPurchases' and 'Response' columns.

    Returns:
    dict: A dictionary where the keys are unique levels and the values are dictionaries containing calculated metrics.
          The metrics include:
          - avg_total_mnt_spent: Average total monetary spent (rounded to 1 decimal).
          - std_total_mnt_spent: Standard deviation of total monetary spent (rounded to 1 decimal).
          - range_total_mnt_spent: Range (max - min) of total monetary spent.
          - avg_total_num_purchases: Average total number of purchases (rounded to 1 decimal).
          - std_total_num_purchases: Standard deviation of total number of purchases (rounded to 1 decimal).
          - sum_purchases: Sum of total number of purchases.
          - range_total_num_purchases: Range (max - min) of total number of purchases.
          - response_rate: Response rate as a percentage (rounded to 1 decimal); NaN for a level with no rows.
          - no_customers: Number of customers (rows) in the level.
    """
    data_metrics = {}

    for level, df in data.items():
        spent = df['TotalMntSpent']
        purchases = df['TotalNumPurchases']
        no_customers = len(df)

        if no_customers:
            response_rate = round((df['Response'].sum() / no_customers) * 100, 1)
        else:
            # A level with no rows has no defined response rate; report NaN
            # instead of dividing by zero.
            response_rate = float('nan')

        data_metrics[level] = {
            'avg_total_mnt_spent': round(spent.mean(), 1),
            'std_total_mnt_spent': round(spent.std(), 1),
            'range_total_mnt_spent': round(spent.max() - spent.min(), 1),
            'avg_total_num_purchases': round(purchases.mean(), 1),
            'std_total_num_purchases': round(purchases.std(), 1),
            'sum_purchases': purchases.sum(),
            'range_total_num_purchases': round(purchases.max() - purchases.min(), 0),
            'response_rate': response_rate,
            'no_customers': no_customers
        }

    return data_metrics

In [15]:
# Compute the summary metrics per education level and show the 'Graduation' entry.
# NOTE(review): this cell's execution count (15) is lower than that of the cell defining
# calculate_metrics (17) — confirm the output with Restart Kernel -> Run All.
education_metrics = calculate_metrics(data_by_education)
education_metrics['Graduation']

{'avg_total_mnt_spent': 619.9,
 'std_total_mnt_spent': 599.5,
 'range_total_mnt_spent': 2519,
 'avg_total_num_purchases': 15.0,
 'std_total_num_purchases': 7.6,
 'sum_purchases': 16894,
 'range_total_num_purchases': 43,
 'response_rate': 13.5,
 'no_customers': 1127}