# Import Libraries

In [None]:
# data manipulation and viz
import datetime
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
from numpy                         import log
from matplotlib                    import pyplot as plt
import pickle
from matplotlib.ticker             import PercentFormatter
import matplotlib as mpl
# stats tests
from scipy                         import stats as stats
import statsmodels.formula.api as smf

# notebook settings
from IPython.core.display          import HTML
from IPython.display               import Image
import matplotlib.font_manager as font_manager
from pathlib import Path


import warnings
warnings.filterwarnings('ignore')

# Helper Functions

In [None]:
def jupyter_settings():
    """
    Settings for best jupyter layout
    """
    %matplotlib inline
#     %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 21
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = 150
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', -1)
    pd.set_option( 'display.expand_frame_repr', False )    
    pd.options.display.float_format = '{:,.4f}'.format
    sns.set_style('white')
    sns.set(font_scale=2)

jupyter_settings()


def get_descriptive_statistics(data):
    """
    Build a one-row-per-column table of descriptive statistics.

    It only works for int or float variables - for categorical vars, use a boxplot.

    data: pandas dataframe containing numerical features

    Returns a dataframe with the columns
    'attributes' (the original column name), 'min', 'max', 'range', 'mean',
    '25%', '50%', '75%', '95%', 'std', 'skew', 'kurtosis'.
    """
    # (output column, reducer applied to every input column), listed in the
    # order of the returned table. The variance that used to be computed here
    # was never part of the output, so it is no longer calculated.
    summaries = [
        ('min', min),
        ('max', max),
        ('range', lambda x: x.max() - x.min()),
        ('mean', np.mean),                           # 1st moment
        ('25%', lambda x: np.quantile(x, .25)),      # Q1
        ('50%', np.median),
        ('75%', lambda x: np.quantile(x, .75)),      # Q3
        ('95%', lambda x: np.quantile(x, .95)),
        ('std', np.std),
        ('skew', lambda x: x.skew()),                # 3rd moment
        ('kurtosis', lambda x: x.kurtosis()),        # 4th moment
    ]

    # each reducer yields one value per input column -> a single-row frame
    rows = [pd.DataFrame(data.apply(reducer)).T for _, reducer in summaries]

    # stack the rows, then flip so every input column becomes one output row
    m = pd.concat(rows).T.reset_index()
    m.columns = ['attributes'] + [label for label, _ in summaries]
    return m

def IQR(var, df):
    """
    Print the boxplot fences (Q1 - 1.5*IQR and Q3 + 1.5*IQR) of one variable.

    var: str, column name to analyse (pass it quoted)
    df: pandas dataframe that holds the column

    The fences are only printed; the function itself returns None.
    """
    q1, q3 = (np.quantile(df[var], q) for q in (.25, .75))
    whisker = 1.5 * (q3 - q1)
    lower_fence = q1 - whisker
    upper_fence = q3 + whisker

    message = 'For variable {}, upper fence is {} and lower fence is {}.'.format(var, upper_fence, lower_fence)
    return print(message)

# Colour palettes and category labels shared by the plots below.
palette_types = ['#053D4E','#9EA4AC','#51C1C3', '#9EA4AC'] # dark, grey, cyan, grey

# ppt version: 19 identical grey entries. An earlier version used
# dark '#053D4E', orange '#EF4E23', cyan '#51C1C3', grey '#9EA4AC'; that
# assignment was overwritten straight away, so it is kept only as this note.
palette_topics = ['#9EA4AC'] * 19

# 25 entries: the first 20 grey, the last 5 cyan
palette_bins = ['#9EA4AC'] * 20 + ['#51C1C3'] * 5

# topic labels; the strings are kept exactly as written (including the double
# space in the first one)
list_topics = [
    'ANIMAIS E  PETS',
    'AUTOCONHECIMENTO E ESPIRITUALIDADE',
    'CARREIRA E DESENVOLVIMENTO PESSOAL',
    'CULINÁRIA E GASTRONOMIA',
    'DESIGN E FOTOGRAFIA',
    'EDUCAÇÃO INFANTIL E FAMÍLIA',
    'ENGENHARIA E ARQUITETURA',
    'ENSINO E ESTUDO ACADÊMICO',
    'FINANÇAS E NEGÓCIOS',
    'HOBBIES E LAZER',
    'MANUTENÇÃO DE EQUIPAMENTOS',
    'MARKETING E VENDAS',
    'MODA E BELEZA',
    'MÚSICA E ARTES',
    'PLANTAS E ECOLOGIA',
    'RELACIONAMENTOS',
    'SAÚDE E ESPORTES',
    'TECNOLOGIA E DESENVOLVIMENTO DE SOFTWARE',
    'SPARKLE',
]

palette_two = {'single-purchase': '#9EA4AC', 'repurchase': '#EF4E23'} # grey, orange
palette_above = {'below': '#9EA4AC', 'above': '#EF4E23'} # grey, orange

# unit suffixes for successive powers of 1000
millnames = ['',' K',' MM',' B',' T']

def millify(n):
    """
    Format a number with a thousands-unit suffix and no decimals.

    n: anything ``float()`` accepts.

    Returns a string such as '999', '2 K', '1 MM' or '3 B'. Magnitudes beyond
    the last suffix stay in that unit (1e15 -> '1000 T'). Non-finite input
    (NaN, inf) is not handled and raises from ``math.floor``.
    """
    n = float(n)
    last = len(millnames) - 1
    if n == 0:
        millidx = 0
    else:
        millidx = max(0, min(last, int(math.floor(math.log10(abs(n)) / 3))))

    scaled = n / 10 ** (3 * millidx)
    # A value such as 999,999 rounds up to 1000 in its own unit; move it to the
    # next unit so it prints as '1 MM' instead of '1000 K'.
    if millidx < last and abs(round(scaled)) >= 1000:
        millidx += 1
        scaled = n / 10 ** (3 * millidx)

    return '{:.0f}{}'.format(scaled, millnames[millidx])


# Register the Nunito Sans font files with matplotlib and make the family the
# global default.
# NOTE(review): both paths below are absolute paths on one developer's machine;
# on any other machine findSystemFonts will presumably find nothing - confirm
# and consider making the directory configurable.
# NOTE(review): on POSIX, pathlib discards earlier segments when a later one is
# absolute, so mpl.get_data_path() has no effect here. `light` is not used in
# the visible part of the notebook.
light = Path(mpl.get_data_path(), '/Users/alan.maehara/Downloads/nunito-sans/NunitoSans-Regular.ttf')

# Add every font at the specified location
font_dir = ['/Users/alan.maehara/Downloads/nunito-sans']
for font in font_manager.findSystemFonts(font_dir):
    # echo each font file as it is registered
    print(font)
    font_manager.fontManager.addfont(font)

# Set font family globally
# NOTE(review): matplotlib matches this against the family name stored inside
# the font files - verify that 'nunito-sans' is that name.
plt.rcParams['font.family'] = 'nunito-sans'

# Load Dataset

In [None]:
# aux2 = aux1[['is_below_10_exact', 'producer_id']].groupby('is_below_10_exact').count().reset_index()
# aux2['%'] = aux2['producer_id'] / aux2['producer_id'].sum() * 100
# aux2

In [None]:
# aux3 = aux1[['is_below_10_mean', 'producer_id']].groupby('is_below_10_mean').count().reset_index()
# aux3['%'] = aux3['producer_id'] / aux3['producer_id'].sum() * 100
# aux3

In [None]:
# Load the raw inputs, report their sizes and join everything onto the
# purchases table `df`.
elastic = pd.read_pickle('data/elastic.pkl')
club = pd.read_pickle('data/club.pkl')
# keep only the columns listed below and drop exact duplicate rows
club = club[['user_buyer_id', 'product_id', 'is_course_complete',
       'completion_course_date', 'join_course_date', 'membership_status',
       'last_access']].drop_duplicates()
order = pd.read_csv('data/orderbump.csv', delimiter=';')
order = order.drop_duplicates()
dev = pd.read_csv('data/device.csv', delimiter=';')
df = pd.read_csv('data/purchase_2022.csv', delimiter=';')
prod = pd.read_csv('data/producers_ltv.csv', delimiter=';')
# column names are assigned by position
prod.columns = ['producer_id', 'gmv_brl_exact', 'gmv_brl_entire', 'gmv_2021', 'gmv_2022']

print(f"size devices: {len(dev):,}")
print(f"size elastic: {len(elastic):,}")
print(f"size club: {len(club):,}")
print(f"size purchases: {len(df):,}")
# The size of `dup` is not printed here: data/duplicates.csv is only read in a
# later cell, so referencing `dup` at this point raised NameError on a fresh
# top-to-bottom run.
print(f"size prod: {len(prod):,}")

# left joins: every purchase row is kept; unmatched lookups become NaN
df = df.merge(dev, how = 'left', on = 'purchase_transaction')
df = df.merge(order, how = 'left', on = 'purchase_id')
df = df.merge(elastic, how = 'left', on = ['user_buyer_id', 'product_id'])
df = df.merge(club, how = 'left', on = ['user_buyer_id', 'product_id'])
df = df.drop(['purchase_payment_engine', 'club_access_type', 'club_quantity_dependents'], axis =1).drop_duplicates()
print(f"size after join: {len(df):,}")
df.head()

In [None]:
dev.head()

In [None]:
# Load the duplicate-account table and flag rows that involve an account
# created before the cutoff date.
dup = pd.read_csv('data/duplicates.csv', delimiter=';')
# remove entries without more than one user id
# (.copy() so the column assignments below act on an independent frame rather
# than on a filtered slice of the file contents)
dup = dup[dup['user2_date'].notnull()].copy()
print(f"size duplicates: {len(dup):,}")

date_cols = ['user1_date', 'user2_date', 'user3_date',
             'user4_date', 'user5_date', 'user6_date']

# Parse the dates first so the cutoff test below compares real timestamps
# instead of raw strings; a column that is entirely empty then becomes all-NaT
# rather than a float column compared against text.
# NOTE(review): assumes the parsed dates are timezone-naive - confirm against
# the CSV.
for col in date_cols:
    dup[col] = pd.to_datetime(dup[col])

# remove invalid users - these users should not be in the df
# out = 1 when any of the six dates is earlier than 2021-07-01; missing dates
# (NaT) never count as earlier
cutoff = pd.Timestamp('2021-07-01')
dup['out'] = (dup[date_cols] < cutoff).any(axis=1).astype(int)

# user1..user6 ids are left exactly as loaded (they are not cast to int here)


In [None]:

# Attach the duplicate-account columns (user ids, their dates and the `out`
# flag) to every purchase via purchase_transaction; purchases without a match
# keep NaN in those columns. Exact duplicate rows are dropped afterwards.
df = df.merge(dup, on = 'purchase_transaction', how = 'left').drop_duplicates()

# STEP 01 - DATA PREPARATION 

## Data Dimensions

In [None]:
# Snapshot round-trip. The two to_pickle calls are commented out, so this cell
# loads whatever was last written to disk.
# NOTE(review): if `df`/`dup` were rebuilt in the cells above, re-enable the
# writes once, otherwise the analysis below runs on a stale snapshot.
# df.to_pickle('data/df.pkl')
# dup.to_pickle('data/dup_cleaned.pkl')
# # read pickle file as dataframe
df1 = pd.read_pickle('data/df.pkl')
dup = pd.read_pickle('data/dup_cleaned.pkl')
# NOTE(review): this CSV is read with the default ',' delimiter while the other
# CSVs in this notebook use ';' - confirm the file format.
first = pd.read_csv('data/first_purchase_per_product.csv')
# every purchase listed in the file is flagged; after the left merge all other
# purchases carry NaN in remove_first_entry
first['remove_first_entry'] = 1
df1 = df1.merge(first, on = 'purchase_id', how = 'left')

In [None]:
# check dataset shape and column names
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')
print(f'Columns: {df1.columns}')

## Remove inconsistencies

In [None]:
# get original df size
orig_shape = len(df1)
# running row count: refreshed after each filter so every print shows the
# rows removed since the previous refresh
count_outliers = len(df1)

# remove inconsistencies
# 1) rows without a user_type
df1 = df1[(df1['user_type'].notnull())]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)
# 2) rows whose purchase_id occurs more than once AND whose
#    subscription_feature_type is MIGRATION or MONTHLY_PLAN_AS_ANNUAL
df1 = df1[~((df1['purchase_id'].duplicated(keep=False)) & ((df1['subscription_feature_type'] == 'MIGRATION') | (df1['subscription_feature_type'] == 'MONTHLY_PLAN_AS_ANNUAL')) )] # remove entries with duplicates
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)
# 3) rows with avg_rating exactly 0 (missing ratings are NaN, compare as != 0, and stay)
df1 = df1[(df1['avg_rating'] != 0)]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)
# 4) rows with a negative subs_value (missing values stay)
df1 = df1[~(df1['subs_value'] < 0)]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)

### replace userid and user creation date with the oldest entry
df1['user_creation_datetime'] = pd.to_datetime(df1['user_creation_datetime'])
df1['user1_date'] = pd.to_datetime(df1['user1_date'])
# candidate replacements: rows whose own creation datetime is later than user1_date
aux1 = df1[(df1['user_creation_datetime'] > df1['user1_date'])][['user_buyer_id', 'user1', 'user1_date', 'prod_signup_datetime']].drop_duplicates()
# positional rename: user1 -> user, user1_date -> user_date, prod_signup_datetime -> producer_signup_datetime
aux1.columns = ['user_buyer_id', 'user', 'user_date', 'producer_signup_datetime']
aux1['user'] = aux1['user'].astype(int)
# keep one candidate per user_buyer_id: the earliest user_date (ties broken by row order)
aux1['rank'] = aux1.groupby("user_buyer_id")["user_date"].rank(method="first", ascending=True)
aux1 = aux1[aux1['rank'] == 1]
df1 = df1.merge(aux1, on = 'user_buyer_id', how = 'left')
# for matched rows only, overwrite the creation date/datetime, buyer id and
# producer signup datetime with the candidate's values (.loc aligns on index)
df1.loc[df1['user'].notnull(), 'user_creation_datetime'] = df1['user_date']
df1.loc[df1['user'].notnull(), 'user_creation_date'] = df1['user_date'].dt.date
df1.loc[df1['user'].notnull(), 'user_buyer_id'] = df1['user']
df1.loc[df1['user'].notnull(), 'prod_signup_datetime'] = df1['producer_signup_datetime']
# NOTE(review): astype(str) turns a missing value into a literal string such as
# 'nan', which sorts after any digit-leading date string in the comparison
# below - confirm no user_creation_date is missing at this point.
df1['user_creation_date'] = df1['user_creation_date'].astype(str)

### remove users with creation date above purchase date
# NOTE(review): this is a string comparison; presumably both columns hold
# ISO 'YYYY-MM-DD' text so that it orders like dates - verify.
unique_users = list(set(df1[(df1['user_creation_date'] > df1['purchase_order_date'])]['user_buyer_id'].drop_duplicates()))
aux2 = pd.DataFrame(data={'user_buyer_id': unique_users})
aux2['user_buyer_id'] = aux2['user_buyer_id'].astype(int)
aux2['remove'] = 1
# flag every row of the offending users, then drop them; the helper column
# `remove` stays in df1 until the "remove cols" cell further down
df1 = df1.merge(aux2, how = 'left', on = 'user_buyer_id')
df1 = df1[~(df1['remove'] == 1)]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)

In [None]:
### flag users who should be removed due to old user_creation_date (before 2021-07-01)
# rows of `dup` with out == 1; all six id columns of such a row are collected
aux1 = dup[dup['out'] == 1][['user1', 'user2', 'user3', 'user4', 'user5', 'user6']]
unique_users = []
for n in ['user1', 'user2', 'user3', 'user4', 'user5', 'user6']:
    # non-null ids of this column
    unique_users += list(aux1[aux1[n].notnull()][n])
unique_users = list(set(unique_users))
aux2 = pd.DataFrame(data={'user_buyer_id': unique_users})
aux2['user_buyer_id'] = aux2['user_buyer_id'].astype(int)
aux2['remove_old_entry'] = 1
# flag-and-filter: the helper column remove_old_entry stays in df1 until the
# "remove cols" cell below drops it
df1 = df1.merge(aux2, how = 'left', on = 'user_buyer_id')
df1 = df1[~(df1['remove_old_entry'] == 1)]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)

# remove na segment
# a producer with at least one row lacking `segment` loses ALL of its rows
unique_prod = list(set(df1[df1['segment'].isna()]['producer_id']))
aux2 = pd.DataFrame(data={'producer_id': unique_prod})
aux2['producer_id'] = aux2['producer_id'].astype(int)
aux2['remove_na_segment'] = 1
df1 = df1.merge(aux2, how = 'left', on = 'producer_id')
df1 = df1[~(df1['remove_na_segment'] == 1)]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)

# remove na segmentation_final_name
# same producer-level rule, applied to segmentation_final_name
unique_prod = list(set(df1[df1['segmentation_final_name'].isna()]['producer_id']))
aux2 = pd.DataFrame(data={'producer_id': unique_prod})
aux2['producer_id'] = aux2['producer_id'].astype(int)
aux2['remove_na_segmentation_final_name'] = 1
df1 = df1.merge(aux2, how = 'left', on = 'producer_id')
df1 = df1[~(df1['remove_na_segmentation_final_name'] == 1)]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)

# remove null order_bump_type
# row-level filter (unlike the producer-level ones above)
df1 = df1[df1['order_bump_type'].notnull()]
print(f'Outliers removed: {(count_outliers-len(df1)):,} ')
count_outliers = len(df1)


In [None]:
# remove cols
cols = ['origin_datetime',
       'user1_date', 'user2_date', 'user3_date', 'user4_date',
       'user5_date', 'user6_date', 'user1', 'user2', 'user3', 'user4', 'user5',
       'user6', 'out', 'user', 'user_date', 'rank', 'remove',
       'remove_old_entry', 'remove_na_segment', 'producer_signup_datetime', 'remove_na_segmentation_final_name']
df1 = df1.drop(cols, axis = 1)

In [None]:
# check dataset shape and column names
print(f'Outliers removed: {(orig_shape-len(df1)):,} ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

In [None]:
# NOTE(review): `df3` is not defined anywhere above this cell; the commented-out
# line in the next cell suggests `df1` may have been intended - confirm before
# running the notebook top to bottom.
df3['user_creation_datetime'].min()

In [None]:
# df1['user_creation_datetime'].min()
df1['user_creation_datetime'].max()

In [None]:
# df1['user_creation_date'].min()
df1['user_creation_date'].max()

In [None]:
# df1['purchase_order_datetime'].min()
df1['purchase_order_datetime'].max()

In [None]:
# df1['purchase_order_date'].min()
df1['purchase_order_date'].max()

In [None]:
# df1['purchase_release_datetime'].min()
df1['purchase_release_datetime'].max()

In [None]:
# df1['purchase_release_date'].min()
df1['purchase_release_date'].max()

In [None]:
# NOTE(review): `df3` is not assigned anywhere above this cell - on a fresh
# top-to-bottom run this raises NameError unless it is defined elsewhere.
df3

In [None]:
df1.loc[(df1['user_creation_date'] > df1['purchase_release_date']), ['user_buyer_id', 'user_creation_datetime', 'purchase_release_datetime', 'purchase_release_date']]

## Check & Fill out NA

In [None]:
# sum up all nulls across columns
df1.isna().sum()

In [None]:
df1[(df1['is_subs'] == 'single-payment')]['subscription_id'].value_counts(dropna=False)

In [None]:
# Fill missing values column by column. Text columns get a placeholder label;
# numeric columns (and subs_type) get 0.
fill_values = {
    'device': 'NA',
    'product_category': 'Unknown',
    'avg_rating': 0,
    'total_answers_rating': 0,
    'recurrency_number': 0,
    'subs_type': 0,
    'subs_value': 0,
    'subs_due_day': 0,
    'subs_status': 'NA',
    'subscription_feature_type': 'NA',
    'coupon_discount_value': 0,
    'payment_method_description': 'Unknown',
}
for column, filler in fill_values.items():
    # only the rows where this column is missing are written
    df1.loc[df1[column].isna(), column] = filler


In [None]:
# sum up all nulls across columns
df1.isna().sum()

## Change Data Types

In [None]:
# check data types
df1.dtypes

In [None]:
# Parse the timestamp columns into datetimes.
datetime_columns = [
    'user_creation_datetime',
    'prod_signup_datetime',
    'purchase_release_datetime',
    'purchase_order_datetime',
    'subs_start_datetime',
    'subs_cancellation_datetime',
    'subs_last_payment_datetime',
    'subs_last_payment_tentative_date',
]
for column in datetime_columns:
    df1[column] = pd.to_datetime(df1[column])

# Cast the count-like columns to integers.
# NOTE(review): astype(int) truncates any fractional part - confirm that
# subs_value only ever holds whole numbers.
integer_columns = ['recurrency_number', 'subs_due_day', 'subs_value', 'subs_type']
for column in integer_columns:
    df1[column] = df1[column].astype(int)

In [None]:
# re-check data types
df1.dtypes

## Descriptive Statistics

In [None]:
df1['category'].unique()

Descriptive statistics were computed for numerical features and categorical features.

In [None]:
# separate numerical and categorical features
num_feat = df1[['purchase_installment_number', 'total_answers_rating', 'avg_rating', 'gmv_value_brl', 'subs_value', 'subs_due_day', 'subs_type', 'recurrency_number', 'coupon_discount_value', 'progress_club']]
# cat_feat = df1.select_dtypes( exclude = ['int64','float64','datetime64[ns]'])
cat_feat = df1[['user_type', 'user_country', 'user_office_name', 'purchase_status', 'purchase_payment_type', 'product_category', 'has_rating', 'category', 'is_club', 'is_subs', 'subs_status', 'has_coupon', 'device', 'subscription_feature_type', 'segment', 'segmentation_final_name', 'order_bump_type', 'payment_method_description', 'purchase_sale_type']]

### Numerical Features

In [None]:
# get statistics
get_descriptive_statistics(num_feat)

### Categorical Features

In [None]:
# check unique values of each feature
cat_feat.apply(lambda x: x.nunique())

In [None]:
# Horizontal bar chart: purchase_id count for the 20 most frequent user_country values.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['user_country', 'purchase_id']]
    .groupby('user_country')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
    .iloc[0:20]
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='user_country', palette=palette_topics)
plt.xlim(0, 9000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )


In [None]:
# Horizontal bar chart: purchase_id count for every user_office_name value.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['user_office_name', 'purchase_id']]
    .groupby('user_office_name')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='user_office_name', palette=palette_topics)
plt.xlim(0, 9000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )


In [None]:
# Horizontal bar chart: purchase_id count for the 20 most frequent category values.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['category', 'purchase_id']]
    .groupby('category')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
    .iloc[0:20]
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='category', palette=palette_topics)
plt.xlim(0, 9000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for the 20 most frequent product_category values.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['product_category', 'purchase_id']]
    .groupby('product_category')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
    .iloc[0:20]
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='product_category', palette=palette_topics)
plt.xlim(0, 9000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for the 20 most frequent purchase_payment_type values.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['purchase_payment_type', 'purchase_id']]
    .groupby('purchase_payment_type')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
    .iloc[0:20]
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='purchase_payment_type', palette=palette_topics)
plt.xlim(0, 9000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for the 20 most frequent purchase_status values.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['purchase_status', 'purchase_id']]
    .groupby('purchase_status')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
    .iloc[0:20]
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='purchase_status', palette=palette_topics)
plt.xlim(0, 12000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for the 20 most frequent subs_status values.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['subs_status', 'purchase_id']]
    .groupby('subs_status')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
    .iloc[0:20]
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='subs_status', palette=palette_topics)
plt.xlim(0, 12000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for every payment_method_description value.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['payment_method_description', 'purchase_id']]
    .groupby('payment_method_description')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='payment_method_description', palette=palette_topics)
plt.xlim(0, 12000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for every purchase_sale_type value.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['purchase_sale_type', 'purchase_id']]
    .groupby('purchase_sale_type')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='purchase_sale_type', palette=palette_topics)
plt.xlim(0, 12000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Horizontal bar chart: purchase_id count for every order_bump_type value.
plt.figure(figsize=(10, 15))
aux1 = (
    df1[['order_bump_type', 'purchase_id']]
    .groupby('order_bump_type')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
)
sns.set_style('ticks')
splot = sns.barplot(data=aux1, x='purchase_id', y='order_bump_type', palette=palette_topics)
plt.xlim(0, 12000000)
# write each bar's count just past its right end, vertically centred on the bar
for p in splot.patches:
    splot.annotate(
        "%.0f" % p.get_width(),
        xy=(p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

In [None]:
# Grid of horizontal bar charts (5 rows x 2 columns, 9 panels used): purchase_id
# count per value for each of the remaining categorical variables.
plt.figure(figsize = (20,30))
for d, n in enumerate(['user_type',  'has_rating', 'is_club', 'is_subs', 'has_coupon', 'device', 'subscription_feature_type', 'segment', 'segmentation_final_name']):
    plt.subplot(5,2,d+1)
    aux1 = (
        df1[[n, 'purchase_id']]
        .groupby(n)
        .count()
        .reset_index()
        .sort_values('purchase_id', ascending=False)
    )
    splot = sns.barplot(data=aux1, x='purchase_id', y=n, palette=palette_topics)
    # write each bar's count just past its right end, vertically centred on the bar
    for p in splot.patches:
        splot.annotate(
            "%.0f" % p.get_width(),
            xy=(p.get_width(), p.get_y() + p.get_height() / 2),
            xytext=(5, 0),
            textcoords='offset points',
            ha='left',
            va='center',
        )
    # the x-limit applies to the panel that was just drawn
    plt.xlim(0, 10000000)
# panel spacing is a figure-level setting, so it is applied once for all panels
plt.subplots_adjust(hspace=0.8, wspace=0.8)

## Outlier Analysis/Remove Inconsistencies

In [None]:
# Reload df1 from its on-disk snapshot, replacing the frame built by the cells
# above. The write is commented out, so this loads whatever was last saved.
# NOTE(review): re-enable the to_pickle line once after changing any earlier
# cell, otherwise the steps below run on a stale snapshot.
# df1.to_pickle('data/df1.pkl')
# # read pickle file as dataframe
df1 = pd.read_pickle('data/df1.pkl')

In [None]:
df1['user_buyer_id'].nunique()

In [None]:
print(f"Transactions: {len(df1):,}")
count_df = len(df1)

### Remove first purchase of every recurrency product (excl. recurrency cases with more than one transaction)

In [None]:
### subscriptions: flag the first purchase when it is the subscription's only purchase
# (user, subscription) pairs that own a flagged first purchase of a subscription
aux1 = df1.loc[
    (df1['remove_first_entry'] == 1) & (df1['is_subs'] == 'subscription'),
    ['user_buyer_id', 'subscription_id'],
].drop_duplicates()
# count the purchase_id entries of each of those pairs
aux2 = (
    df1.merge(aux1, how='inner', on=['user_buyer_id', 'subscription_id'])
    [['user_buyer_id', 'subscription_id', 'purchase_id']]
    .groupby(['user_buyer_id', 'subscription_id'])
    .count()
    .reset_index()
)
# keep only the pairs with exactly one purchase and mark them
aux2 = aux2.loc[aux2['purchase_id'] == 1, ['user_buyer_id', 'subscription_id']].assign(remove=1)
df1 = df1.merge(aux2, how='left', on=['user_buyer_id', 'subscription_id'])
# remove_entry = 1 only where both flags are set (NaN never compares equal)
df1['remove_entry'] = 0
both_flagged = df1['remove_first_entry'] == df1['remove']
df1.loc[both_flagged, 'remove_entry'] = 1

In [None]:
df1[(df1['remove_first_entry'] == 1) & (df1['is_subs'] == 'subscription')][['remove_first_entry']].value_counts(dropna=False)

In [None]:
df1['remove'].value_counts(dropna=False)

In [None]:
df1['remove_entry'].value_counts(dropna=False)

In [None]:
# drop the rows flagged in remove_entry, then discard the two helper columns
orig_shape = len(df1)
keep_rows = df1['remove_entry'] != 1
df1 = df1.loc[keep_rows].drop(columns=['remove', 'remove_entry'])
# report how many rows were dropped and the resulting shape
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Remove first purchase of every product in which smart_installments were used (excl. smart_installments cases with more than one transaction)

In [None]:
### smart installments: flag the first purchase when it is the pair's only purchase
# (user, subscription) pairs that own a flagged first purchase paid with SMART_INSTALLMENT
aux1 = df1.loc[
    (df1['remove_first_entry'] == 1) & (df1['subscription_feature_type'] == 'SMART_INSTALLMENT'),
    ['user_buyer_id', 'subscription_id'],
].drop_duplicates()
# count the purchase_id entries of each of those pairs
aux2 = (
    df1.merge(aux1, how='inner', on=['user_buyer_id', 'subscription_id'])
    [['user_buyer_id', 'subscription_id', 'purchase_id']]
    .groupby(['user_buyer_id', 'subscription_id'])
    .count()
    .reset_index()
)
# keep only the pairs with exactly one purchase and mark them
aux2 = aux2.loc[aux2['purchase_id'] == 1, ['user_buyer_id', 'subscription_id']].assign(remove=1)
df1 = df1.merge(aux2, how='left', on=['user_buyer_id', 'subscription_id'])
# remove_entry = 1 only where both flags are set (NaN never compares equal)
df1['remove_entry'] = 0
both_flagged = df1['remove_first_entry'] == df1['remove']
df1.loc[both_flagged, 'remove_entry'] = 1

In [None]:
df1[(df1['remove_first_entry'] == 1) & (df1['subscription_feature_type'] == 'SMART_INSTALLMENT')][['remove_first_entry']].value_counts(dropna=False)

In [None]:
df1['remove'].value_counts(dropna=False)

In [None]:
df1['remove_entry'].value_counts(dropna=False)

In [None]:
# drop the rows flagged in remove_entry, then discard the two helper columns
orig_shape = len(df1)
keep_rows = df1['remove_entry'] != 1
df1 = df1.loc[keep_rows].drop(columns=['remove', 'remove_entry'])
# report how many rows were dropped and the resulting shape
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Remove first purchase of every product (all single-cases)

In [None]:
### single payments: drop every flagged first purchase that is not a smart installment
orig_shape = len(df1)
first_single = (
    (df1['remove_first_entry'] == 1)
    & (df1['is_subs'] == 'single-payment')
    & (df1['subscription_feature_type'] != 'SMART_INSTALLMENT')
)
# the flag column is no longer needed after this step
df1 = df1.loc[~first_single].drop(columns='remove_first_entry')
# report how many rows were dropped and the resulting shape
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### products with zero rating (keep - inconclusive)

In [None]:
# df1[['product_id', 'total_answers_rating']].drop_duplicates(keep='first').sort_values('total_answers_rating', ascending=False).head(20)

### purchases with zero gmv (excl. recurrencies)

In [None]:
 
df1[df1['gmv_value_brl'] == 0].value_counts('recurrency_number',dropna=False)

In [None]:
df1[df1['gmv_value_brl'] == 0].value_counts('subscription_feature_type',dropna=False)

In [None]:
len(df1[~((df1['gmv_value_brl'] == 0) & (df1['is_subs'] == 'single-payment'))])

In [None]:
# drop single-payment purchases whose gmv_value_brl is exactly 0
orig_shape = len(df1)
zero_gmv_single = (df1['gmv_value_brl'] == 0) & (df1['is_subs'] == 'single-payment')
df1 = df1.loc[~zero_gmv_single]
# report how many rows were dropped and the resulting shape
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Remove recurrency transactions (keep only first entry, retrieve total number of recurrencies from last entry). If trial, get last gmv_value_brl entry

In [None]:
# Collapse each subscription to its first charge (recurrency_number == 1) and
# carry over, from the most recent charge, the recurrency count and the GMV.
orig_shape = len(df1)
# get df
# (.copy() so the new column below is written to an independent frame instead
# of a filtered slice of df1)
aux0 = df1[df1['is_subs'] == 'subscription'].copy()
# get inverse order purchase: 1 = most recent purchase of the (user, subscription) pair
aux0['order_purchase_inv'] = aux0.groupby(["user_buyer_id", "subscription_id"])["purchase_order_datetime"].rank(method="first", ascending=False)
aux1 = aux0[aux0['recurrency_number'] == 1][['purchase_id', 'user_buyer_id', 'subscription_id']].copy() # get first entry
aux1['remove'] = 0
aux2 = aux0[aux0['order_purchase_inv'] == 1][['subscription_id', 'user_buyer_id', 'recurrency_number', 'gmv_value_brl']] # get last entry
# one row per last entry, joined to its first entry; columns renamed by name
# rather than by position so a change in column order cannot mislabel them
aux3 = aux2.merge(aux1, how = 'left', on = ['user_buyer_id', 'subscription_id']).rename(
    columns={'recurrency_number': 'total_recurrencies', 'gmv_value_brl': 'last_gmv_value_brl'})
# pairs without a first entry have no purchase_id here and are marked remove = 1
aux3.loc[aux3['remove'].isna(), 'remove'] = 1
# only first-entry rows match on purchase_id, so only they receive
# total_recurrencies / last_gmv_value_brl / remove = 0
df1 = df1.merge(aux3[['total_recurrencies', 'last_gmv_value_brl', 'purchase_id', 'remove']].drop_duplicates(), how = 'left', on = ['purchase_id'])
# removal = 1 marks every row of a (user, subscription) pair handled above
aux3['removal'] = 1
df1 = df1.merge(aux3[['user_buyer_id', 'subscription_id', 'removal']].drop_duplicates(), how = 'left', on = ['user_buyer_id', 'subscription_id'])
# drop every subscription row of a handled pair except the matched first entry
df1 = df1[~((df1['is_subs'] == 'subscription') & (df1['removal'] == 1) & (df1['remove'] != 0))]
# TRIAL subscriptions whose kept row has zero GMV take the GMV of the last charge
df1.loc[(df1['is_subs'] == 'subscription') & (df1['subscription_feature_type'] == 'TRIAL') & (df1['gmv_value_brl'] == 0), 'gmv_value_brl'] = df1['last_gmv_value_brl']
# total_recurrencies is kept; the other helper columns are dropped
df1 = df1.drop(['remove', 'removal', 'last_gmv_value_brl'], axis =1)
# report how many rows were dropped and the resulting shape
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

In [None]:
# aux3[(aux3['user_buyer_id'] == 41296009 )]#[['purchase_id', 'user_buyer_id', 'product_id', 'is_subs', 'subs_type', 'purchase_order_datetime', 'order_purchase_inv', 'recurrency_number', 'subs_last_payment_datetime']]
# aux0[(aux0['user_buyer_id'] == 41296009 )][['purchase_id', 'user_buyer_id', 'product_id', 'is_subs', 'subs_type', 'purchase_order_datetime', 'order_purchase_inv', 'recurrency_number', 'subs_last_payment_datetime']]
# aux0[(aux0['user_buyer_id'] == 41366514 )][['purchase_id', 'user_buyer_id', 'product_id', 'is_subs', 'subs_type', 'purchase_order_datetime', 'order_purchase_inv', 'recurrency_number', 'subs_last_payment_datetime']]

In [None]:
df1[df1['is_subs'] == 'subscription']['recurrency_number'].value_counts(dropna=False)

### Remove smart_installments transactions (keep only first entry)

In [None]:
# Rank the smart-installment purchases of every (user, product) pair so that a
# following cell can keep only the purchase ranked 1. orig_shape is read by the
# report printed in that cell.
orig_shape = len(df1)
# get df
# (.copy() so the new column below is written to an independent frame instead
# of a filtered slice of df1)
aux0 = df1[df1['subscription_feature_type'] == 'SMART_INSTALLMENT'].copy()
# get order purchase: 1 = earliest purchase of the (user, product) pair
aux0['order_purchase'] = aux0.groupby(["user_buyer_id", "product_id"])["purchase_order_datetime"].rank(method="first", ascending=True)
# get count of purchases per user and product
aux1 = aux0[['user_buyer_id', 'product_id', 'purchase_id']].groupby(['user_buyer_id', 'product_id']).count().reset_index()
aux1.columns = ['user_buyer_id', 'product_id', 'count_purchase']
aux0 = aux0.merge(aux1, how = 'left', on = ['user_buyer_id', 'product_id'])
# get the (user, product) pairs that have at least one purchase with status COMPLETO
aux1 = aux0[aux0['purchase_status'] == 'COMPLETO'][['user_buyer_id', 'product_id']].drop_duplicates()
aux1['has_complete'] = 1
aux0 = aux0.merge(aux1, how = 'left', on = ['user_buyer_id', 'product_id'])
# if 1st purchase is not COMPLETO, invalid: its rank is overwritten with 10 so
# it no longer counts as the entry ranked 1
# NOTE(review): no other purchase of that pair is promoted to rank 1, so the
# filter on order_purchase != 1 in a later cell removes every purchase of the
# pair - confirm this is intended.
aux0.loc[(aux0['count_purchase'] > 1) & (aux0['order_purchase'] == 1) & (aux0['purchase_status'] != 'COMPLETO') & (aux0['has_complete'] == 1), 'order_purchase'] = 10

In [None]:
# How many smart-installment rows sit at each purchase rank.
aux0['order_purchase'].value_counts()

In [None]:
# Flag every smart-installment purchase whose rank is not 1 and drop it from df1.
aux0 = aux0.loc[aux0['order_purchase'] != 1, ['purchase_id']].assign(remove_smart=1)
df1 = df1.merge(aux0, on='purchase_id', how='left')
# Rows without a match carry NaN in the flag column and are therefore kept.
df1 = df1.loc[df1['remove_smart'] != 1].drop(columns='remove_smart')
# Report how many rows were dropped (orig_shape is set in the ranking cell).
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Order Bump - Remove order_bump_child (if recurrency, remove all)

In [None]:
# Spot check: purchase timeline of a single buyer.
df1.loc[
    df1['user_buyer_id'] == 41366514,
    ['subscription_id', 'user_buyer_id', 'product_id', 'purchase_order_datetime', 'subscription_feature_type'],
].sort_values(by='purchase_order_datetime')

In [None]:
# Absolute row count per order_bump_type value (NaN included).
df1[['order_bump_type']].value_counts(dropna=False, normalize=False) 

In [None]:
# is_subs breakdown of the order-bump child purchases.
df1.loc[df1['order_bump_type'] == 'ORDER_BUMP_CHILD', ['is_subs']].value_counts(dropna=False)

In [None]:
# subscription_feature_type breakdown of the order-bump child purchases.
df1.loc[df1['order_bump_type'] == 'ORDER_BUMP_CHILD', ['subscription_feature_type']].value_counts(dropna=False)

In [None]:
# product_category breakdown of the order-bump child purchases.
df1.loc[df1['order_bump_type'] == 'ORDER_BUMP_CHILD', ['product_category']].value_counts(dropna=False)

In [None]:
### Drop order-bump child purchases that are neither subscriptions nor smart installments
orig_shape = len(df1)
# Build the selection once; it is used both for the report and for the removal.
is_plain_bump_child = (
    (df1['order_bump_type'] == 'ORDER_BUMP_CHILD')
    & (df1['subscription_feature_type'] != 'SMART_INSTALLMENT')
    & (df1['is_subs'] != 'subscription')
)
aux0 = df1[is_plain_bump_child]
print(f"Number purchases {len(aux0):,} ({(len(aux0) / len(df1) * 100):,.2f}%)")
df1 = df1[~is_plain_bump_child]
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

In [None]:
### Drop order-bump child purchases that are smart installments (subscriptions excluded)
orig_shape = len(df1)
# Build the selection once; it is used both for the report and for the removal.
is_smart_bump_child = (
    (df1['order_bump_type'] == 'ORDER_BUMP_CHILD')
    & (df1['subscription_feature_type'] == 'SMART_INSTALLMENT')
    & (df1['is_subs'] != 'subscription')
)
aux0 = df1[is_smart_bump_child]
print(f"Number purchases {len(aux0):,} ({(len(aux0) / len(df1) * 100):,.2f}%)")
df1 = df1[~is_smart_bump_child]
# Report the effect of the filter, with the same percentage the other removal cells print.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

In [None]:
# is_subs breakdown of the order-bump child purchases still present in df1.
df1.loc[df1['order_bump_type'] == 'ORDER_BUMP_CHILD', ['is_subs']].value_counts(dropna=False)

In [None]:
# df1[df1['user_buyer_id'] == 47401972][[ 'purchase_id','product_id','order_bump_type', 'purchase_parent_id', 'is_subs', 'subs_type', 'subs_status', 'purchase_order_datetime', 'subs_start_datetime','subs_cancellation_datetime', 'recurrency_number', 'subscription_id', 'recurrency_id']].sort_values('purchase_order_datetime')

In [None]:
### Drop order-bump child purchases that belong to subscriptions
orig_shape = len(df1)
# Build the selection once; it is used both for the report and for the removal.
is_subs_bump_child = (df1['order_bump_type'] == 'ORDER_BUMP_CHILD') & (df1['is_subs'] == 'subscription')
aux0 = df1[is_subs_bump_child]
print(f"Number purchases {len(aux0):,} ({(len(aux0) / len(df1) * 100):,.2f}%)")
df1 = df1[~is_subs_bump_child]
# Report the effect of the filter, with the same percentage the other removal cells print.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Remove products with a single unique customer (excl. recurrency)

In [None]:
# Per (product, category), over single-payment purchases only:
# number of distinct buyers and number of purchases.
product_keys = ['product_id', 'product_category']
single_payment_rows = df1.loc[df1['is_subs'] == 'single-payment']
aux1 = single_payment_rows.groupby(product_keys)['user_buyer_id'].nunique().reset_index()
aux2 = single_payment_rows.groupby(product_keys)['purchase_id'].count().reset_index()
# Join both measures and give them readable names.
aux3 = aux1.merge(aux2, how='left', on=product_keys)
aux3.columns = ['product_id', 'product_category', 'unique_users', 'transaction_vol']
# Average number of purchases per distinct buyer.
aux3['ratio'] = aux3['transaction_vol'].div(aux3['unique_users'])

In [None]:
# The ten product/category rows with the fewest distinct buyers.
aux3.sort_values(by='unique_users').iloc[:10]

In [None]:
# Products bought by exactly one distinct buyer, and the purchases they account for.
single_buyer_products = aux3[aux3['unique_users'] == 1]
n_single_buyer_products = len(single_buyer_products)
n_single_buyer_purchases = int(single_buyer_products['transaction_vol'].sum())
# Counts are integers, so they are printed without decimals; only the shares keep two decimals.
print(f"{n_single_buyer_products:,} products ({(n_single_buyer_products / df1['product_id'].nunique() * 100):,.2f}%) with 1 unique buyer")
print(f"{n_single_buyer_purchases:,} purchases ({(n_single_buyer_purchases / len(df1) * 100):,.2f}%)")

In [None]:
orig_shape = len(df1)
# Product ids with exactly one distinct buyer, tagged for removal.
aux3 = aux3.loc[aux3['unique_users'] == 1, ['product_id']].drop_duplicates().assign(remove_product=1)
df1 = df1.merge(aux3, on='product_id', how='left')
# Unmatched products have NaN in the flag column and are kept.
df1 = df1.loc[df1['remove_product'] != 1].drop(columns='remove_product')
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Users who purchase the same product (excl. recurrency) - keep entries (but inconclusive)

In [None]:
# Single-payment purchases that are not smart installments.
pair_keys = ['user_buyer_id', 'product_id']
plain_single_rows = df1[(df1['is_subs'] == 'single-payment') & (df1['subscription_feature_type'] != 'SMART_INSTALLMENT')]
# Purchases per (buyer, product) pair, then the pairs / buyers with more than one purchase.
aux1 = plain_single_rows.groupby(pair_keys)['purchase_id'].count().reset_index()
repeated_pairs = aux1[aux1['purchase_id'] > 1]
aux2 = repeated_pairs['user_buyer_id'].drop_duplicates()
aux3 = repeated_pairs[pair_keys].drop_duplicates()
# All purchases that belong to a repeated pair.
aux0 = plain_single_rows.merge(aux3, how='inner', on=pair_keys)
aux0['order_purchase'] = aux0.groupby(pair_keys)['purchase_order_datetime'].rank(method='first', ascending=True)
# "...2" columns hold the value of the chronologically next purchase of the same pair
# (NaN on the last purchase); results are aligned back to aux0 by index.
next_in_pair = aux0.sort_values('purchase_order_datetime').groupby(pair_keys)
aux0['purchase_status2'] = next_in_pair['purchase_status'].shift(-1)
aux0['same_purchase_status'] = (aux0['purchase_status2'] == aux0['purchase_status']).astype(int)
aux0['double_pur_stats'] = aux0['purchase_status'] + '-' + aux0['purchase_status2']
aux0['product_id2'] = next_in_pair['product_id'].shift(-1)
aux0['same_product'] = (aux0['product_id2'] == aux0['product_id']).astype(int)

print(f"Users who purchased the same product {len(aux2):,} ({(len(aux2) / df1['user_buyer_id'].nunique() * 100):,.2f}%)")
print(f"Number purchases {len(aux0):,} ({(len(aux0) / len(df1) * 100):,.2f}%)")

In [None]:
# df1[(df1['user_buyer_id'] == 43285158) & (df1['product_id'] == 111546)].sort_values('purchase_order_datetime')

In [None]:
# df1[(df1['user_buyer_id'] == 43285158) & (df1['product_id'] == 111546)]

In [None]:
# same_product is 1 when the next purchase of the pair has the same product_id, else 0.
aux0['same_product'].value_counts(dropna=False)

In [None]:
# For rows whose flag is 0: which "next product" value they carry (NaN counted).
aux0.loc[
    aux0['same_product'] == 0,
    ['user_buyer_id', 'product_id', 'same_product', 'product_id2'],
].value_counts('product_id2', dropna=False)

In [None]:
# Number of rows at each purchase rank within a (buyer, product) pair.
aux0['order_purchase'].value_counts(dropna=False)

In [None]:
# Spot check: rows that are the 22nd purchase of their (buyer, product) pair.
aux0.loc[
    aux0['order_purchase'] == 22,
    ['user_buyer_id', 'product_id', 'same_product', 'product_id2', 'order_purchase'],
]

In [None]:
# Spot check: all purchases of a single buyer, key columns only.
df1.loc[
    df1['user_buyer_id'] == 46834967,
    ['user_buyer_id', 'product_id', 'purchase_order_datetime', 'gmv_value_brl', 'is_subs',
     'subscription_feature_type', 'order_bump_type', 'purchase_status', 'product_category', 'is_club'],
]

In [None]:
# Status of each purchase in the repeated (buyer, product) pairs.
aux0['purchase_status'].value_counts(dropna=False)

In [None]:
# Status of the following purchase of the same pair (NaN = no following purchase).
aux0['purchase_status2'].value_counts(dropna=False)

In [None]:
# 1 when a purchase and the following one of the same pair share the same status, else 0.
aux0['same_purchase_status'].value_counts(dropna=False)

In [None]:
# Frequency of each "<status>-<next status>" combination (NaN when there is no next purchase).
aux0['double_pur_stats'].value_counts(dropna=False)

### Trial purchases (remove entries - inconclusive)

https://help.hotmart.com/pt-BR/article/como-configurar-o-periodo-gratis-trial-para-o-meu-produto-de-assinatura-/360038910152

In [None]:
# Volume and share of purchases flagged as TRIAL.
n_trial_purchases = len(df1[df1['subscription_feature_type'] == 'TRIAL'])
print(f"{n_trial_purchases:,} purchases ({(n_trial_purchases/len(df1) * 100):,.2f}%)")

In [None]:
# Descriptive statistics of the GMV of TRIAL purchases.
get_descriptive_statistics(df1.loc[df1['subscription_feature_type'] == 'TRIAL', ['gmv_value_brl']])

In [None]:
# Spot check: one buyer's purchases of two specific products, in chronological order.
df1.loc[
    (df1['user_buyer_id'] == 47563972) & (df1['product_id'].isin([1450011, 827417]))
].sort_values(by='purchase_order_datetime')[
    ['user_buyer_id', 'purchase_transaction', 'product_id', 'is_subs', 'purchase_order_datetime',
     'subscription_feature_type', 'recurrency_number', 'subscription_id', 'recurrency_id',
     'subs_start_datetime', 'subs_cancellation_datetime', 'gmv_value_brl', 'is_club', 'purchase_status']
]

In [None]:
# Non-null counts per buyer over TRIAL purchases, ordered by number of purchases.
df1.loc[df1['subscription_feature_type'] == 'TRIAL'].groupby('user_buyer_id').count().sort_values(by='purchase_id')

In [None]:
# Subscription status of the TRIAL purchases (NaN counted).
df1.loc[df1['subscription_feature_type'] == 'TRIAL', 'subs_status'].value_counts(dropna=False)

In [None]:
# Spot check: full purchase history of a single buyer, oldest first.
df1.loc[df1['user_buyer_id'] == 43878900].sort_values(by='purchase_order_datetime')


In [None]:
# TRIAL purchases whose GMV is not zero.
df1.loc[(df1['subscription_feature_type'] == 'TRIAL') & (df1['gmv_value_brl'] != 0)]

In [None]:
# Drop every purchase whose subscription_feature_type is TRIAL (rows with NaN are kept).
orig_shape = len(df1)
df1 = df1.loc[df1['subscription_feature_type'].ne('TRIAL')]
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Products with free signup (keep entries - inconclusive)

https://help.hotmart.com/en/article/how-to-release-free-registration-in-my-members-area-/214190177

In [None]:
# Load the per-product free-signup flag and attach it to every purchase as column `allow`.
free_sign = pd.read_csv('data/free_signup.csv', delimiter = ';')
free_sign.columns = ['product_id', 'allow']
# The file is used as a lookup table: keep one row per product so the left merge below
# cannot multiply purchase rows if a product_id is listed more than once.
# NOTE(review): when a product appears with conflicting flags the first row wins - confirm.
free_sign = free_sign.drop_duplicates(subset='product_id')
df1 = df1.merge(free_sign, how = 'left', on = 'product_id')

In [None]:
# Products and purchases whose free-signup flag is True, with their share of the dataset.
free_signup_rows = df1[df1['allow'] == True]
n_free_signup_products = free_signup_rows['product_id'].nunique()
print(f"{n_free_signup_products:,} products with free signup({(n_free_signup_products/df1['product_id'].nunique() * 100):,.2f}%)")
print(f"{len(free_signup_rows):,} purchases ({(len(free_signup_rows)/len(df1) * 100):,.2f}%)")

In [None]:
# is_club breakdown of purchases of free-signup products.
df1.loc[df1['allow'] == True, 'is_club'].value_counts(dropna=False)

In [None]:
# is_subs breakdown of purchases of free-signup products.
df1.loc[df1['allow'] == True, 'is_subs'].value_counts(dropna=False)

In [None]:
# subscription_feature_type breakdown of purchases of free-signup products.
df1.loc[df1['allow'] == True, 'subscription_feature_type'].value_counts(dropna=False)

In [None]:
# Descriptive statistics of the GMV of purchases of free-signup products.
get_descriptive_statistics(df1.loc[df1['allow'] == True, ['gmv_value_brl']])

In [None]:
# Feature type of zero-GMV purchases of free-signup products.
df1.loc[
    (df1['allow'] == True) & (df1['gmv_value_brl'] == 0),
    'subscription_feature_type',
].value_counts(dropna=False)

In [None]:
# Purchases per (buyer, product, is_subs) among free-signup products ...
aux11 = (
    df1.loc[df1['allow'] == True]
    .groupby(['user_buyer_id', 'product_id', 'is_subs'])['purchase_id']
    .count()
    .reset_index()
)
# ... and the combinations whose purchase count is not exactly one.
aux11.loc[aux11['purchase_id'] != 1]

In [None]:
# Number of (buyer, product) combinations per is_subs value among free-signup products.
aux11['is_subs'].value_counts()

In [None]:
# Spot check: one buyer's purchases in chronological order, key columns only.
df1.loc[df1['user_buyer_id'] == 54474596].sort_values(by='purchase_order_datetime')[
    ['gmv_value_brl', 'product_id', 'user_buyer_id', 'allow', 'purchase_order_datetime', 'is_subs']
]

### Products with a high volume of issues (reimbursement, chargeback, single purchases, tests, zero GMV)

In [None]:
# Purchases per product and status, reshaped to one row per product / one column per status;
# combinations that never occur become 0.
status_counts = df1.groupby(['product_id', 'purchase_status'])['purchase_id'].count().reset_index()
aux1 = status_counts.pivot_table(values='purchase_id', index='product_id', columns='purchase_status').fillna(0)
# Total number of purchases of the product, over all statuses.
aux1['sum'] = aux1.sum(axis=1)
# Total GMV per product and GMV per purchase (0 when the product has no positive GMV).
aux2 = df1.groupby('product_id')['gmv_value_brl'].sum().reset_index()
aux1 = aux1.merge(aux2, how='left', on='product_id')
aux1['sum_gmv_ratio'] = np.where(aux1['gmv_value_brl'] > 0, aux1['gmv_value_brl'] / aux1['sum'], 0)
# Standard deviation of GMV per product.
aux3 = (
    df1[['gmv_value_brl', 'product_id']]
    .groupby('product_id')
    .agg(np.std)
    .reset_index()
    .rename(columns={'gmv_value_brl': 'std'})
)
aux1 = aux1.merge(aux3, how='left', on='product_id')
get_descriptive_statistics(aux1)

In [None]:
# --- Validators -------------------------------------------------------------------------
# Share (in %) of each issue status relative to successful purchases (COMPLETO + APROVADO)
# plus that issue status. A BLOQUEADO ratio was also considered but is disabled.
successful_purchases = aux1['COMPLETO'] + aux1['APROVADO']
issue_ratio_columns = {
    'cbk_compl_ratio': 'CHARGEBACK',
    'reim_compl_ratio': 'REEMBOLSADO',
    'recl_compl_ratio': 'RECLAMADO',
    'exp_compl_ratio': 'EXPIRADO',
    'par_compl_ratio': 'PARCIALMENTE REEMBOLSADO',
}
for ratio_column, issue_status in issue_ratio_columns.items():
    aux1[ratio_column] = aux1[issue_status] / (successful_purchases + aux1[issue_status]) * 100
# Product with at most one purchase overall.
aux1['is_test'] = (aux1['sum'] <= 1).astype(int)
# Product without a single COMPLETO or APROVADO purchase.
aux1['only_issues'] = ((aux1['APROVADO'] + aux1['COMPLETO']) == 0).astype(int)

# --- Removal criteria -------------------------------------------------------------------
# The first matching rule wins; everything else is 'valid'.
# Disabled experiments, not applied here: per-status labels ('bigger_than_complete',
# 'equal_sum') and ratio rules that flagged a product when an issue ratio was >= 80 and < 100.
aux1['remove_reason'] = np.select(
    [
        aux1['only_issues'] == 1,     # no COMPLETO or APROVADO purchases at all
        aux1['gmv_value_brl'] == 0,   # product has no GMV
        aux1['is_test'] == 1,         # product has one purchase or fewer
    ],
    ['only_issues', 'gmv_zero', 'single_purchase'],
    default='valid',
)
aux1['remove_reason'].value_counts(dropna=False)

In [None]:
### test validators
# aux1.sort_values('BLOQUEADO', ascending=False) # valid
# aux1.sort_values('CHARGEBACK', ascending=False).head(50) # valid
# aux1.sort_values('COMPLETO', ascending=False).head(50) # valid
# aux1.sort_values('EXPIRADO', ascending=False).head(50) # valid
# aux1.sort_values('PARCIALMENTE REEMBOLSADO', ascending=False).head(50) # valid
# aux1.sort_values('RECLAMADO', ascending=False).head(50) # valid
# aux1.sort_values('REEMBOLSADO', ascending=False).head(50) # valid
# aux1.sort_values(['cbk_compl_ratio'], ascending=[False]).head(50) # valid
# aux1.sort_values(['reim_compl_ratio'], ascending=[False]).head(50) # valid
# aux1.sort_values(['sum'], ascending=[False]).head(50) # valid, keep track of outliers
# aux1[(aux1['bigger_than_complete'] == 'REEMBOLSADO') & ((aux1['reim_compl_ratio'] >= 70) &  (aux1['reim_compl_ratio'] < 100))].sort_values('REEMBOLSADO', ascending=False).head(45) # remove users
# aux1[(aux1['bigger_than_complete'] == 'CHARGEBACK') & (aux1['cbk_compl_ratio'] == 100)].sort_values('CHARGEBACK', ascending=False) # remove users
# aux1.sort_values(['sum_gmv_ratio'], ascending=False).head(50)

In [None]:
# Keep only the purchases of products whose removal reason is 'valid'.
aux2 = aux1.loc[aux1['remove_reason'] == 'valid', ['product_id']].drop_duplicates()
orig_shape = len(df1)
df1 = pd.merge(df1, aux2, on='product_id', how='inner')
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Repurchases in which 2nd transaction was from same product and has reimbursement (excl. recurrency)

In [None]:
## Buyers who bought the same product exactly twice as single payments
pair_keys = ['user_buyer_id', 'product_id']
# Both the count and the pair reconstruction use single-payment rows only. Merging the pair
# list against all of df1 would pull subscription rows of the same buyer/product into a
# group that is assumed to hold exactly two purchases.
single_payment_rows = df1[df1['is_subs'] == 'single-payment']
aux3 = single_payment_rows.groupby(pair_keys)['purchase_id'].count().reset_index()
aux3 = aux3[aux3['purchase_id'] == 2]  # only buyers who purchased the same product twice
aux3.columns = ['user_buyer_id', 'product_id', 'count_purchases']
aux1 = single_payment_rows.merge(aux3, how = 'inner', on = pair_keys)
# "...2" / lag columns hold the values of the chronologically next purchase of the pair;
# the frame is sorted once and the shifted values are aligned back by index.
next_in_pair = aux1.sort_values('purchase_order_datetime').groupby(pair_keys)
aux1['purchase_status2'] = next_in_pair['purchase_status'].shift(-1)
aux1['purchase_id2'] = next_in_pair['purchase_id'].shift(-1)
aux1['double_pur_stats'] = aux1['purchase_status'] + '-' + aux1['purchase_status2']
aux1['purchase_order_lag'] = next_in_pair['purchase_order_datetime'].shift(-1)
# Keep one row per pair (the earlier purchase, which has a successor) ...
aux1 = aux1[aux1['purchase_order_lag'].notnull()]
# ... and ignore pairs in which both purchases are COMPLETO.
aux1 = aux1[aux1['double_pur_stats'] != 'COMPLETO-COMPLETO'].copy()
# Purchase to drop from each remaining pair:
#   first not COMPLETO (whatever the second is) -> the first purchase
#   first COMPLETO, second not COMPLETO         -> the second purchase
aux1['purchase_id_remove'] = np.where(
    aux1['purchase_status'] != 'COMPLETO',
    aux1['purchase_id'],
    aux1['purchase_id2'],
).astype(int)


In [None]:
# Frequency of each "<first status>-<second status>" combination among the retained pairs.
aux1['double_pur_stats'].value_counts(dropna=False)

In [None]:
# Share of each purchase status in df1, in percent.
df1['purchase_status'].value_counts(normalize=True, dropna=False).mul(100)

In [None]:
orig_shape = len(df1)
# Purchase ids selected for removal, tagged so they can be filtered out after the merge.
aux2 = (
    aux1[['purchase_id_remove']]
    .drop_duplicates()
    .rename(columns={'purchase_id_remove': 'purchase_id'})
    .assign(remove_transaction=1)
)
df1 = df1.merge(aux2, on='purchase_id', how='left')
# Unmatched purchases have NaN in the flag column and are kept.
df1 = df1.loc[df1['remove_transaction'] != 1].drop(columns='remove_transaction')
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Repurchase whose time to repurchase was zero

In [None]:
# Number of distinct buyers currently in df1.
df1['user_buyer_id'].nunique()

In [None]:
### Build "next purchase" features per buyer and the time gap to that next purchase
aux1 = df1.copy()
# Sort once and reuse the grouped view for every shifted column. The shifted values are
# aligned back to aux1 by index, so aux1 itself keeps its row order.
next_by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_order_lag'] = next_by_user['purchase_order_datetime'].shift(-1)
aux1['purchase_id2'] = next_by_user['purchase_id'].shift(-1)
# Chronological position of each purchase per buyer, by order date and by release date.
aux1['order_purchase'] = aux1.groupby('user_buyer_id')['purchase_order_datetime'].rank(method='first', ascending=True)
aux1['order_release'] = aux1.groupby('user_buyer_id')['purchase_release_datetime'].rank(method='first', ascending=True)
# "<column>2" = value of <column> on the buyer's next purchase (NaN on the last one).
for source_column in [
    'purchase_status', 'is_club', 'is_subs', 'purchase_payment_type',
    'purchase_sale_type', 'payment_method_description', 'product_id', 'producer_id',
]:
    aux1[source_column + '2'] = next_by_user[source_column].shift(-1)
# 1 when the next purchase shares the product / status, else 0 (also 0 when there is no next).
aux1['same_product'] = (aux1['product_id2'] == aux1['product_id']).astype(int)
aux1['same_purchase_status'] = (aux1['purchase_status2'] == aux1['purchase_status']).astype(int)
aux1['double_pur_stats'] = aux1['purchase_status'] + '-' + aux1['purchase_status2']
aux1['double_subs'] = aux1['is_subs'] + '-' + aux1['is_subs2']
aux1['double_club'] = aux1['is_club'] + '-' + aux1['is_club2']
# Gap to the next purchase, for rows that have one. Explicit copy: columns are added below.
aux2 = aux1[aux1['purchase_order_lag'].notnull()].copy()
gap_seconds = (aux2['purchase_order_lag'] - aux2['purchase_order_datetime']).dt.total_seconds()
aux2['delta_days'] = gap_seconds / 60 / 60 / 24
aux2['delta_sec'] = gap_seconds
pd.concat([get_descriptive_statistics(aux2[['delta_days']]), get_descriptive_statistics(aux2[['delta_sec']])])

In [None]:
# Histogram (with KDE) of the gaps to the next purchase that are shorter than `minutes` minutes.
minutes = 1
sns.set_style('white')
sns.histplot(
    data=aux2.loc[aux2['delta_sec'] < 60 * minutes],
    x='delta_sec',
    kde=True,
    color='darkcyan',
)
plt.show()

In [None]:
# Purchases whose next purchase by the same buyer has exactly the same order timestamp.
aux4 = aux2.loc[aux2['delta_sec'].eq(0)]

In [None]:
# purchase_status distribution over the zero-gap rows (NaN counted).
aux4['purchase_status'].value_counts(dropna=False)

In [None]:
# user_type distribution over the zero-gap rows (NaN counted).
aux4['user_type'].value_counts(dropna=False)

In [None]:
# "<status>-<next status>" combinations over the zero-gap rows.
aux4['double_pur_stats'].value_counts(dropna=False)

In [None]:
# "<is_subs>-<next is_subs>" combinations over the zero-gap rows.
aux4['double_subs'].value_counts(dropna=False)

In [None]:
# "<is_club>-<next is_club>" combinations over the zero-gap rows.
aux4['double_club'].value_counts(dropna=False)

In [None]:
# How often the simultaneous next purchase is for the same product (1) or another one (0).
aux4['same_product'].value_counts(dropna=False)

In [None]:
# product_category distribution over the zero-gap rows (NaN counted).
aux4['product_category'].value_counts(dropna=False)

In [None]:
# user_type distribution over the zero-gap rows (same check as a few cells above).
aux4['user_type'].value_counts(dropna=False)

In [None]:
# is_subs distribution over the zero-gap rows (NaN counted).
aux4['is_subs'].value_counts(dropna=False)

In [None]:
# is_club distribution over the zero-gap rows (NaN counted).
aux4['is_club'].value_counts(dropna=False)

In [None]:
# Spot check: full purchase history of a single buyer, oldest first.
df1.loc[df1['user_buyer_id'] == 52650803].sort_values(by='purchase_order_datetime')

In [None]:
# aux4[['purchase_id', 'purchase_id2', 'user_buyer_id', 'user_type', 'product_id', 'product_id2','same_product', 'order_purchase', 'order_release','purchase_status', 'purchase_status2','double_pur_stats', 'purchase_order_datetime','purchase_order_lag', 'delta_days', 'delta_sec']]

In [None]:
# All purchases made by buyers who have at least one zero-gap repurchase.
aux6 = pd.merge(
    df1,
    aux4[['user_buyer_id']].astype(int).drop_duplicates(),
    on='user_buyer_id',
    how='inner',
)
print(f"{len(aux6):,} transactions {(len(aux6) / len(df1) * 100):.2f}% would be removed if user_buyer_id becomes the main filter")

In [None]:
orig_shape = len(df1)
# Buyers with at least one zero-gap repurchase; every purchase of these buyers is dropped.
flagged_buyers = aux4['user_buyer_id'].astype(int).drop_duplicates()
from_flagged_buyer = df1['user_buyer_id'].isin(flagged_buyers)
# aux6 keeps the list of removed purchase ids, as before.
aux6 = df1.loc[from_flagged_buyer, ['purchase_id']].assign(remove_first=1)
# A direct membership filter replaces the former merge -> flag -> merge-back round trip,
# which relied on purchase_id being unique to avoid multiplying rows.
df1 = df1[~from_flagged_buyer]
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### Purchases whose time to repurchase was very low

In [None]:
### Build "next purchase" features per buyer and the time gap to that next purchase
aux1 = df1.copy()
# Sort once and reuse the grouped view for every shifted column. The shifted values are
# aligned back to aux1 by index, so aux1 itself keeps its row order.
next_by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_order_lag'] = next_by_user['purchase_order_datetime'].shift(-1)
aux1['purchase_id2'] = next_by_user['purchase_id'].shift(-1)
# Chronological position of each purchase per buyer, by order date and by release date.
aux1['order_purchase'] = aux1.groupby('user_buyer_id')['purchase_order_datetime'].rank(method='first', ascending=True)
aux1['order_release'] = aux1.groupby('user_buyer_id')['purchase_release_datetime'].rank(method='first', ascending=True)
# "<column>2" = value of <column> on the buyer's next purchase (NaN on the last one).
# purchase_sale_type2 is computed once here; it used to be assigned twice with the same value.
for source_column in [
    'purchase_status', 'purchase_payment_type', 'purchase_sale_type',
    'payment_method_description', 'product_id', 'producer_id', 'device', 'allow',
]:
    aux1[source_column + '2'] = next_by_user[source_column].shift(-1)
# 1 when the next purchase shares the product / status, else 0 (also 0 when there is no next).
aux1['same_product'] = (aux1['product_id2'] == aux1['product_id']).astype(int)
aux1['same_purchase_status'] = (aux1['purchase_status2'] == aux1['purchase_status']).astype(int)
aux1['double_pur_stats'] = aux1['purchase_status'] + '-' + aux1['purchase_status2']
aux1['double_device'] = aux1['device'] + '-' + aux1['device2']
aux1['double_purchase_sale_type'] = aux1['purchase_sale_type'] + '-' + aux1['purchase_sale_type2']
# Gap to the next purchase, for rows that have one. Explicit copy: columns are added below.
aux2 = aux1[aux1['purchase_order_lag'].notnull()].copy()
gap_seconds = (aux2['purchase_order_lag'] - aux2['purchase_order_datetime']).dt.total_seconds()
aux2['delta_days'] = gap_seconds / 60 / 60 / 24
aux2['delta_sec'] = gap_seconds
pd.concat([get_descriptive_statistics(aux2[['delta_days']]), get_descriptive_statistics(aux2[['delta_sec']])])

In [None]:
# Histogram (with KDE) of the gaps, in seconds, that are shorter than `days` days.
days = 1
sns.set_style('white')
sns.histplot(
    data=aux2.loc[aux2['delta_days'] < days],
    x='delta_sec',
    kde=True,
    color='darkcyan',
)
plt.show()

In [None]:
# Purchases followed by another purchase of the same buyer within one day (inclusive).
# A fixed cutoff is used; the 1st-percentile alternatives below are disabled:
#   threshold = np.percentile(aux2['delta_days'], 1)
#   np.percentile(aux2['delta_sec'], 1)
aux4 = aux2.loc[aux2['delta_days'].le(1)]

In [None]:
# Free-signup flag distribution over the short-gap rows (NaN counted).
aux4['allow'].value_counts(dropna=False)

In [None]:
# order_bump_type distribution over the short-gap rows (NaN counted).
aux4['order_bump_type'].value_counts(dropna=False)

In [None]:
# purchase_status distribution over the short-gap rows (NaN counted).
aux4['purchase_status'].value_counts(dropna=False)

In [None]:
# user_type distribution over the short-gap rows (NaN counted).
aux4['user_type'].value_counts(dropna=False)

In [None]:
# "<status>-<next status>" combinations over the short-gap rows.
aux4['double_pur_stats'].value_counts(dropna=False)

In [None]:
# How often the next purchase is for the same product (1) or another one (0).
aux4['same_product'].value_counts(dropna=False)

In [None]:
# device distribution over the short-gap rows (NaN counted).
aux4['device'].value_counts(dropna=False)

In [None]:
# "<device>-<next device>" combinations over the short-gap rows.
aux4['double_device'].value_counts(dropna=False)

In [None]:
# "<sale type>-<next sale type>" combinations over the short-gap rows.
aux4['double_purchase_sale_type'].value_counts(dropna=False)

In [None]:
# purchase_sale_type of the earlier purchase in each short-gap pair.
aux4['purchase_sale_type'].value_counts(dropna=False)

In [None]:
# purchase_sale_type of the following purchase in each short-gap pair.
aux4['purchase_sale_type2'].value_counts(dropna=False)

In [None]:
# user_type distribution over the short-gap rows (same check as a few cells above).
aux4['user_type'].value_counts(dropna=False)

In [None]:
# product_category distribution over the short-gap rows (NaN counted).
aux4['product_category'].value_counts(dropna=False)

In [None]:
# Side-by-side view of each short-gap purchase and its successor: ids, products, statuses,
# per-buyer ranks, both timestamps and the gap in days / seconds.
aux4[['purchase_id', 'purchase_id2', 'user_buyer_id', 'user_type', 'product_id', 'product_id2','same_product', 'order_purchase', 'order_release','purchase_status', 'purchase_status2','double_pur_stats', 'purchase_order_datetime','purchase_order_lag', 'delta_days', 'delta_sec']]

In [None]:
orig_shape = len(df1)
# Ids of the *following* purchase of each short-gap pair (purchase_id2). Despite its name,
# the 'remove_first' flag therefore marks the later purchase of the pair.
aux6 = (
    aux4[['purchase_id2']]
    .astype(int)
    .drop_duplicates()
    .rename(columns={'purchase_id2': 'purchase_id'})
    .assign(remove_first=1)
)
df1 = df1.merge(aux6, on='purchase_id', how='left')
# Unmatched purchases have NaN in the flag column and are kept.
df1 = df1.loc[df1['remove_first'] != 1].drop(columns='remove_first')
# Report the effect of the filter.
print(f'Outliers removed: {(orig_shape-len(df1)):,} ({((orig_shape-len(df1)) / orig_shape * 100):.2f}%) ')
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

In [None]:
### Recompute the gap to each buyer's next purchase on the filtered dataset
aux1 = df1.copy()
aux1['purchase_order_lag'] = (
    aux1.sort_values('purchase_order_datetime')
    .groupby('user_buyer_id')['purchase_order_datetime']
    .shift(-1)
)
# Only rows that have a following purchase carry a gap.
aux2 = aux1.loc[aux1['purchase_order_lag'].notnull()].copy()
gap_seconds = (aux2['purchase_order_lag'] - aux2['purchase_order_datetime']).dt.total_seconds()
aux2['delta_days'] = gap_seconds / 60 / 60 / 24
aux2['delta_sec'] = gap_seconds
pd.concat([get_descriptive_statistics(aux2[['delta_days']]), get_descriptive_statistics(aux2[['delta_sec']])])

In [None]:
# Histogram (with KDE) of the gaps, in seconds, that are shorter than 30 days.
# `minutes` is assigned here but not used by this plot.
minutes = 2
sns.set_style('white')
sns.histplot(
    data=aux2.loc[aux2['delta_days'] < 30],
    x='delta_sec',
    kde=True,
    color='darkcyan',
)
plt.show()

### Products with too high std and low purchase volume (keep - inconclusive)

In [None]:
# Purchases per product and status, reshaped to one row per product / one column per status;
# combinations that never occur become 0.
status_counts = df1.groupby(['product_id', 'purchase_status'])['purchase_id'].count().reset_index()
aux1 = status_counts.pivot_table(values='purchase_id', index='product_id', columns='purchase_status').fillna(0)
# Total number of purchases of the product, over all statuses.
aux1['sum'] = aux1.sum(axis=1)
# Total GMV per product and GMV per purchase (0 when the product has no positive GMV).
aux2 = df1.groupby('product_id')['gmv_value_brl'].sum().reset_index()
aux1 = aux1.merge(aux2, how='left', on='product_id')
aux1['sum_gmv_ratio'] = np.where(aux1['gmv_value_brl'] > 0, aux1['gmv_value_brl'] / aux1['sum'], 0)
# Standard deviation of GMV per product.
aux3 = (
    df1[['gmv_value_brl', 'product_id']]
    .groupby('product_id')
    .agg(np.std)
    .reset_index()
    .rename(columns={'gmv_value_brl': 'std'})
)
# Smallest and largest GMV per product, and their spread.
aux4 = df1.groupby('product_id')['gmv_value_brl'].agg(['min', 'max']).reset_index()
aux4.columns = ['product_id', 'min', 'max']
aux4['range'] = aux4['max'] - aux4['min']
# Keep only the dispersion / volume measures and attach them to every purchase row.
aux1 = aux1.merge(aux3, how='left', on='product_id')
aux1 = aux1.merge(aux4, how='left', on='product_id')[['product_id', 'std', 'sum', 'max', 'min', 'range']]
df1 = df1.merge(aux1, how='left', on='product_id')
# Report the resulting shape.
print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

In [None]:
# Descriptive statistics of the per-product std / purchase count / min GMV.
# NOTE(review): duplicated(keep='first') selects every row EXCEPT the first occurrence of each
# product_id, so these figures are weighted by purchase volume and skip products with a single
# row. If one row per product was intended, drop_duplicates('product_id') would be needed - confirm.
repeat_product_rows = df1[df1['product_id'].duplicated(keep='first')]
pd.concat([get_descriptive_statistics(repeat_product_rows[[measure]]) for measure in ['std', 'sum', 'min']])

In [None]:
 
# Candidate cut-offs: 1st percentile of purchase count, 95th percentile of GMV std
# (stored in `std99`) and 75th percentile of minimum GMV (stored in `min1`).
# NOTE(review): duplicated(keep='first') keeps every row except the first one of each
# product_id, so the percentiles are taken over repeated purchase rows, not over one row
# per product - confirm this is intended.
repeat_product_rows = df1[df1['product_id'].duplicated(keep='first')]
sum1 = np.percentile(repeat_product_rows[['sum']], 1)
std99 = np.percentile(repeat_product_rows[['std']], 95)
min1 = np.percentile(repeat_product_rows[['min']], 75)


In [None]:
# Show the three candidate thresholds; the removal that would use them is commented out further down.
print(f"Remove transactions with std above 95th percentile {std99:,.2f}, purchase vol below 1st percentile {sum1:,.2f}, and min gmv below 75th percentile {min1:,.4f}")

In [None]:
# df1[(df1['std'] >= std99) & (df1['sum'] <= sum1) & (df1['min'] <= min1)].sort_values(['std'], ascending=[False])[['product_id', 'purchase_id', 'user_buyer_id', 'gmv_value_brl', 'std', 'sum', 'max', 'min', 'range']].tail(50)

In [None]:
# orig_shape = len(df1)
# df1 = df1[~((df1['std'] >= std99) & (df1['sum'] <= sum1) & (df1['min'] <= min1))]
# df1 = df1.drop(['std', 'sum', 'max', 'min', 'range'] , axis = 1)
# # check dataset shape and column names
# print(f'Outliers removed: {(orig_shape-len(df1)):,} ')
# print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

### failed products - low GMV and transaction volume (keep - inconclusive)

In [None]:
# Per-product summary: transaction counts by purchase status, total GMV,
# GMV standard deviation and average ticket.

# transactions by product: one row per product, one column per purchase status
status_counts = (
    df1[['purchase_status', 'purchase_id', 'product_id']]
    .groupby(['product_id', 'purchase_status'])
    .count()
    .reset_index()
)
aux1 = status_counts.pivot_table(values='purchase_id',
                                 index='product_id',
                                 columns='purchase_status').fillna(0)
# total number of transactions across all statuses
aux1['sum'] = aux1.sum(axis=1)

# total GMV per product
aux2 = df1[['gmv_value_brl', 'product_id']].groupby('product_id').sum().reset_index()
aux1 = aux1.merge(aux2, how='left', on='product_id')

# GMV per transaction, forced to 0 when the product has no positive GMV.
# Vectorized: avoids a Python-level lambda call for every product row.
aux1['sum_gmv_ratio'] = (aux1['gmv_value_brl'] / aux1['sum']).where(aux1['gmv_value_brl'] > 0, 0)

# GMV standard deviation per product.
# The string alias pins pandas' own std (sample std, ddof=1). Passing np.std relies on
# pandas silently swapping the callable for its own implementation, which is deprecated.
aux3 = df1[['gmv_value_brl', 'product_id']].groupby('product_id').agg('std').reset_index()
aux3.columns = ['product_id', 'std']
aux1 = aux1.merge(aux3, how='left', on='product_id')

# average ticket; same ratio as 'sum_gmv_ratio' but without the "GMV > 0" guard
aux1['ticket'] = aux1['gmv_value_brl'] / aux1['sum']
get_descriptive_statistics(aux1)

In [None]:
# transaction-count mark: 75th percentile of 'sum' across products
sum75 = np.percentile(a=aux1['sum'], q=75)
# GMV mark: 10th percentile of 'gmv_value_brl' across products
gmv10 = np.percentile(a=aux1['gmv_value_brl'], q=10)

In [None]:
# Report the two cut-offs used to flag failed products.
failed_rule = f"Remove products with transactions below {sum75:,.2f}, gmv below {gmv10:,.2f}"
print(failed_rule)

In [None]:
# aux1.sort_values('ticket').head(50)

In [None]:
 
# Flag products to drop (1) or keep (0).
# A product is flagged when EITHER
#   * its transaction count is at or below sum75 AND its GMV is at or below gmv10, OR
#   * its average ticket is at or below ticket_cutoff.
# NOTE(review): the origin of the 0.1761 ticket threshold is not shown in this section;
# confirm and document it.
ticket_cutoff = 0.1761

low_volume_low_gmv = (aux1['sum'] <= sum75) & (aux1['gmv_value_brl'] <= gmv10)
tiny_ticket = aux1['ticket'] <= ticket_cutoff

# Vectorized boolean masks replace the row-wise apply: same flags, no per-row Python
# call, and it also works when aux1 is empty. Comparisons with NaN are False, so a
# product with a missing ticket is only flagged through the first rule.
aux1['remove'] = (low_volume_low_gmv | tiny_ticket).astype('int64')

In [None]:
# aux1.sort_values(['ticket', 'gmv_value_brl'], ascending = [True, False]).head(50)

In [None]:
# Products whose transaction count is at or below the 75th-percentile mark (sum75)
# AND whose generated GMV is at or below the 10th-percentile mark (gmv10).
few_sales = aux1['sum'] <= sum75
little_gmv = aux1['gmv_value_brl'] <= gmv10
aux1[few_sales & little_gmv]

In [None]:
# how many products carry each value of the 'remove' flag
aux1.value_counts(subset='remove')

In [None]:
# orig_shape = len(df1)
# df1 = df1.merge(aux1[aux1['remove'] == 0][['product_id']], how = 'inner', on = 'product_id')
# # check dataset shape and column names
# print(f'Outliers removed: {(orig_shape-len(df1)):,} ')
# print(f'The dataset has {df1.shape[0]:,} rows and {df1.shape[1]} columns.')

# STEP 02 - FEATURE ENGINEERING

In [None]:
# Checkpoint for the feature-engineering step: df2 is loaded from a pickle instead of being
# rebuilt from df1. The commented lines below are the code that reports removed rows and
# writes that pickle.
# print(f"Total removed rows: {(count_df - len(df1)):,} {((count_df - len(df1)) / len(df1) * 100 ):.2f}%")

# # copy dataset
# df2 = df1.copy()
# df2.to_pickle('data/df2.pkl')
# read pickle file as dataframe
# NOTE(review): read_pickle can execute arbitrary code from the file; only load pickles
# written by this notebook.
df2 = pd.read_pickle('data/df2.pkl')

In [None]:
# check dtypes: display the dtype of every df2 column
df2.dtypes

In [None]:
# distinct raw category values present in df2
df2['category'].unique()

In [None]:
# df2 = df2.drop(['std', 'sum', 'max',
#        'min', 'range'], axis = 1)

In [None]:
### count purchases per user
# NOTE: despite its name, 'no_repurchase' holds the user's total number of purchases.
aux1 = df2.groupby('user_buyer_id')[['purchase_id']].count().reset_index()
aux1.columns = ['user_buyer_id', 'no_repurchase']
df2 = df2.merge(aux1, how='left', on='user_buyer_id')

### flag repurchase
# A user is a repurchaser only with MORE than one purchase. Testing "> 1" instead of
# "!= 1" stops a missing or zero count (e.g. rows without a buyer id) from being
# labelled 'repurchase'.
df2['repurchase'] = np.where(df2['no_repurchase'] > 1, 'repurchase', 'single-purchase')

### flag whether a user repurchased from same producer at least once
# purchases per (producer, user) pair, restricted to repurchasing users
aux1 = (
    df2[df2['repurchase'] == 'repurchase']
    .groupby(['producer_id', 'user_buyer_id'])[['purchase_id']]
    .count()
    .reset_index()
)
aux1['same_prod'] = np.where(aux1['purchase_id'] > 1, 'repurchased', 'did not repurchase')
# users who bought more than once from at least one producer
aux2 = aux1[aux1['same_prod'] == 'repurchased'][['user_buyer_id']].drop_duplicates()
aux2['repurchase_same_producer'] = 'same_producer_repurchase'
# attach the flag; users absent from aux2 get NaN here and are resolved just below
df2 = df2.merge(aux2, how='left', on='user_buyer_id')
# single-purchase users cannot have repurchased from anyone
df2.loc[df2['repurchase'] == 'single-purchase', 'repurchase_same_producer'] = 'no_repurchase'
# whoever is still NaN repurchased, but never twice from the same producer
df2['repurchase_same_producer'] = df2['repurchase_same_producer'].fillna('diff_producer_repurchase')

# chronological rank of each purchase within its user (ties broken by row order)
df2['order_purchase'] = df2.groupby("user_buyer_id")["purchase_order_datetime"].rank(method="first", ascending=True)
df2['order_release'] = df2.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

# number of distinct products sold by each producer
aux1 = df2.groupby('producer_id')[['product_id']].nunique().reset_index()
aux1.columns = ['producer_id', 'num_products_producer']
df2 = df2.merge(aux1, how='left', on='producer_id')

# get year-month purchase: first 7 characters of the timestamp's string form
# NOTE(review): assumes the string form starts with 'YYYY-MM' — confirm the column dtype
df2['year_month'] = df2['purchase_order_datetime'].astype(str).str[0:7]

In [None]:
# Raw (Portuguese) category -> short English label.
# Series.map returns NaN for any category that is not a key of this table; per the
# original note, SPARKLE is left out.
# NOTE(review): the first key has two spaces between "E" and "PETS" — presumably it
# matches the raw data; confirm.
category_labels = {
    "ANIMAIS E  PETS": "Pets",
    "AUTOCONHECIMENTO E ESPIRITUALIDADE": "Self-awareness & Spirit",
    "MODA E BELEZA": "Fashion & Beauty",
    "SAÚDE E ESPORTES": "Health & Sports",
    "RELACIONAMENTOS": "Relationships",
    "CULINÁRIA E GASTRONOMIA": "Gastronomy",
    "HOBBIES E LAZER": "Hobbies & Leisure",
    "CARREIRA E DESENVOLVIMENTO PESSOAL": "Career & Personal Dev.",
    "ENSINO E ESTUDO ACADÊMICO": "Acad. Learning & Teaching",
    "DESIGN E FOTOGRAFIA": "Design & Photo",
    "MÚSICA E ARTES": "Music & Arts",
    "EDUCAÇÃO INFANTIL E FAMÍLIA": "Child Ed. & Family",
    "PLANTAS E ECOLOGIA": "Plants & Ecology",
    "FINANÇAS E NEGÓCIOS": "Business & Finance",
    "ENGENHARIA E ARQUITETURA": "Eng & Arch",
    "MANUTENÇÃO DE EQUIPAMENTOS": "Equipment Maintenance",
    "MARKETING E VENDAS": "MKT & Sales",
    "TECNOLOGIA E DESENVOLVIMENTO DE SOFTWARE": "Tech & Software Dev.",
}
df2['category_cleaned'] = df2['category'].map(category_labels)

# STEP 03 - EXPLORATORY DATA ANALYSIS

In [None]:
# Checkpoint for the EDA step: df3 is loaded from a pickle. The commented lines below are the
# code that builds df3 from df2 plus the producers GMV file and writes that pickle.
# ### copy dataset
# df3 = df2.copy()
# prod = pd.read_csv('data/producers_ltv.csv', delimiter=';')
# prod.columns = ['producer_id', 'gmv_brl_exact', 'gmv_brl_entire', 'gmv_2021', 'gmv_2022']

# # get gmv data
# aux1 = df3[['producer_id']].drop_duplicates().merge(prod, how = 'left', on = 'producer_id')
# aux1['gmv_2021'] = aux1['gmv_2021'].fillna(0) 
# aux1['gmv_2022'] = aux1['gmv_2022'].fillna(0)
# aux1['mean_gmv_ltv'] = (aux1['gmv_2021'] + aux1['gmv_2022']) / 2
# aux1['is_below_10_exact'] = aux1['gmv_brl_exact'].apply(lambda x : 'above' if x >= 10000 else 'below') ## gmv made during the study period
# aux1['is_below_10_entire'] = aux1['gmv_brl_entire'].apply(lambda x : 'above' if x >= 10000 else 'below')  ## gmv done during entire lifetime
# aux1['is_below_10_mean'] = aux1['mean_gmv_ltv'].apply(lambda x : 'above' if x >= 10000 else 'below') ## mean gmv made in 21/22
# # merge
# df3 = df3.merge(aux1, on = 'producer_id', how = 'left')

# df3.to_pickle('data/df3.pkl')

# # read pickle file as dataframe
# NOTE(review): read_pickle can execute arbitrary code from the file; only load pickles
# written by this notebook.
df3 = pd.read_pickle('data/df3.pkl')

In [None]:
# smallest purchase_order_datetime value in df3
df3['purchase_order_datetime'].min()

In [None]:
# largest purchase_order_datetime value in df3
df3['purchase_order_datetime'].max()

In [None]:
## remove spines
# splot.spines['right'].set_visible(False)
# splot.spines['top'].set_visible(False)
# splot.spines['left'].set_visible(False)
# splot.spines['bottom'].set_visible(False)
# splot.axes.get_xaxis().set_visible(False)
# splot.axes.get_yaxis().set_visible(False)
# plt.yticks(fontsize =16)  

## Univariates (Sanity Checks)

In [None]:
# Distinct-value count of each key identifier column in df3.
entity_columns = [
    ("Users", 'user_buyer_id'),
    ("Purchases", 'purchase_id'),
    ("Products", 'product_id'),
    ("Producers", 'producer_id'),
    ("MKT CATEGORIES", 'category'),
]
for label, column in entity_columns:
    print(f"{label}: {df3[column].nunique():,}")

### Customer Country (TOP 10): Num. of Customers

In [None]:
# ten countries with the most purchase rows (not used by the chart below)
purchases_per_country = df3.groupby('user_country')[['purchase_id']].count().reset_index()
top10 = list(purchases_per_country.sort_values('purchase_id', ascending=False)['user_country'].head(10))

plt.figure(figsize=(30, 10))
# distinct buyers per customer country, as a share of the all-country total;
# only the ten largest shares are kept
aux1 = df3.groupby('user_country')[['user_buyer_id']].nunique().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False).head(10)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['user_country']
splot = sns.barplot(data=aux1, x='%', y='user_country', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE: this total is taken after the top-10 cut, so it covers only the plotted countries
plt.title(f" Users: {aux1['user_buyer_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Customer Country (TOP 10): Num of transactions (based on customer country)

In [None]:
plt.figure(figsize=(30, 10))
# distinct purchases per customer country, as a share of the all-country total;
# only the ten largest shares are kept
aux1 = df3.groupby('user_country')[['purchase_id']].nunique().reset_index()
aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False).head(10)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['user_country']
splot = sns.barplot(data=aux1, x='%', y='user_country', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE: this total is taken after the top-10 cut, so it covers only the plotted countries
plt.title(f" Transactions: {aux1['purchase_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Producer Office: Num of Customers

In [None]:
plt.figure(figsize=(30, 10))
# distinct buyers per office, as a share of the sum over all offices
aux1 = df3.groupby('user_office_name')[['user_buyer_id']].nunique().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['user_office_name']
splot = sns.barplot(data=aux1, x='%', y='user_office_name', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

plt.title(f" Customers: {aux1['user_buyer_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

In [None]:
# Ad-hoc product of three hard-coded literals; none of them is derived from a variable here.
# NOTE(review): presumably two shares applied to a total read off the charts — confirm and
# name the factors.
0.0059 * 6636573 * 0.086

### Producer Office: Num of producers

In [None]:
plt.figure(figsize=(30, 10))
# distinct producers per office, as a share of the sum over all offices
aux1 = df3.groupby('user_office_name')[['producer_id']].nunique().reset_index()
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['user_office_name']
splot = sns.barplot(data=aux1, x='%', y='user_office_name', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

plt.title(f" Num Producers: {aux1['producer_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Producer Office: Num of transactions

In [None]:
plt.figure(figsize=(30, 10))
# distinct purchases per office, as a share of the sum over all offices
aux1 = df3.groupby('user_office_name')[['purchase_id']].nunique().reset_index()
aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['user_office_name']
splot = sns.barplot(data=aux1, x='%', y='user_office_name', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

plt.title(f" Transaction Volume: {aux1['purchase_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Product Category: Num of transactions

In [None]:
plt.figure(figsize=(30, 10))
# distinct purchases per product category, as a share of the sum over all categories
aux1 = df3.groupby('category_cleaned')[['purchase_id']].nunique().reset_index()
aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

plt.title(f" Num transactions: {aux1['purchase_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Product Category: Num of transactions, by office

In [None]:
# Share of distinct purchases per product category, one chart per office.
# A single loop covers every office so the chart definition lives in one place.
offices = ['BRAZIL', 'COLOMBIA', 'MEXICO', 'SPAIN', 'USA', 'AMSTERDAM']

for country in offices:
    # distinct purchases per category for this office, as a share of the office total
    aux1 = (
        df3[df3['user_office_name'] == country]
        .groupby('category_cleaned')[['purchase_id']]
        .nunique()
        .reset_index()
    )
    if aux1.empty:
        # keep going so an office without data does not block the remaining charts
        print(f"{country}: no transactions found, chart skipped")
        continue
    aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)

    # horizontal bars, largest share first (aux1 is already sorted)
    plt.figure(figsize=(30, 10))
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux1, palette=palette_topics,
                        order=aux1['category_cleaned'])

    # write each bar's value just past its right edge
    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}",
                       xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')

    plt.title(f"{country} |  Num transactions: {(aux1['purchase_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("%")
    plt.ylabel("")
    plt.xlim(0, 110)
    plt.show()

### Product Category: Num of transactions, by segment

In [None]:
# Share of distinct purchases per product category, one chart per segment.
# A single loop covers every segment so the chart definition lives in one place.
segments = ['SEED', 'SMALL', 'MEDIUM', 'LARGE']

for segment in segments:
    # distinct purchases per category for this segment, as a share of the segment total
    aux1 = (
        df3[df3['segment'] == segment]
        .groupby('category_cleaned')[['purchase_id']]
        .nunique()
        .reset_index()
    )
    if aux1.empty:
        # keep going so a segment without data does not block the remaining charts
        print(f"{segment}: no transactions found, chart skipped")
        continue
    aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)

    # horizontal bars, largest share first (aux1 is already sorted)
    plt.figure(figsize=(30, 10))
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux1, palette=palette_topics,
                        order=aux1['category_cleaned'])

    # write each bar's value just past its right edge
    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}",
                       xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')

    plt.title(f"{segment} |  Num transactions: {(aux1['purchase_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("%")
    plt.ylabel("")
    plt.xlim(0, 110)
    plt.show()

### Product Category: Num of customers

In [None]:
plt.figure(figsize=(30, 10))
# distinct buyers per product category, as a share of the sum over all categories
aux1 = df3.groupby('category_cleaned')[['user_buyer_id']].nunique().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE: the total sums per-category distinct counts, so a buyer active in several
# categories is counted once per category
plt.title(f" Num buyers: {aux1['user_buyer_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

In [None]:
# Ad-hoc product of two hard-coded literals; neither is derived from a variable here.
# NOTE(review): presumably a share applied to a total read off a chart — confirm and name
# the factors.
7303410*0.0025

### Product Category: Num of products

In [None]:
plt.figure(figsize=(30, 10))
# distinct products per product category, as a share of the sum over all categories
aux1 = df3.groupby('category_cleaned')[['product_id']].nunique().reset_index()
aux1['%'] = aux1['product_id'] / aux1['product_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars, largest share first
sns.set_style('white')
bar_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned', order=bar_order, palette=palette_topics)

# write each bar's value just past its right edge
for bar in splot.patches:
    bar_width = bar.get_width()
    splot.annotate(f"{bar_width:.2f}",
                   xy=(bar_width, bar.get_y() + bar.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

plt.title(f" Num Products: {aux1['product_id'].sum():,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Product Category: Num of products, by office

In [None]:
# Share of distinct products per product category, one chart per office.
# A single loop covers every office so the chart definition lives in one place.
offices = ['BRAZIL', 'COLOMBIA', 'MEXICO', 'SPAIN', 'USA', 'AMSTERDAM']

for country in offices:
    # distinct products per category for this office, as a share of the office total
    aux1 = (
        df3[df3['user_office_name'] == country]
        .groupby('category_cleaned')[['product_id']]
        .nunique()
        .reset_index()
    )
    if aux1.empty:
        # keep going so an office without data does not block the remaining charts
        print(f"{country}: no products found, chart skipped")
        continue
    aux1['%'] = aux1['product_id'] / aux1['product_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)

    # horizontal bars, largest share first (aux1 is already sorted)
    plt.figure(figsize=(30, 10))
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux1, palette=palette_topics,
                        order=aux1['category_cleaned'])

    # write each bar's value just past its right edge
    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}",
                       xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')

    plt.title(f"{country} | Num Products: {(aux1['product_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("%")
    plt.ylabel("")
    plt.xlim(0, 110)
    plt.show()

### Product Category: Num of unique producers serving a category, by office

In [None]:
country = 'BRAZIL'
plt.figure(figsize=(20, 20))
# distinct producers with at least one sale in each category, for this office
aux1 = df3[df3['user_office_name'] == country]
aux1 = aux1.groupby('category_cleaned')[['producer_id']].nunique().reset_index()
# the share is only used to sort the bars; the chart itself shows absolute counts
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# horizontal bars of producer COUNTS, largest first (aux1 is already sorted)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='producer_id', data=aux1, palette=palette_topics,
                    order=aux1['category_cleaned'])

# write each bar's count just past its right edge
for p in splot.patches:
    splot.annotate(f"{p.get_width():.0f}",
                   xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Total of unique producers (with sales) serving each category: {(aux1['producer_id'].sum()):,}")
plt.yticks(fontsize=20)
# the x axis shows counts, not percentages, so it is labelled accordingly
plt.xlabel("Number of producers")
plt.ylabel("")
# NOTE(review): fixed upper limit, presumably so the office charts share one scale — confirm
plt.xlim(0, 15000)
plt.show()

In [None]:
country = 'COLOMBIA'
plt.figure(figsize=(20, 20))

# Distinct producers per product category among this office's rows.
aux1 = df3[df3['user_office_name'] == country]
aux1 = aux1[['category_cleaned', 'producer_id']].groupby('category_cleaned').nunique().reset_index()
# Share of each category; only used to order the bars.
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# Bars show the raw producer count (not the share), ordered by descending share.
sns.set_style('white')
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(y='category_cleaned', x='producer_id', data=aux1,
                    palette=palette_topics, order=category_order)

# Write each bar's count just past its right end.
for p in splot.patches:
    splot.annotate("%.0f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE(review): the total in the title sums the per-category distinct counts, so a
# producer present in several categories is counted once per category - confirm intended.
plt.title(f"{country} | Total of unique producers (with sales) serving each category: {(aux1['producer_id'].sum()):,}")
plt.yticks(fontsize=20)
# The x axis carries producer counts, so it is labelled as a count rather than "%".
plt.xlabel("# of unique producers")
plt.ylabel("")
plt.xlim(0, 15000)
plt.show()

In [None]:
country = 'MEXICO'
plt.figure(figsize=(20, 20))

# Distinct producers per product category among this office's rows.
aux1 = df3[df3['user_office_name'] == country]
aux1 = aux1[['category_cleaned', 'producer_id']].groupby('category_cleaned').nunique().reset_index()
# Share of each category; only used to order the bars.
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# Bars show the raw producer count (not the share), ordered by descending share.
sns.set_style('white')
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(y='category_cleaned', x='producer_id', data=aux1,
                    palette=palette_topics, order=category_order)

# Write each bar's count just past its right end.
for p in splot.patches:
    splot.annotate("%.0f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE(review): the total in the title sums the per-category distinct counts, so a
# producer present in several categories is counted once per category - confirm intended.
plt.title(f"{country} | Total of unique producers (with sales) serving each category: {(aux1['producer_id'].sum()):,}")
plt.yticks(fontsize=20)
# The x axis carries producer counts, so it is labelled as a count rather than "%".
plt.xlabel("# of unique producers")
plt.ylabel("")
plt.xlim(0, 15000)
plt.show()

In [None]:
country = 'SPAIN'
plt.figure(figsize=(20, 20))

# Distinct producers per product category among this office's rows.
aux1 = df3[df3['user_office_name'] == country]
aux1 = aux1[['category_cleaned', 'producer_id']].groupby('category_cleaned').nunique().reset_index()
# Share of each category; only used to order the bars.
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# Bars show the raw producer count (not the share), ordered by descending share.
sns.set_style('white')
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(y='category_cleaned', x='producer_id', data=aux1,
                    palette=palette_topics, order=category_order)

# Write each bar's count just past its right end.
for p in splot.patches:
    splot.annotate("%.0f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE(review): the total in the title sums the per-category distinct counts, so a
# producer present in several categories is counted once per category - confirm intended.
plt.title(f"{country} | Total of unique producers (with sales) serving each category: {(aux1['producer_id'].sum()):,}")
plt.yticks(fontsize=20)
# The x axis carries producer counts, so it is labelled as a count rather than "%".
plt.xlabel("# of unique producers")
plt.ylabel("")
plt.xlim(0, 15000)
plt.show()

In [None]:
country = 'USA'
plt.figure(figsize=(20, 20))

# Distinct producers per product category among this office's rows.
aux1 = df3[df3['user_office_name'] == country]
aux1 = aux1[['category_cleaned', 'producer_id']].groupby('category_cleaned').nunique().reset_index()
# Share of each category; only used to order the bars.
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# Bars show the raw producer count (not the share), ordered by descending share.
sns.set_style('white')
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(y='category_cleaned', x='producer_id', data=aux1,
                    palette=palette_topics, order=category_order)

# Write each bar's count just past its right end.
for p in splot.patches:
    splot.annotate("%.0f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE(review): the total in the title sums the per-category distinct counts, so a
# producer present in several categories is counted once per category - confirm intended.
plt.title(f"{country} | Total of unique producers (with sales) serving each category: {(aux1['producer_id'].sum()):,}")
plt.yticks(fontsize=20)
# The x axis carries producer counts, so it is labelled as a count rather than "%".
plt.xlabel("# of unique producers")
plt.ylabel("")
plt.xlim(0, 15000)
plt.show()

In [None]:
country = 'AMSTERDAM'
plt.figure(figsize=(20, 20))

# Distinct producers per product category among this office's rows.
aux1 = df3[df3['user_office_name'] == country]
aux1 = aux1[['category_cleaned', 'producer_id']].groupby('category_cleaned').nunique().reset_index()
# Share of each category; only used to order the bars.
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)

# Bars show the raw producer count (not the share), ordered by descending share.
sns.set_style('white')
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(y='category_cleaned', x='producer_id', data=aux1,
                    palette=palette_topics, order=category_order)

# Write each bar's count just past its right end.
for p in splot.patches:
    splot.annotate("%.0f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')

# NOTE(review): the total in the title sums the per-category distinct counts, so a
# producer present in several categories is counted once per category - confirm intended.
plt.title(f"{country} | Total of unique producers (with sales) serving each category: {(aux1['producer_id'].sum()):,}")
plt.yticks(fontsize=20)
# The x axis carries producer counts, so it is labelled as a count rather than "%".
plt.xlabel("# of unique producers")
plt.ylabel("")
plt.xlim(0, 15000)
plt.show()

### Product Category: Num of producers

In [None]:
plt.figure(figsize=(30, 10))

# Distinct producers per product category over all rows, plus each category's share.
aux1 = (
    df3[['category_cleaned', 'producer_id']]
    .groupby('category_cleaned')
    .nunique()
    .reset_index()
)
total_producers = aux1['producer_id'].sum()
aux1['%'] = aux1['producer_id'] / total_producers * 100
aux1 = aux1.sort_values('%', ascending=False)

# Horizontal bars of the share, categories ordered by descending share.
sns.set_style('white')
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f" Num producers: {total_producers:,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

### Product Category: GMV

In [None]:
# Summed GMV (BRL) per product category over all rows, plus each category's share.
aux1 = (
    df3[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title("Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

### Product Category: GMV, by segment

In [None]:
segment = 'SEED'

# Summed GMV (BRL) per product category among this segment's rows, plus each share.
segment_rows = df3[df3['segment'] == segment]
aux1 = (
    segment_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{segment} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
segment = 'SMALL'

# Summed GMV (BRL) per product category among this segment's rows, plus each share.
segment_rows = df3[df3['segment'] == segment]
aux1 = (
    segment_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{segment} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
segment = 'MEDIUM'

# Summed GMV (BRL) per product category among this segment's rows, plus each share.
segment_rows = df3[df3['segment'] == segment]
aux1 = (
    segment_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{segment} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
segment = 'LARGE'

# Summed GMV (BRL) per product category among this segment's rows, plus each share.
segment_rows = df3[df3['segment'] == segment]
aux1 = (
    segment_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{segment} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

### Product Category: GMV, by office

In [None]:
country = 'BRAZIL'

# Summed GMV (BRL) per product category among this office's rows, plus each share.
office_rows = df3[df3['user_office_name'] == country]
aux1 = (
    office_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
country = 'COLOMBIA'

# Summed GMV (BRL) per product category among this office's rows, plus each share.
office_rows = df3[df3['user_office_name'] == country]
aux1 = (
    office_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
country = 'MEXICO'

# Summed GMV (BRL) per product category among this office's rows, plus each share.
office_rows = df3[df3['user_office_name'] == country]
aux1 = (
    office_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
country = 'SPAIN'

# Summed GMV (BRL) per product category among this office's rows, plus each share.
office_rows = df3[df3['user_office_name'] == country]
aux1 = (
    office_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
country = 'USA'

# Summed GMV (BRL) per product category among this office's rows, plus each share.
office_rows = df3[df3['user_office_name'] == country]
aux1 = (
    office_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
country = 'AMSTERDAM'

# Summed GMV (BRL) per product category among this office's rows, plus each share.
office_rows = df3[df3['user_office_name'] == country]
aux1 = (
    office_rows[['category_cleaned', 'gmv_value_brl']]
    .groupby('category_cleaned')
    .sum()
    .reset_index()
)
total_gmv = aux1['gmv_value_brl'].sum()
aux1['%'] = aux1['gmv_value_brl'] / total_gmv * 100

plt.figure(figsize=(15, 10))
sns.set_style('white')
# Horizontal bars of the GMV share, categories ordered by descending share.
category_order = aux1.sort_values('%', ascending=False)['category_cleaned']
splot = sns.barplot(data=aux1, x='%', y='category_cleaned',
                    palette=palette_topics, order=category_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f"{country} | Product Category: GMV BRL (%)")
plt.yticks(fontsize=16)
plt.xlabel("% GMV")
plt.ylabel("")
plt.xlim(0, 50)
# The x axis is hidden; the bar annotations carry the values.
splot.axes.get_xaxis().set_visible(False)
plt.show()

### Producer Segment: Num of producers per segment

In [None]:
# Number of distinct segment values observed for each producer.
producer_segment = df3[['producer_id', 'segment']]
aux1 = (
    producer_segment
    .groupby('producer_id')
    .nunique()
    .reset_index()
)

In [None]:
# aux1 holds, per producer, the number of distinct segments it appears in.
segment_counts = aux1['segment']
n_producers = len(aux1)
n_single_segment = len(aux1[segment_counts == 1])
n_multi_segment = len(aux1[segment_counts > 1])

print(f"Total producers: {n_producers:,}")
print(f"Producers with a single segment: {n_single_segment:,}")
print(f"Producers with +2 segment change: {n_multi_segment:,}")

In [None]:
plt.figure(figsize=(30, 10))

# Distinct producers per segment over all rows, plus each segment's share.
aux1 = (
    df3[['producer_id', 'segment']]
    .groupby('segment')
    .nunique()
    .reset_index()
)
total_producers = aux1['producer_id'].sum()
aux1['%'] = aux1['producer_id'] / total_producers * 100
aux1 = aux1.sort_values('%', ascending=False)

# Horizontal bars of the share, segments ordered by descending share.
sns.set_style('white')
segment_order = aux1.sort_values('%', ascending=False)['segment']
splot = sns.barplot(data=aux1, x='%', y='segment',
                    palette=palette_topics, order=segment_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f" Num producers: {total_producers:,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

### Producer Segment: Num of transactions per segment

In [None]:
plt.figure(figsize=(30, 10))

# Non-null purchase_id rows per segment, plus each segment's share.
aux1 = (
    df3[['purchase_id', 'segment']]
    .groupby('segment')
    .count()
    .reset_index()
)
total_purchases = aux1['purchase_id'].sum()
aux1['%'] = aux1['purchase_id'] / total_purchases * 100
aux1 = aux1.sort_values('%', ascending=False)

# Horizontal bars of the share, in a fixed segment order.
sns.set_style('white')
segment_order = ['SEED', 'SMALL', 'MEDIUM', 'LARGE']
splot = sns.barplot(data=aux1, x='%', y='segment',
                    palette=palette_topics, order=segment_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f" Num transactions: {total_purchases:,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 60)
plt.show()

### Producers above/below 10k GMV BRL LTV: Num of producers

In [None]:
plt.figure(figsize=(30, 10))

# Distinct producers per 'is_below_10_entire' value, plus each group's share.
aux1 = (
    df3[['producer_id', 'is_below_10_entire']]
    .groupby('is_below_10_entire')
    .nunique()
    .reset_index()
)
total_producers = aux1['producer_id'].sum()
aux1['%'] = aux1['producer_id'] / total_producers * 100
aux1 = aux1.sort_values('%', ascending=False)

# Horizontal bars of the share, groups ordered by descending share.
sns.set_style('white')
group_order = aux1.sort_values('%', ascending=False)['is_below_10_entire']
splot = sns.barplot(data=aux1, x='%', y='is_below_10_entire',
                    palette=palette_topics, order=group_order)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f" Num producers: {total_producers:,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

### Producers above/below 10k GMV BRL LTV: Num of transactions

In [None]:
plt.figure(figsize=(30, 10))

# Non-null purchase_id rows per 'is_below_10_entire' value, plus each group's share.
aux1 = (
    df3[['purchase_id', 'is_below_10_entire']]
    .groupby('is_below_10_entire')
    .count()
    .reset_index()
)
total_purchases = aux1['purchase_id'].sum()
aux1['%'] = aux1['purchase_id'] / total_purchases * 100
aux1 = aux1.sort_values('%', ascending=False)

# Horizontal bars of the share; no explicit order is passed to seaborn here.
sns.set_style('white')
splot = sns.barplot(data=aux1, x='%', y='is_below_10_entire', palette=palette_topics)

# Write each bar's value just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_width:.2f}", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center')

plt.title(f" Num transactions: {total_purchases:,}")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 110)
plt.show()

## Check producer cohort

In [None]:
# df3[(df3['gmv_2021'] == 0) & (df3['gmv_2022'] == 0)]
# df3[(df3['gmv_brl_exact'] == 0)]
# df3[(df3['gmv_brl_exact'].isna())]

In [None]:
# De-duplicate on the producer, its 'is_below_10_entire' flag and its GMV attributes,
# then count the remaining rows per flag value and express them as a share.
cohort_cols = ['producer_id', 'is_below_10_entire', 'gmv_brl_exact', 'gmv_brl_entire',
               'gmv_2021', 'gmv_2022', 'mean_gmv_ltv']
producer_rows = df3[cohort_cols].drop_duplicates()
aux1 = (
    producer_rows[['is_below_10_entire', 'producer_id']]
    .groupby('is_below_10_entire')
    .count()
    .reset_index()
)
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1

In [None]:
# De-duplicate on the producer, its 'is_below_10_exact' flag and its GMV attributes,
# then count the remaining rows per flag value and express them as a share.
cohort_cols = ['producer_id', 'is_below_10_exact', 'gmv_brl_exact', 'gmv_brl_entire',
               'gmv_2021', 'gmv_2022', 'mean_gmv_ltv']
producer_rows = df3[cohort_cols].drop_duplicates()
aux1 = (
    producer_rows[['is_below_10_exact', 'producer_id']]
    .groupby('is_below_10_exact')
    .count()
    .reset_index()
)
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1

In [None]:
# De-duplicate on the producer, its 'is_below_10_mean' flag and its GMV attributes,
# then count the remaining rows per flag value and express them as a share.
cohort_cols = ['producer_id', 'is_below_10_mean', 'gmv_brl_exact', 'gmv_brl_entire',
               'gmv_2021', 'gmv_2022', 'mean_gmv_ltv']
producer_rows = df3[cohort_cols].drop_duplicates()
aux1 = (
    producer_rows[['is_below_10_mean', 'producer_id']]
    .groupby('is_below_10_mean')
    .count()
    .reset_index()
)
aux1['%'] = aux1['producer_id'] / aux1['producer_id'].sum() * 100
aux1

In [None]:
# Descriptive statistics of the GMV columns, computed on rows de-duplicated together
# with producer_id (the id itself is dropped before the statistics are computed).
gmv_cols = ['gmv_brl_exact', 'gmv_brl_entire', 'gmv_2021', 'gmv_2022', 'mean_gmv_ltv']
producer_gmv = df3[['producer_id'] + gmv_cols].drop_duplicates()
get_descriptive_statistics(producer_gmv.drop('producer_id', axis=1))

In [None]:
# Distribution of the 'gmv_brl_exact' column over all rows of df3.
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases - confirm
# against the installed version before upgrading.
gmv_exact_values = df3['gmv_brl_exact']
sns.distplot(gmv_exact_values)
plt.show()

In [None]:
# Distribution of the 'gmv_brl_entire' column over all rows of df3.
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases - confirm
# against the installed version before upgrading.
gmv_entire_values = df3['gmv_brl_entire']
sns.distplot(gmv_entire_values)
plt.show()

In [None]:
# Distribution of the 'gmv_2021' column over all rows of df3.
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases - confirm
# against the installed version before upgrading.
gmv_2021_values = df3['gmv_2021']
sns.distplot(gmv_2021_values)
plt.show()

In [None]:
# Distribution of the 'gmv_2022' column over all rows of df3.
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases - confirm
# against the installed version before upgrading.
gmv_2022_values = df3['gmv_2022']
sns.distplot(gmv_2022_values)
plt.show()

## Recompra (Reorder)

### What is the overall frequency of repurchase? (by customer country, producer office, producer segment, producers below 10k BRL LTV)

#### Overall

In [None]:
# Distinct buyers for every (repurchase flag, number of purchases) pair, with their share.
aux1 = df3[['no_repurchase', 'repurchase', 'user_buyer_id']].groupby(['repurchase', 'no_repurchase']).nunique().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
aux1 = aux1.sort_values('%', ascending=False)
# Plot only the buckets below 15 purchases. Take an explicit copy so the cast below
# writes to an independent frame instead of a filtered slice of aux1.
aux3 = aux1[aux1['no_repurchase'] < 15].copy()
# String labels on y - presumably so seaborn treats the purchase count as a category.
aux3['no_repurchase'] = aux3['no_repurchase'].astype(str)

# Horizontal bars of the share, one per purchase-count bucket.
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(y='no_repurchase', x='%', data=aux3, palette=palette_bins)

# Write each bar's value just past its right end.
for p in splot.patches:
    splot.annotate("%.3f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title("% users per volume of purchase")
plt.yticks(fontsize=16)
# N is taken from aux1, i.e. it includes the buckets that are not plotted.
plt.xlabel(f"% of users (N = {(aux1['user_buyer_id'].sum()):,})")
plt.ylabel("# of purchases")
plt.xlim(0, 100)
plt.show()

In [None]:
# Distinct buyers for every (repurchase flag, number of purchases) pair.
aux1 = df3[['no_repurchase', 'repurchase', 'user_buyer_id']].groupby(['repurchase', 'no_repurchase']).nunique().reset_index()
# Drop the single-purchase bucket. Take an explicit copy so the column edits below
# write to an independent frame instead of a filtered slice of aux1.
aux3 = aux1[aux1['no_repurchase'] != 1].copy()
# Shift by one so the value counts purchases after the first one.
aux3['no_repurchase'] = aux3['no_repurchase'] - 1
aux3['%'] = aux3['user_buyer_id'] / aux3['user_buyer_id'].sum() * 100
# The sort fixes the row order in which bin labels first appear below.
aux3 = aux3.sort_values('%', ascending=False)
# Collapse everything from 4 upwards into a single '+4' bin; smaller values keep their number.
aux3['bin_cut'] = aux3.apply(lambda x: '+4' if (x['no_repurchase'] >= 4)
                             else x['no_repurchase'], axis=1)
aux3['bin_cut'] = aux3['bin_cut'].astype("category")
# Buyers per bin and the share of each bin.
aux3 = aux3[['bin_cut', 'user_buyer_id']].groupby(['bin_cut']).sum().reset_index()
aux3['%'] = aux3['user_buyer_id'] / aux3['user_buyer_id'].sum() * 100

In [None]:
# Horizontal bars of the buyer share per repurchase bin (aux3 is built in the previous cell).
plt.figure(figsize=(10, 10))
sns.set_style('white')
bin_colors = {1: '#EF4E23', 2: '#9EA4AC', 3: '#9EA4AC', '+4': '#9EA4AC'}
splot = sns.barplot(data=aux3, x='%', y='bin_cut', palette=bin_colors)

# Write each bar's percentage just past its right end.
for bar in splot.patches:
    bar_width = bar.get_width()
    bar_mid_y = bar.get_y() + bar.get_height() / 2
    splot.annotate(f" {bar_width:.2f}%", xy=(bar_width, bar_mid_y), xytext=(5, 0),
                   textcoords='offset points', ha='left', va='center',
                   fontsize=50, color='#707780')

n_buyers = aux3['user_buyer_id'].sum()
splot.set_title(f"Repurchase times (% buyers)\nN: {n_buyers:,}",
                color='#707780', fontsize=30, loc='left', pad=40)

plt.yticks(fontsize=50)
plt.ylabel("")
plt.xlim(0, 100)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# Hide the frame and the x axis; the annotations carry the values.
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Customer country

In [None]:
# Up to ten buyer countries with the most purchase rows.
top10 = list(
    df3[['user_country', 'purchase_id']]
    .groupby('user_country')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['user_country'][0:10]
)
plt.figure(figsize=(30, 15))

# One panel per country on a 2 x 5 grid.
for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    # Distinct buyers per (repurchase flag, number of purchases) pair in this country.
    aux1 = df3[df3['user_country'] == d][['no_repurchase', 'repurchase', 'user_buyer_id']].groupby(['repurchase', 'no_repurchase']).nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)
    # Plot only the buckets below 15 purchases. Take an explicit copy so the cast below
    # writes to an independent frame instead of a filtered slice of aux1.
    aux3 = aux1[aux1['no_repurchase'] < 15].copy()
    aux3['no_repurchase'] = aux3['no_repurchase'].astype(str)

    sns.set_style('white')
    splot = sns.barplot(y='no_repurchase', x='%', data=aux3, palette=palette_bins)

    # Write each bar's value just past its right end.
    for p in splot.patches:
        splot.annotate("%.3f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    # N is taken from aux1, i.e. it includes the buckets that are not plotted.
    plt.title(f"{d} | N: {(aux1['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    # Only the first panel of each row (n = 0 and n = 5) carries the y label.
    plt.ylabel("# of purchases" if n % 5 == 0 else "")
    plt.xlim(0, 120)
    # The x axis is hidden; the bar annotations carry the values.
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.3)
plt.show()

#### Producer office

In [None]:
# Up to ten producer offices with the most purchase rows.
country_list = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name'][0:10]
)
# Three panels per row. Keep the original two rows for up to six offices and add rows
# beyond that, so plt.subplot never receives an index outside the grid.
n_cols = 3
n_rows = max(2, math.ceil(len(country_list) / n_cols))
plt.figure(figsize=(30, 15))

for n, d in enumerate(country_list):
    plt.subplot(n_rows, n_cols, n + 1)
    # Purchase rows per buyer, counted within this office only.
    aux1 = df3[df3['user_office_name'] == d]
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # Flag buyers with more than one purchase row in this office.
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(lambda x: 'repurchase' if x != 1 else 'single-purchase')

    # Distinct buyers per (flag, number of purchases) pair, with their share.
    aux1 = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)
    # Plot only the buckets below 15 purchases. Take an explicit copy so the cast below
    # writes to an independent frame instead of a filtered slice of aux1.
    aux3 = aux1[aux1['no_repurchase_temp'] < 15].copy()
    aux3['no_repurchase_temp'] = aux3['no_repurchase_temp'].astype(str)

    sns.set_style('white')
    splot = sns.barplot(y='no_repurchase_temp', x='%', data=aux3, palette=palette_bins)

    # Write each bar's value just past its right end.
    for p in splot.patches:
        splot.annotate("%.3f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    # N is taken from aux1, i.e. it includes the buckets that are not plotted.
    plt.title(f"{d} | N: {(aux1['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    # Only the first panel of each row carries the y label.
    plt.ylabel("# of purchases" if n % n_cols == 0 else "")
    plt.xlim(0, 120)
    # The x axis is hidden; the bar annotations carry the values.
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producer segment

In [None]:
# One panel per producer segment, side by side.
plt.figure(figsize=(30, 10))

for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1, 4, n + 1)
    # Purchase rows per buyer, counted within this segment only.
    aux1 = df3[df3['segment'] == d]
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # Flag buyers with more than one purchase row in this segment.
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(lambda x: 'repurchase' if x != 1 else 'single-purchase')

    # Distinct buyers per (flag, number of purchases) pair, with their share.
    aux1 = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)
    # Plot only the buckets below 15 purchases. Take an explicit copy so the cast below
    # writes to an independent frame instead of a filtered slice of aux1.
    aux3 = aux1[aux1['no_repurchase_temp'] < 15].copy()
    aux3['no_repurchase_temp'] = aux3['no_repurchase_temp'].astype(str)

    sns.set_style('white')
    splot = sns.barplot(y='no_repurchase_temp', x='%', data=aux3, palette=palette_bins)

    # Write each bar's value just past its right end.
    for p in splot.patches:
        splot.annotate("%.3f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    # N is taken from aux1, i.e. it includes the buckets that are not plotted.
    plt.title(f"{d} | N: {(aux1['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    # Only the leftmost panel carries the y label.
    plt.ylabel("# of purchases" if n == 0 else "")
    plt.xlim(0, 120)
    # The x axis is hidden; the bar annotations carry the values.
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producers < 10k BRL LTV

In [None]:
# Distribution of buyers by number of purchases, one panel per value of
# df3['is_below_10_entire'] ('below' / 'above').
# Reads df3 and palette_bins, both defined earlier in the notebook.
plt.figure(figsize = (20,10))

for n, d in enumerate(['below', 'above']):
    plt.subplot(1,2,n+1)
    ### count purchases per user (counted inside this group only)
    aux1 = df3[df3['is_below_10_entire'] == d]
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how = 'left', on = 'user_buyer_id')
    ### flag repurchase: anything other than exactly one purchase counts as 'repurchase'
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(lambda x : 'repurchase' if x != 1 else 'single-purchase')

    # unique buyers per purchase count and their share of all buyers in the group
    aux1 = aux1[['no_repurchase_temp','repurchase_temp',  'user_buyer_id']].groupby(['repurchase_temp','no_repurchase_temp']).nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)
    # Only purchase counts below 15 are plotted. Take an explicit copy so the
    # column assignment below writes to an independent frame rather than to a
    # filtered slice of aux1 (chained assignment).
    aux3 = aux1[aux1['no_repurchase_temp'] < 15].copy()
    # cast to str -- presumably so seaborn treats the count as a category
    aux3['no_repurchase_temp'] = aux3['no_repurchase_temp'].astype(str)
    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'no_repurchase_temp', x = '%', data = aux3, palette = palette_bins);

    # write the percentage at the end of every bar
    for p in splot.patches:
        splot.annotate("%.3f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    # N in the title is the buyer total over all purchase counts, not only the plotted ones
    plt.title(f"{d} | N: {(aux1['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # only the left-most panel carries the y-axis label
    if n == 0:
        plt.ylabel("# of purchases")
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

### How many customers repurchase? (by customer country, producer office, producer segment, producers below 10k BRL LTV)

#### Overall

In [None]:
# total GMV (BRL) per producer office and each office's share of the grand total
aux1 = df3.groupby('user_office_name')[['gmv_value_brl']].sum()
aux1['%'] = aux1['gmv_value_brl'].div(aux1['gmv_value_brl'].sum()).mul(100)
aux1

In [None]:
# distinct buyers per producer office and each office's share of the summed counts
aux1 = df3.groupby('user_office_name')[['user_buyer_id']].nunique()
aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
aux1

In [None]:
# distinct buyers for every (flag, purchase count) pair, with their share
aux1 = df3.groupby(['repurchase', 'no_repurchase'])['user_buyer_id'].nunique().reset_index()
aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
aux1 = aux1.sort_values('%', ascending=False)

# collapse to the two flags: single-purchase vs repurchase
aux3 = aux1.groupby('repurchase')['user_buyer_id'].sum().reset_index()
aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)

# compact horizontal bar chart, value printed at the end of each bar
plt.figure(figsize=(5, 5))
sns.set_style('white')
splot = sns.barplot(
    y='repurchase',
    x='%',
    data=aux3,
    palette=palette_two,
    order=['single-purchase', 'repurchase'],
)

for p in splot.patches:
    bar_end = p.get_width()
    splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title("% users")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N = {aux3['user_buyer_id'].sum():,})")
plt.ylabel("")
plt.xlim(0, 100)

# grey y-axis styling
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780')

# strip the frame and the x-axis
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.tick_params(labelsize=30)
plt.tick_params(axis='y', labelsize=30, pad=20)
plt.show()

In [None]:
# distinct buyers for every (flag, purchase count) pair, with their share
aux1 = df3.groupby(['repurchase', 'no_repurchase'])['user_buyer_id'].nunique().reset_index()
aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
aux1 = aux1.sort_values('%', ascending=False)

# collapse to the two flags: single-purchase vs repurchase
aux3 = aux1.groupby('repurchase')['user_buyer_id'].sum().reset_index()
aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)

# same chart as the compact version, on a larger canvas and with the frame kept
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(
    y='repurchase',
    x='%',
    data=aux3,
    palette=palette_two,
    order=['single-purchase', 'repurchase'],
)

for p in splot.patches:
    bar_end = p.get_width()
    splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title("% users")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N = {aux3['user_buyer_id'].sum():,})")
plt.ylabel("", color='darkgrey')
plt.xlim(0, 100)
plt.show()

#### Customer country

In [None]:
# the ten countries with the most purchase rows, most active first
purchases_by_country = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
purchases_by_country = purchases_by_country.sort_values('purchase_id', ascending=False)
top10 = list(purchases_by_country['user_country'][0:10])
plt.figure(figsize=(30, 10))

for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    # distinct buyers per (flag, purchase count) inside this country
    country_rows = df3[df3['user_country'] == d]
    aux1 = country_rows.groupby(['repurchase', 'no_repurchase'])['user_buyer_id'].nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)
    # collapse to single-purchase vs repurchase
    aux3 = aux1.groupby('repurchase')['user_buyer_id'].sum().reset_index()
    aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)
    # horizontal bars, value printed at the end of each bar
    sns.set_style('white')
    splot = sns.barplot(y='repurchase', x='%', data=aux3, palette=palette_types,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        bar_end = p.get_width()
        splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {aux1['user_buyer_id'].sum():,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis stays visible only on the first panel of each row
    splot.axes.get_yaxis().set_visible(n in (0, 5))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.3)
plt.show()

#### Producer office

In [None]:
# producer offices ordered by number of purchase rows (the variable name says
# "country" but the values come from user_office_name)
purchases_by_office = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
purchases_by_office = purchases_by_office.sort_values('purchase_id', ascending=False)
country_list = list(purchases_by_office['user_office_name'])
plt.figure(figsize=(30, 10))

for n, d in enumerate(country_list):
    plt.subplot(2, 3, n + 1)
    # purchases per buyer, counted inside this office only
    office_rows = df3[df3['user_office_name'] == d]
    aux2 = office_rows.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase_temp').reset_index()
    aux1 = office_rows.merge(aux2, how='left', on='user_buyer_id')
    # anything other than exactly one purchase is flagged as repurchase
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] != 1, 'repurchase', 'single-purchase')

    # distinct buyers per (flag, purchase count) and their share
    aux1 = aux1.groupby(['repurchase_temp', 'no_repurchase_temp'])['user_buyer_id'].nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)
    # collapse to single-purchase vs repurchase
    aux3 = aux1.groupby('repurchase_temp')['user_buyer_id'].sum().reset_index()
    aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)
    # horizontal bars, value printed at the end of each bar
    sns.set_style('white')
    splot = sns.barplot(y='repurchase_temp', x='%', data=aux3, palette=palette_types,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        bar_end = p.get_width()
        splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {aux1['user_buyer_id'].sum():,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis is shown only on the first panel of each row
    splot.axes.get_yaxis().set_visible(n in (0, 3))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producer segment

In [None]:
# segments ordered by number of purchase rows; the panels below use a fixed
# SEED -> LARGE order instead of this list
purchases_by_segment = df3[['segment', 'purchase_id']].groupby('segment').count().reset_index()
purchases_by_segment = purchases_by_segment.sort_values('purchase_id', ascending=False)
segment_list = list(purchases_by_segment['segment'])
plt.figure(figsize=(25, 10))

for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1, 4, n + 1)
    # purchases per buyer, counted inside this segment only
    segment_rows = df3[df3['segment'] == d]
    aux2 = segment_rows.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase_temp').reset_index()
    aux1 = segment_rows.merge(aux2, how='left', on='user_buyer_id')
    # anything other than exactly one purchase is flagged as repurchase
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] != 1, 'repurchase', 'single-purchase')

    # distinct buyers per (flag, purchase count) and their share
    aux1 = aux1.groupby(['repurchase_temp', 'no_repurchase_temp'])['user_buyer_id'].nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)
    # collapse to single-purchase vs repurchase
    aux3 = aux1.groupby('repurchase_temp')['user_buyer_id'].sum().reset_index()
    aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)
    # horizontal bars, value printed at the end of each bar
    sns.set_style('white')
    splot = sns.barplot(y='repurchase_temp', x='%', data=aux3, palette=palette_types,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        bar_end = p.get_width()
        splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {aux1['user_buyer_id'].sum():,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis is shown only on the left-most panel
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producers < 10k BRL LTV

In [None]:
# single-purchase vs repurchase buyers, one panel per value of is_below_10_entire
plt.figure(figsize=(25, 10))

for n, d in enumerate(['below', 'above']):
    plt.subplot(1, 2, n + 1)
    # purchases per buyer, counted inside this group only
    group_rows = df3[df3['is_below_10_entire'] == d]
    aux2 = group_rows.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase_temp').reset_index()
    aux1 = group_rows.merge(aux2, how='left', on='user_buyer_id')
    # anything other than exactly one purchase is flagged as repurchase
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] != 1, 'repurchase', 'single-purchase')

    # distinct buyers per (flag, purchase count) and their share
    aux1 = aux1.groupby(['repurchase_temp', 'no_repurchase_temp'])['user_buyer_id'].nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)
    # collapse to single-purchase vs repurchase
    aux3 = aux1.groupby('repurchase_temp')['user_buyer_id'].sum().reset_index()
    aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)
    # horizontal bars, value printed at the end of each bar
    sns.set_style('white')
    splot = sns.barplot(y='repurchase_temp', x='%', data=aux3, palette=palette_two,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        bar_end = p.get_width()
        splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {aux1['user_buyer_id'].sum():,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis is shown only on the left-most panel
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
# one row per group ('below' / 'above') holding the repurchase share and the
# number of repurchasing buyers; consumed by the plotting cell that follows
rates = pd.DataFrame()
for n, d in enumerate(['below', 'above']):
    # purchases per buyer, counted inside this group only
    group_rows = df3[df3['is_below_10_entire'] == d]
    aux2 = group_rows.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase_temp').reset_index()
    aux1 = group_rows.merge(aux2, how='left', on='user_buyer_id')
    # anything other than exactly one purchase is flagged as repurchase
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] != 1, 'repurchase', 'single-purchase')

    # distinct buyers per (flag, purchase count) and their share
    aux1 = aux1.groupby(['repurchase_temp', 'no_repurchase_temp'])['user_buyer_id'].nunique().reset_index()
    aux1['%'] = aux1['user_buyer_id'].div(aux1['user_buyer_id'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)
    # collapse to single-purchase vs repurchase
    aux3 = aux1.groupby('repurchase_temp')['user_buyer_id'].sum().reset_index()
    aux3['%'] = aux3['user_buyer_id'].div(aux3['user_buyer_id'].sum()).mul(100)
    # keep only the repurchase row and tag it with the group name
    is_repurchase = aux3['repurchase_temp'] == 'repurchase'
    aux4 = aux3.loc[is_repurchase, ['%', 'user_buyer_id']].assign(type=d)
    rates = pd.concat([rates, aux4])


In [None]:

# presentation chart of the repurchase share per group stored in `rates`
plt.figure(figsize=(15, 10))
sns.set_style('white')
splot = sns.barplot(y='type', x='%', data=rates, palette=palette_above,
                    order=['above', 'below'])

# value label (with a percent sign) at the end of each bar
for p in splot.patches:
    bar_end = p.get_width()
    splot.annotate(f" {bar_end:.2f}%", xy=(bar_end, p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color='#707780')
splot.set_title("Reorder Rate", color='#707780', fontsize=40, loc='left', pad=50)

plt.yticks(fontsize=60)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 50)
# grey, enlarged tick labels
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# strip the frame and the x-axis
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

### Repurchase Rate across months

In [None]:
# df3[['user_buyer_id', 'year_month', 'purchase_id']].groupby(['user_buyer_id', 'year_month']).count().reset_index().head(50)

In [None]:
# purchases per buyer per month ...
aux1 = df3.groupby(['user_buyer_id', 'year_month'])['purchase_id'].count().reset_index()
# ... then, per month, the number of distinct buyers with more than one purchase
bought_again = aux1['purchase_id'] > 1
aux2 = aux1.loc[bought_again].groupby('year_month')[['user_buyer_id']].nunique()
aux2

In [None]:
# distinct buyers with at least one row in June 2022
df3.loc[df3['year_month'] == '2022-06', 'user_buyer_id'].nunique()

### [CHECK] How much GMV is generated by customers who repurchase? (by customer country, producer office, producer segment, producers below 10k BRL LTV)

#### Overall

In [None]:
# total GMV (BRL) per purchase flag and its share of the grand total
aux1 = df3.groupby('repurchase')['gmv_value_brl'].sum().reset_index()
aux1['%'] = aux1['gmv_value_brl'].div(aux1['gmv_value_brl'].sum()).mul(100)
aux1 = aux1.sort_values('%', ascending=False)

In [None]:
# GMV share of single-purchase vs repurchase buyers (reads aux1 from the previous cell)
plt.figure(figsize=(10, 10))

sns.set_style('white')
splot = sns.barplot(y='repurchase', x='%', data=aux1, palette=palette_types,
                    order=['single-purchase', 'repurchase'])

# value label at the end of each bar
for p in splot.patches:
    bar_end = p.get_width()
    splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title("repurchase vs single-purchase GMV (%)")
plt.yticks(fontsize=16)
plt.xlabel(f"% (Total: {millify(aux1['gmv_value_brl'].sum())})")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

In [None]:
# buyer-level table for the regression: total GMV per buyer together with the
# buyer's purchase flag and purchase count
buyer_keys = ['user_buyer_id', 'repurchase', 'no_repurchase']
aux1 = df3.groupby(buyer_keys)['gmv_value_brl'].sum().reset_index()
aux1 = aux1[aux1['gmv_value_brl'] > 0]
# trim the top tail: rows at or above the 99th percentile of GMV are dropped
gmv_p99 = np.percentile(aux1['gmv_value_brl'], 99)
print(f"99th perc. gmv was removed: {gmv_p99:,.2f} BRL")
aux1 = aux1[aux1['gmv_value_brl'] < gmv_p99].assign(
    # flag becomes a 0/1 dummy, GMV gets a natural-log column
    repurchase=lambda frame: frame['repurchase'].eq('repurchase').astype(int),
    ln=lambda frame: np.log(frame['gmv_value_brl']),
)

aux1.to_csv('data/gmv_sum_repurchase.csv')


In [None]:
def regress(y, X):
    """Return the ordinary-least-squares coefficients of ``y`` on ``X``.

    The coefficients are obtained with ``numpy.linalg.lstsq`` rather than by
    explicitly inverting ``X'X``: forming and inverting the normal equations
    squares the condition number of the problem and raises ``LinAlgError``
    when columns of ``X`` are collinear. ``lstsq`` returns the minimum-norm
    solution in that case and the same coefficients otherwise (up to
    floating-point rounding).

    y: 1-D array-like (e.g. a pandas Series) with one value per row of X
    X: 2-D array-like (e.g. a pandas DataFrame), one column per regressor;
       include a constant column if an intercept is wanted

    Returns a 1-D numpy array with one coefficient per column of X, in
    column order.
    """
    # work on plain float arrays so the result type does not depend on how
    # pandas wraps numpy outputs
    design = np.asarray(X, dtype=float)
    target = np.asarray(y, dtype=float)
    coefficients, *_ = np.linalg.lstsq(design, target, rcond=None)
    return coefficients

# Effect of the repurchase dummy on ln(GMV), controlling for the number of
# purchases, computed by partialling-out:
#   1. regress the dummy `t` on the controls X (purchase count + constant),
#   2. keep the residual `t_tilde`,
#   3. kappa = cov(t_tilde, y) / var(t_tilde).
# By the Frisch-Waugh-Lovell result this should equal the coefficient on
# `repurchase` in an OLS of ln on repurchase, no_repurchase and a constant.
# Reads aux1 (buyer-level regression table) built in an earlier cell.
X = aux1[['no_repurchase']].assign(intercep=1)  # controls plus a column of ones
t = aux1["repurchase"]  # 0/1 treatment dummy
y = aux1["ln"]  # outcome: natural log of GMV

beta_aux = regress(t, X)
# part of the dummy not explained by the controls
t_tilde = t - X.dot(beta_aux)
kappa = t_tilde.cov(y) / t_tilde.var()
kappa

In [None]:
# percentage change implied by a log-point coefficient of 0.79: (e^b - 1) * 100
(math.exp(0.79) - 1) * 100

In [None]:
# A dummy coefficient b in a log-linear model maps to a (e^b - 1) * 100 percent
# change: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
repurchase_coef = 0.79  # hard-coded log-point coefficient
perc_change = (math.exp(repurchase_coef) - 1) * 100
factor = perc_change / 100 + 1  # same effect expressed as a multiplier

print(f"Changing to repurchase buyers increase gmv by: {perc_change:.2f}% ({factor:.2f}x)")

#### Customer country

In [None]:
# the ten countries with the most purchase rows, most active first
purchases_by_country = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
purchases_by_country = purchases_by_country.sort_values('purchase_id', ascending=False)
top10 = list(purchases_by_country['user_country'][0:10])
plt.figure(figsize=(30, 10))

for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    # GMV per purchase flag inside this country and its share
    country_rows = df3[df3['user_country'] == d]
    aux1 = country_rows.groupby('repurchase')['gmv_value_brl'].sum().reset_index()
    aux1['%'] = aux1['gmv_value_brl'].div(aux1['gmv_value_brl'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)
    # horizontal bars, value printed at the end of each bar
    sns.set_style('white')
    splot = sns.barplot(y='repurchase', x='%', data=aux1, palette=palette_types,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        bar_end = p.get_width()
        splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | {millify(aux1['gmv_value_brl'].sum())}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis stays visible only on the first panel of each row
    splot.axes.get_yaxis().set_visible(n in (0, 5))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.3)
plt.show()

#### Producer office

In [None]:
# GMV share of single-purchase vs repurchase buyers, one panel per producer office.
# Offices are ordered by number of purchase rows (the variable is called
# country_list but holds user_office_name values).
# Reads df3, palette_types and millify, all defined earlier in the notebook.
country_list = list(df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index().sort_values('purchase_id', ascending=False)['user_office_name'])
plt.figure(figsize = (30,10))

for n, d in enumerate(country_list):
    plt.subplot(2,3,n+1)
    ### count purchases per user
    # Drop the global flag/count COLUMNS so they can be recomputed inside this
    # office. Without `columns=` pandas looks for these labels in the row index
    # and raises KeyError.
    aux1 = df3[df3['user_office_name'] == d].drop(columns=['repurchase', 'no_repurchase'])
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase']
    aux1 = aux1.merge(aux2, how = 'left', on = 'user_buyer_id')
    ### flag repurchase: anything other than exactly one purchase counts as 'repurchase'
    aux1['repurchase'] = aux1['no_repurchase'].apply(lambda x : 'repurchase' if x != 1 else 'single-purchase')

    # GMV per flag inside this office and its share of the office total
    aux1 = aux1[['repurchase',  'gmv_value_brl']].groupby(['repurchase']).sum().reset_index()
    aux1['%'] = aux1['gmv_value_brl'] / aux1['gmv_value_brl'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)
    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'repurchase', x = '%', data = aux1, palette = palette_types, order = ['single-purchase', 'repurchase']);

    # write the percentage at the end of every bar
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | {millify(aux1['gmv_value_brl'].sum())}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis is shown only on the first panel of each row
    splot.axes.get_yaxis().set_visible(False)
    if n == 0:
        splot.axes.get_yaxis().set_visible(True)
    if n == 3:
        splot.axes.get_yaxis().set_visible(True)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
# buyers kept for the regressions: positive total GMV, strictly below the
# 99th percentile of positive totals
gmv_per_buyer = df3.groupby('user_buyer_id')['gmv_value_brl'].sum()
gmv_per_buyer = gmv_per_buyer[gmv_per_buyer > 0]
gmv_cutoff = np.percentile(gmv_per_buyer, 99)
kept_buyers = gmv_per_buyer[gmv_per_buyer < gmv_cutoff].index
# `temp` holds every df3 row of the kept buyers and is reused by later cells
temp = df3[df3['user_buyer_id'].isin(kept_buyers)]

# offices ordered by number of purchase rows
purchases_by_office = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
purchases_by_office = purchases_by_office.sort_values('purchase_id', ascending=False)
country_list = list(purchases_by_office['user_office_name'])
for n, d in enumerate(country_list):
    print(d)
    # recount purchases per buyer inside this office
    aux1 = temp[temp['user_office_name'] == d].drop(columns=['no_repurchase', 'repurchase'])
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase').reset_index()
    aux3 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # 0/1 dummy: 1 for anything other than exactly one purchase
    aux3['repurchase'] = (aux3['no_repurchase'] != 1).astype(int)
    # buyer-level GMV with its natural log, positive totals only
    aux4 = aux3.groupby(['user_buyer_id', 'repurchase'])['gmv_value_brl'].sum().reset_index()
    aux4 = aux4[aux4['gmv_value_brl'] > 0].assign(ln=lambda frame: np.log(frame['gmv_value_brl']))
    # attach the purchase count and write one CSV per office
    aux2 = aux3[['user_buyer_id', 'no_repurchase']].drop_duplicates()
    aux4 = aux4.merge(aux2, how='left', on='user_buyer_id')
    aux4.to_csv(f'data/gmv_sum_repurchase_{d}.csv')

In [None]:
# A dummy coefficient b in a log-linear model maps to a (e^b - 1) * 100 percent
# change: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# Hard-coded log-point coefficients per producer office -- where they were
# estimated is not shown in this notebook section.
office_coefficients = {
    'Global': 0.79,
    'Brazil': 0.83,
    'Colombia': 0.63,
    'Mexico': 0.92,
    'Spain': 0.64,
    'USA': 0.88,
    'Amsterdam': 0.71,
}
dict_country = {
    office: round((math.exp(coef) - 1) * 100, 2)
    for office, coef in office_coefficients.items()
}
for n, pct in dict_country.items():
    print(f"{n} : Changing to repurchase buyers increase gmv by: {pct}%")


In [None]:
# percent changes -> multipliers, one row per producer office
aux1 = pd.DataFrame(list(dict_country.items()), columns=['local', '%'])
aux1['factor'] = aux1['%'].div(100).add(1)
# largest multiplier first, with 'Global' pinned to the top
list_order = aux1.sort_values('factor', ascending=False)['local'].tolist()
list_order.remove('Global')
list_order.insert(0, 'Global')
plt.figure(figsize=(10, 15))
sns.set_style('white')
# 'Global' is highlighted, every office shares the same grey
office_palette = dict.fromkeys(
    ['Brazil', 'Colombia', 'Mexico', 'Spain', 'USA', 'Amsterdam'], '#9EA4AC')
office_palette['Global'] = '#EF4E23'
splot = sns.barplot(y='local', x='factor', data=aux1, palette=office_palette, order=list_order)

# multiplier label ("2.20x") at the end of each bar
for p in splot.patches:
    bar_end = p.get_width()
    splot.annotate(f" {bar_end:.2f}x", xy=(bar_end, p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color='#707780')
splot.set_title("How much GMV do buyers who repurchase\n"
                "generate over buyers who do not?\n"
                "(by producer office)",
                color='#707780', fontsize=40, loc='left', pad=40)

plt.yticks(fontsize=50)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 6)
# grey, enlarged tick labels
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# strip the frame and the x-axis
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Producer segment

In [None]:
# GMV share of single-purchase vs repurchase buyers, one panel per producer segment
plt.figure(figsize=(30, 10))

for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1, 4, n + 1)
    # purchases per buyer, counted inside this segment only
    segment_rows = df3[df3['segment'] == d]
    aux2 = segment_rows.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase_temp').reset_index()
    aux1 = segment_rows.merge(aux2, how='left', on='user_buyer_id')
    # anything other than exactly one purchase is flagged as repurchase
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] != 1, 'repurchase', 'single-purchase')

    # GMV per flag and its share of the segment total
    aux1 = aux1.groupby('repurchase_temp')['gmv_value_brl'].sum().reset_index()
    aux1['%'] = aux1['gmv_value_brl'].div(aux1['gmv_value_brl'].sum()).mul(100)
    aux1 = aux1.sort_values('%', ascending=False)

    # horizontal bars, value printed at the end of each bar
    sns.set_style('white')
    splot = sns.barplot(y='repurchase_temp', x='%', data=aux1, palette=palette_types,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        bar_end = p.get_width()
        splot.annotate(f"{bar_end:.2f}", xy=(bar_end, p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | {millify(aux1['gmv_value_brl'].sum())}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # y-axis is shown only on the left-most panel
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
# one regression CSV per producer segment, built from `temp` (rows of the
# buyers kept after the GMV trimming done in an earlier cell)
for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    print(d)
    # recount purchases per buyer inside this segment
    aux1 = temp[temp['segment'] == d].drop(columns=['no_repurchase', 'repurchase'])
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase').reset_index()
    aux3 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # 0/1 dummy: 1 for anything other than exactly one purchase
    aux3['repurchase'] = (aux3['no_repurchase'] != 1).astype(int)
    # buyer-level GMV with its natural log, positive totals only
    aux4 = aux3.groupby(['user_buyer_id', 'repurchase'])['gmv_value_brl'].sum().reset_index()
    aux4 = aux4[aux4['gmv_value_brl'] > 0].assign(ln=lambda frame: np.log(frame['gmv_value_brl']))
    # attach the purchase count and write the file
    aux2 = aux3[['user_buyer_id', 'no_repurchase']].drop_duplicates()
    aux4 = aux4.merge(aux2, how='left', on='user_buyer_id')
    aux4.to_csv(f'data/gmv_sum_repurchase_{d}.csv')


In [None]:
# Shortened copy of the per-segment export loop: it rebuilds the per-buyer
# purchase count and 0/1 repurchase dummy but writes nothing, so only the
# frames of the last segment remain in aux1 / aux2 / aux3 afterwards.
for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    print(d)
    # recount purchases per buyer inside this segment
    aux1 = temp[temp['segment'] == d].drop(columns=['no_repurchase', 'repurchase'])
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().rename('no_repurchase').reset_index()
    aux3 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # 0/1 dummy: 1 for anything other than exactly one purchase
    aux3['repurchase'] = (aux3['no_repurchase'] != 1).astype(int)
  

In [None]:
# A dummy coefficient b in a log-linear model maps to a (e^b - 1) * 100 percent
# change: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# Hard-coded log-point coefficients per producer segment -- where they were
# estimated is not shown in this notebook section. The dict keeps the name
# dict_country although it now holds segments.
segment_coefficients = {
    'Seed': 0.66,
    'Small': 0.63,
    'Medium': 0.66,
    'Large': 1.43,
}
dict_country = {
    segment: round((math.exp(coef) - 1) * 100, 2)
    for segment, coef in segment_coefficients.items()
}
for n, pct in dict_country.items():
    print(f"{n} : Changing to repurchase buyers increase gmv by: {pct}%")
# percent changes -> multipliers, one row per segment
aux1 = pd.DataFrame(list(dict_country.items()), columns=['local', '%'])
aux1['factor'] = aux1['%'].div(100).add(1)

In [None]:
# Horizontal bar chart of the GMV multiplier per producer segment (reads aux1).
plt.figure(figsize=(10, 15))
sns.set_style('white')
segment_palette = dict.fromkeys(['Seed', 'Small', 'Medium', 'Large'], '#9EA4AC')
splot = sns.barplot(
    data=aux1,
    x='factor',
    y='local',
    palette=segment_palette,
    order=['Large', 'Medium', 'Small', 'Seed'],
)

# print the multiplier just past the end of every bar
for p in splot.patches:
    bar_end = p.get_width()
    bar_middle = p.get_y() + p.get_height() / 2
    splot.annotate(f" {bar_end:.2f}x", xy=(bar_end, bar_middle),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color='#707780')

chart_title = (
    "How much GMV do buyers who repurchase\n"
    "generate over buyers who do not?\n"
    "(by producer segment)"
)
splot.set_title(chart_title, color='#707780', fontsize=35, loc='left', pad=30)

plt.yticks(fontsize=50)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 6)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# drop the frame and the x axis so only bars and labels remain
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.xaxis.set_visible(False)
plt.show()

#### Producers < 10k BRL LTV

In [None]:
# Share of GMV coming from single-purchase vs repurchase buyers, one panel per
# producer LTV cohort (values 'below' / 'above' of is_below_10_entire).
plt.figure(figsize=(30, 10))

for n, d in enumerate(['below', 'above']):
    plt.subplot(1, 2, n + 1)
    ### purchases per buyer inside this cohort
    aux1 = df3.loc[df3['is_below_10_entire'] == d]
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_temp')
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    ### buyers whose purchase count differs from one are repurchasers
    aux1['repurchase_temp'] = (aux1['no_repurchase_temp'] != 1).map(
        {True: 'repurchase', False: 'single-purchase'})

    # GMV per buyer type and its share of the cohort total
    aux1 = aux1.groupby('repurchase_temp')['gmv_value_brl'].sum().reset_index()
    aux1['%'] = aux1['gmv_value_brl'] / aux1['gmv_value_brl'].sum() * 100
    aux1 = aux1.sort_values('%', ascending=False)

    # bars with the percentage written at their end
    sns.set_style('white')
    splot = sns.barplot(data=aux1, x='%', y='repurchase_temp', palette=palette_types,
                        order=['single-purchase', 'repurchase'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | {millify(aux1['gmv_value_brl'].sum())}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # category labels only on the left-hand panel
    splot.yaxis.set_visible(n == 0)
    splot.xaxis.set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
# Build one buyer-level regression table per producer LTV cohort and write it to CSV.
for n, d in enumerate(['below', 'above']):
    ### purchases of this cohort, without the existing repurchase columns
    aux1 = temp.loc[temp['is_below_10_entire'] == d].drop(columns=['no_repurchase', 'repurchase'])
    ### number of purchases per buyer inside the cohort
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase')
    aux3 = aux1.merge(aux2, how='left', on='user_buyer_id')
    ### 1 when the buyer's purchase count differs from one, 0 otherwise
    aux3['repurchase'] = (aux3['no_repurchase'] != 1).astype('int64')
    # total GMV per buyer; only strictly positive totals can be log-transformed
    gmv_per_buyer = aux3.groupby(['user_buyer_id', 'repurchase'])['gmv_value_brl'].sum().reset_index()
    aux4 = gmv_per_buyer[gmv_per_buyer['gmv_value_brl'] > 0]
    aux4 = aux4.assign(ln=np.log(aux4['gmv_value_brl']))
    # bring the purchase count back next to the per-buyer totals
    aux2 = aux3[['user_buyer_id', 'no_repurchase']].drop_duplicates()
    aux4 = aux4.merge(aux2, how='left', on='user_buyer_id')
    aux4.to_csv(f'data/gmv_sum_repurchase_PROD_LTV_{d}.csv')

In [None]:
# Turn log-scale dummy effects into percentage changes with 100 * (exp(b) - 1):
# https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# NOTE(review): the b values are typed in by hand, presumably copied from the
# per-cohort regressions -- confirm against the fitted models.
dict_country = {
    label: round((math.exp(beta) - 1) * 100, 2)
    for label, beta in [('below', 0.34), ('above', 0.77)]
}
for n in dict_country:
    print(f"{n} : Changing to repurchase buyers increase gmv by: {dict_country[n]}%")
# table used for plotting: percentage change and the matching multiplier
aux1 = pd.DataFrame(dict_country.items(), columns=['local', '%'])
aux1['factor'] = 1 + aux1['%'] / 100

In [None]:
# Horizontal bar chart of the GMV multiplier per producer LTV cohort (reads aux1).
plt.figure(figsize=(10, 15))
sns.set_style('white')
cohort_palette = dict.fromkeys(['below', 'above'], '#9EA4AC')
splot = sns.barplot(
    data=aux1,
    x='factor',
    y='local',
    palette=cohort_palette,
    order=['above', 'below'],
)

# print the multiplier just past the end of every bar
for p in splot.patches:
    bar_end = p.get_width()
    bar_middle = p.get_y() + p.get_height() / 2
    splot.annotate(f" {bar_end:.2f}x", xy=(bar_end, bar_middle),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color='#707780')

chart_title = (
    "How much GMV do buyers who repurchase\n"
    "generate over buyers who do not?\n"
    "(by producer LTV cohort)"
)
splot.set_title(chart_title, color='#707780', fontsize=35, loc='left', pad=30)

plt.yticks(fontsize=50)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 6)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# drop the frame and the x axis so only bars and labels remain
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.xaxis.set_visible(False)
plt.show()

### How many customers repurchase from the same producer? (by customer country, producer office, producers below 10k BRL LTV)

#### Overall

In [None]:
# Share of all buyers in each repurchase class (column repurchase_same_producer).
buyer_classes = df3[['repurchase_same_producer', 'user_buyer_id']].drop_duplicates()
aux1 = buyer_classes.groupby(['repurchase_same_producer']).count().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100

# bars with the percentage written at their end
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(data=aux1, x='%', y='repurchase_same_producer', palette=palette_types,
                    order=['no_repurchase', 'diff_producer_repurchase', 'same_producer_repurchase'])

for p in splot.patches:
    splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title("Repurchase: same vs different producer (%)")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N: {aux1['user_buyer_id'].sum():,} users)")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

In [None]:
# Same-producer vs different-producer split, restricted to rows whose
# repurchase column equals 'repurchase'.
repurchaser_classes = df3.loc[df3['repurchase'] == 'repurchase',
                              ['repurchase_same_producer', 'user_buyer_id']].drop_duplicates()
aux1 = repurchaser_classes.groupby(['repurchase_same_producer']).count().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100

# bars ordered from the largest to the smallest share
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(data=aux1, x='%', y='repurchase_same_producer', palette=palette_types,
                    order=aux1.sort_values('%', ascending=False)['repurchase_same_producer'])

for p in splot.patches:
    splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title("Repurchase: same vs different producer (%)")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N: {aux1['user_buyer_id'].sum():,} users)")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

In [None]:
# Presentation version of the same/different producer split among repurchasers.
repurchaser_classes = df3.loc[df3['repurchase'] == 'repurchase',
                              ['repurchase_same_producer', 'user_buyer_id']].drop_duplicates()
aux1 = repurchaser_classes.groupby(['repurchase_same_producer']).count().reset_index()
aux1['%'] = aux1['user_buyer_id'] / aux1['user_buyer_id'].sum() * 100
# short display labels: everything that is not the "different producer" class reads 'same'
aux1['repurchase_same_producer'] = (aux1['repurchase_same_producer'] == 'diff_producer_repurchase').map(
    {True: 'different', False: 'same'})

plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(data=aux1, x='%', y='repurchase_same_producer',
                    palette=dict.fromkeys(['different', 'same'], '#9EA4AC'),
                    order=['different', 'same'])

# print the percentage just past the end of every bar
for p in splot.patches:
    bar_end = p.get_width()
    bar_middle = p.get_y() + p.get_height() / 2
    splot.annotate(f" {bar_end:.2f}%", xy=(bar_end, bar_middle),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color='#707780')

chart_title = (
    "From buyers who repurchase, how many\n"
    "did it with the same producer?\n"
    f"N: {aux1['user_buyer_id'].sum():,}"
)
splot.set_title(chart_title, color='#707780', fontsize=30, loc='left', pad=40)

plt.yticks(fontsize=50)
plt.ylabel("")
plt.xlim(0, 100)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# drop the frame and the x axis so only bars and labels remain
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.xaxis.set_visible(False)
plt.show()

#### Customer country (all users)

In [None]:
# Buyer split (no repurchase / different producer / same producer) for the ten
# countries with the most purchases, drawn on a 2 x 5 grid.
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
country_counts = country_counts.sort_values('purchase_id', ascending=False)
top10 = list(country_counts['user_country'].iloc[:10])
plt.figure(figsize=(30, 10))

for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    ### purchases per buyer inside this country
    aux1 = df3.loc[df3['user_country'] == d]
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_temp')
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### buyers whose purchase count differs from one are repurchasers
    aux1['repurchase_temp'] = (aux1['no_repurchase_temp'] != 1).map(
        {True: 'repurchase', False: 'single-purchase'})

    ### repurchasers with more than one purchase from a single producer
    repeat_rows = aux1.loc[aux1['repurchase_temp'] == 'repurchase']
    pair_counts = repeat_rows.groupby(['producer_id', 'user_buyer_id'])['purchase_id'].count().reset_index()
    aux2 = pair_counts.loc[pair_counts['purchase_id'] > 1, ['user_buyer_id']].drop_duplicates()
    aux2['repurchase_same_producer_temp'] = 'same_producer_repurchase'
    # single-purchase buyers get their own label; repurchasers left unflagged
    # bought from different producers
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    aux1.loc[aux1['repurchase_temp'] == 'single-purchase', 'repurchase_same_producer_temp'] = 'no_repurchase'
    aux1['repurchase_same_producer_temp'] = aux1['repurchase_same_producer_temp'].fillna('diff_producer_repurchase')

    ### unique buyers per class and their share
    aux2 = aux1[['repurchase_same_producer_temp', 'user_buyer_id']].drop_duplicates()
    aux2 = aux2.groupby(['repurchase_same_producer_temp']).count().reset_index()
    aux2['%'] = aux2['user_buyer_id'] / aux2['user_buyer_id'].sum() * 100

    # bars with the percentage written at their end
    sns.set_style('white')
    splot = sns.barplot(data=aux2, x='%', y='repurchase_same_producer_temp', palette=palette_types,
                        order=['no_repurchase', 'diff_producer_repurchase', 'same_producer_repurchase'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {(aux2['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # category labels only on the first panel of each row
    splot.yaxis.set_visible(n in (0, 5))
    splot.xaxis.set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Customer country (only users who repurchase)

In [None]:
# Different-producer vs same-producer split among repurchasers, for the ten
# countries with the most purchases, drawn on a 2 x 5 grid.
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
country_counts = country_counts.sort_values('purchase_id', ascending=False)
top10 = list(country_counts['user_country'].iloc[:10])
plt.figure(figsize=(30, 10))

for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    ### purchases per buyer inside this country
    aux1 = df3.loc[df3['user_country'] == d]
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_temp')
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### buyers whose purchase count differs from one are repurchasers
    aux1['repurchase_temp'] = (aux1['no_repurchase_temp'] != 1).map(
        {True: 'repurchase', False: 'single-purchase'})

    ### repurchasers with more than one purchase from a single producer
    repeat_rows = aux1.loc[aux1['repurchase_temp'] == 'repurchase']
    pair_counts = repeat_rows.groupby(['producer_id', 'user_buyer_id'])['purchase_id'].count().reset_index()
    aux2 = pair_counts.loc[pair_counts['purchase_id'] > 1, ['user_buyer_id']].drop_duplicates()
    aux2['repurchase_same_producer_temp'] = 'same_producer_repurchase'
    # single-purchase buyers get their own label; repurchasers left unflagged
    # bought from different producers
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    aux1.loc[aux1['repurchase_temp'] == 'single-purchase', 'repurchase_same_producer_temp'] = 'no_repurchase'
    aux1['repurchase_same_producer_temp'] = aux1['repurchase_same_producer_temp'].fillna('diff_producer_repurchase')

    ### unique repurchasing buyers per class and their share
    aux2 = aux1.loc[aux1['repurchase_temp'] == 'repurchase',
                    ['repurchase_same_producer_temp', 'user_buyer_id']].drop_duplicates()
    aux2 = aux2.groupby(['repurchase_same_producer_temp']).count().reset_index()
    aux2['%'] = aux2['user_buyer_id'] / aux2['user_buyer_id'].sum() * 100

    # bars with the percentage written at their end
    sns.set_style('white')
    splot = sns.barplot(data=aux2, x='%', y='repurchase_same_producer_temp', palette=palette_types,
                        order=['diff_producer_repurchase', 'same_producer_repurchase'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {(aux2['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # category labels only on the first panel of each row
    splot.yaxis.set_visible(n in (0, 5))
    splot.xaxis.set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producer office (all users)

- For this analysis, 1st and 2nd purchases can be from any producer within the same office.
- Premise: if the 2nd purchase was from a producer in a different office, that purchase is discarded. If a 3rd or later purchase comes from the same producer office, it is treated as the 2nd purchase.

In [None]:
# Buyer split (no repurchase / different producer / same producer) per producer
# office, offices ordered by number of purchases, drawn on a 2 x 3 grid.
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
office_list = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
plt.figure(figsize=(30, 10))

for n, d in enumerate(office_list):
    plt.subplot(2, 3, n + 1)
    ### purchases per buyer inside this office
    aux1 = df3.loc[df3['user_office_name'] == d]
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_temp')
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### buyers whose purchase count differs from one are repurchasers
    aux1['repurchase_temp'] = (aux1['no_repurchase_temp'] != 1).map(
        {True: 'repurchase', False: 'single-purchase'})

    ### repurchasers with more than one purchase from a single producer
    repeat_rows = aux1.loc[aux1['repurchase_temp'] == 'repurchase']
    pair_counts = repeat_rows.groupby(['producer_id', 'user_buyer_id'])['purchase_id'].count().reset_index()
    aux2 = pair_counts.loc[pair_counts['purchase_id'] > 1, ['user_buyer_id']].drop_duplicates()
    aux2['repurchase_same_producer_temp'] = 'same_producer_repurchase'
    # single-purchase buyers get their own label; repurchasers left unflagged
    # bought from different producers
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    aux1.loc[aux1['repurchase_temp'] == 'single-purchase', 'repurchase_same_producer_temp'] = 'no_repurchase'
    aux1['repurchase_same_producer_temp'] = aux1['repurchase_same_producer_temp'].fillna('diff_producer_repurchase')

    ### unique buyers per class and their share
    aux2 = aux1[['repurchase_same_producer_temp', 'user_buyer_id']].drop_duplicates()
    aux2 = aux2.groupby(['repurchase_same_producer_temp']).count().reset_index()
    aux2['%'] = aux2['user_buyer_id'] / aux2['user_buyer_id'].sum() * 100

    # bars with the percentage written at their end
    sns.set_style('white')
    splot = sns.barplot(data=aux2, x='%', y='repurchase_same_producer_temp', palette=palette_types,
                        order=['no_repurchase', 'diff_producer_repurchase', 'same_producer_repurchase'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {(aux2['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # category labels only on the first panel of each row
    splot.yaxis.set_visible(n in (0, 3))
    splot.xaxis.set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producer office (only users who repurchase)

In [None]:
# Different-producer vs same-producer split among repurchasers per producer
# office, offices ordered by number of purchases, drawn on a 2 x 3 grid.
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
office_list = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
plt.figure(figsize=(30, 10))

for n, d in enumerate(office_list):
    plt.subplot(2, 3, n + 1)
    ### purchases per buyer inside this office
    aux1 = df3.loc[df3['user_office_name'] == d]
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_temp')
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### buyers whose purchase count differs from one are repurchasers
    aux1['repurchase_temp'] = (aux1['no_repurchase_temp'] != 1).map(
        {True: 'repurchase', False: 'single-purchase'})

    ### repurchasers with more than one purchase from a single producer
    repeat_rows = aux1.loc[aux1['repurchase_temp'] == 'repurchase']
    pair_counts = repeat_rows.groupby(['producer_id', 'user_buyer_id'])['purchase_id'].count().reset_index()
    aux2 = pair_counts.loc[pair_counts['purchase_id'] > 1, ['user_buyer_id']].drop_duplicates()
    aux2['repurchase_same_producer_temp'] = 'same_producer_repurchase'
    # single-purchase buyers get their own label; repurchasers left unflagged
    # bought from different producers
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    aux1.loc[aux1['repurchase_temp'] == 'single-purchase', 'repurchase_same_producer_temp'] = 'no_repurchase'
    aux1['repurchase_same_producer_temp'] = aux1['repurchase_same_producer_temp'].fillna('diff_producer_repurchase')

    ### unique repurchasing buyers per class and their share
    aux2 = aux1.loc[aux1['repurchase_temp'] == 'repurchase',
                    ['repurchase_same_producer_temp', 'user_buyer_id']].drop_duplicates()
    aux2 = aux2.groupby(['repurchase_same_producer_temp']).count().reset_index()
    aux2['%'] = aux2['user_buyer_id'] / aux2['user_buyer_id'].sum() * 100

    # bars with the percentage written at their end
    sns.set_style('white')
    splot = sns.barplot(data=aux2, x='%', y='repurchase_same_producer_temp', palette=palette_types,
                        order=['diff_producer_repurchase', 'same_producer_repurchase'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {(aux2['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # category labels only on the first panel of each row
    splot.yaxis.set_visible(n in (0, 3))
    splot.xaxis.set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
# Build global_df: the share of repurchasing buyers who bought again from the
# same producer, as one "Global" row (whole of df3) plus one row per producer office.
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
office_list = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])

### global
global_df = (
    df3.loc[df3['repurchase'] == 'repurchase', ['repurchase_same_producer', 'user_buyer_id']]
    .drop_duplicates()
    .groupby(['repurchase_same_producer'])
    .count()
    .reset_index()
)
global_df['%'] = global_df['user_buyer_id'] / global_df['user_buyer_id'].sum() * 100
# Keep exactly the same class as the office rows below; the previous
# "!= 'diff_producer_repurchase'" filter would let any other class through as a
# second "Global" row. .copy() so the new column is not written to a slice.
global_df = global_df[global_df['repurchase_same_producer'] == 'same_producer_repurchase'].copy()
global_df['local'] = 'Global'

# one frame per row of the final table, concatenated once after the loop
share_rows = [global_df]
for d in office_list:
    ### purchases per buyer inside this office (df3's own flags are recomputed)
    aux1 = df3[df3['user_office_name'] == d].drop(
        ['no_repurchase', 'repurchase', 'repurchase_same_producer'], axis=1)
    aux2 = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase')
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### buyers whose purchase count differs from one are repurchasers
    aux1['repurchase'] = (aux1['no_repurchase'] != 1).map(
        {True: 'repurchase', False: 'single-purchase'})

    ### repurchasers with more than one purchase from a single producer
    repeat_rows = aux1.loc[aux1['repurchase'] == 'repurchase']
    pair_counts = repeat_rows.groupby(['producer_id', 'user_buyer_id'])['purchase_id'].count().reset_index()
    aux2 = pair_counts.loc[pair_counts['purchase_id'] > 1, ['user_buyer_id']].drop_duplicates()
    aux2['repurchase_same_producer'] = 'same_producer_repurchase'
    # single-purchase buyers get their own label; repurchasers left unflagged
    # bought from different producers
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    aux1.loc[aux1['repurchase'] == 'single-purchase', 'repurchase_same_producer'] = 'no_repurchase'
    aux1['repurchase_same_producer'] = aux1['repurchase_same_producer'].fillna('diff_producer_repurchase')

    ### unique repurchasing buyers per class, their share, and the same-producer row
    aux2 = aux1.loc[aux1['repurchase'] == 'repurchase',
                    ['repurchase_same_producer', 'user_buyer_id']].drop_duplicates()
    aux2 = aux2.groupby(['repurchase_same_producer']).count().reset_index()
    aux2['%'] = aux2['user_buyer_id'] / aux2['user_buyer_id'].sum() * 100
    aux2 = aux2[aux2['repurchase_same_producer'] == 'same_producer_repurchase'].copy()
    aux2['local'] = d
    share_rows.append(aux2)

global_df = pd.concat(share_rows)
# display names: title case, with the USA acronym restored
global_df['local'] = global_df['local'].str.title()
global_df.loc[global_df['local'] == 'Usa', 'local'] = 'USA'
global_df

In [None]:
# Add the number of distinct producers per office to global_df; the Global row
# receives the sum of the per-office counts.
aux1 = df3.groupby('user_office_name')['producer_id'].nunique().reset_index(name='num_producers')
aux1 = aux1.rename(columns={'user_office_name': 'local'})
# same display names as global_df: title case, with the USA acronym restored
aux1['local'] = aux1['local'].str.title().replace('Usa', 'USA')
global_df = global_df.merge(aux1, how='left', on='local')
global_df.loc[global_df['local'] == 'Global', 'num_producers'] = aux1['num_producers'].sum()
global_df

In [None]:
# Scratch check: row labels of global_df in their current order.
global_df['local'].tolist()

In [None]:
# Scratch check: print every bar patch of the axes currently bound to splot.
for p in splot.patches:
    print(p)

In [None]:
# Scratch check: display the current contents of global_df.
global_df

In [None]:
# Scratch arithmetic. NOTE(review): presumably the vertical centre of a bar
# patch (y + height / 2) used to position annotations -- confirm or delete.
0.6+0.8/2

In [None]:
# Ranked bar chart of the share of repurchasing buyers who bought again from
# the same producer: Global first, then the offices by descending share, with
# the number of producers printed as "(N: ...)" on every row.
list_order = list(global_df.sort_values('%', ascending=False)['local'])
list_order.remove('Global')
list_order.insert(0, 'Global')
plt.figure(figsize=(10, 15))
sns.set_style('white')
bar_palette = {
    'Global': '#EF4E23',
    **dict.fromkeys(['Brazil', 'Colombia', 'Mexico', 'Spain', 'USA', 'Amsterdam'], '#9EA4AC'),
}
splot = sns.barplot(data=global_df, x='%', y='local', palette=bar_palette, order=list_order)

# print the percentage just past the end of every bar
for p in splot.patches:
    bar_end = p.get_width()
    bar_middle = p.get_y() + p.get_height() / 2
    splot.annotate(f" {bar_end:.2f}%", xy=(bar_end, bar_middle),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color='#707780')

chart_title = (
    "% of buyers who repurchase from\n"
    "the same producer\n"
    "(from all buyers who repurchase)"
)
splot.set_title(chart_title, color='#707780', fontsize=40, loc='left', pad=30)

# Horizontal position (x data units) of each "(N: ...)" label. These values are
# hand-tuned to clear the percentage text and must be re-tuned if the shares move.
n_label_x = {
    'Global': 65,
    'Amsterdam': 125,
    'USA': 122,
    'Spain': 115,
    'Mexico': 115,
    'Colombia': 80,
    'Brazil': 65,
}
for n in list(global_df['local']):
    if n not in n_label_x:
        continue
    # The row comes from the computed bar order, not from a fixed number per
    # office, so the label follows its bar when the ranking changes.
    row = list_order.index(n)
    producers = int(global_df.loc[global_df['local'] == n, 'num_producers'].iloc[0])
    # No arrow is drawn, so xy only has to lie inside the axes for the text to
    # be rendered; anchoring it on the label's own row guarantees that.
    splot.annotate(f"(N: {producers:,})",
                   xy=(27, row), xytext=(n_label_x[n], row + 0.1),
                   color='#707780', fontsize=25)

plt.yticks(fontsize=50)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 100)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780', labelsize=50, pad=30)
# drop the frame and the x axis so only bars and labels remain
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Producers < 10k BRL LTV (all users)

- For this analysis, the 2nd purchase can be from any producer (regardless of whether that producer is below or above the GMV LTV threshold). In this context, "2nd purchase" can refer to any consecutive pair of orders (1st-2nd, 2nd-3rd, 3rd-4th, etc.).
- N = number of customers

In [None]:
# Buyer classes per producer LTV cohort ('below' / 'above' in
# is_below_10_entire), one panel per cohort.
# Open a dedicated 30 x 10 figure like the other two-panel cohort charts in this
# notebook, instead of drawing into whatever figure happens to be current.
plt.figure(figsize=(30, 10))

# purchases per buyer over the whole of df3; identical for both cohorts, so it
# is computed once outside the loop
buyer_totals = df3.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_nofilter')

for n, d in enumerate(['below', 'above']):
    plt.subplot(1, 2, n + 1)
    # purchases made from producers of this cohort, and the buyers behind them
    aux2 = df3[df3['is_below_10_entire'] == d]
    aux1 = aux2[['user_buyer_id']].drop_duplicates()
    # purchases per buyer inside the cohort
    aux3 = aux2.groupby('user_buyer_id')['purchase_id'].count().reset_index(name='no_repurchase_filter')
    aux1 = aux1.merge(aux3, how='left', on='user_buyer_id')
    aux1 = aux1.merge(buyer_totals, how='left', on='user_buyer_id')

    # First matching rule wins, in the order listed:
    #   more than one purchase inside the cohort      -> same_producer_repurchase
    #   exactly one purchase overall                  -> no_repurchase
    #   one purchase in the cohort but more overall   -> diff_producer_repurchase
    # NOTE(review): "same_producer" here means repeated purchases from producers
    # of this cohort; the count is per buyer, not per (buyer, producer) pair.
    in_cohort = aux1['no_repurchase_filter']
    overall = aux1['no_repurchase_nofilter']
    aux1['class'] = np.select(
        [in_cohort > 1, overall == 1, (in_cohort == 1) & (overall > 1)],
        ['same_producer_repurchase', 'no_repurchase', 'diff_producer_repurchase'],
        default='NA',
    )
    # unique buyers per class and their share
    aux2 = aux1[['user_buyer_id', 'class']].groupby('class').nunique().reset_index()
    aux2['%'] = aux2['user_buyer_id'] / aux2['user_buyer_id'].sum() * 100

    # bars with the percentage written at their end
    sns.set_style('white')
    splot = sns.barplot(data=aux2, x='%', y='class', palette=palette_types,
                        order=['no_repurchase', 'diff_producer_repurchase', 'same_producer_repurchase'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {(aux2['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # category labels only on the left-hand panel (there are just two panels)
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producers < 10k BRL LTV (only users who repurchase)

- For this analysis, the 2nd purchase can be from any producer (regardless of whether that producer is below or above the GMV LTV threshold).
- N = number of customers

In [None]:
# Share of repurchasing buyers who came back to the same producer vs. a different one,
# drawn as one panel per producer GMV LTV cohort ('below' / 'above').
# Buyers with a single purchase overall ('no_repurchase') are excluded from the shares.
plt.figure(figsize = (30,10))

# purchases per buyer over the whole dataset (does not depend on the cohort, so computed once)
purchases_all = df3.groupby('user_buyer_id')['purchase_id'].count().reset_index()
purchases_all.columns = ['user_buyer_id', 'no_repurchase_nofilter']

for n, d in enumerate(['below', 'above']):
    plt.subplot(1,2,n+1)
    cohort = df3[df3['is_below_10_entire'] == d]

    # buyers of the cohort and their number of purchases inside the cohort
    buyers = cohort[['user_buyer_id']].drop_duplicates()
    purchases_cohort = cohort.groupby('user_buyer_id')['purchase_id'].count().reset_index()
    purchases_cohort.columns = ['user_buyer_id', 'no_repurchase_filter']
    buyers = buyers.merge(purchases_cohort, how = 'left', on = 'user_buyer_id')
    buyers = buyers.merge(purchases_all, how = 'left', on = 'user_buyer_id')

    # classify every buyer; the first matching condition wins, anything else is 'NA'
    in_cohort = buyers['no_repurchase_filter']
    overall = buyers['no_repurchase_nofilter']
    buyers['class'] = np.select(
        [in_cohort > 1, overall == 1, (in_cohort == 1) & (overall > 1)],
        ['same_producer_repurchase', 'no_repurchase', 'diff_producer_repurchase'],
        default = 'NA'
    )

    # keep only buyers who repurchased and compute the share of each class
    buyers = buyers[buyers['class'] != 'no_repurchase']
    shares = buyers[['user_buyer_id', 'class']].groupby('class').nunique().reset_index()
    shares['%'] = shares['user_buyer_id'] / shares['user_buyer_id'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'class', x = '%', data = shares, palette = palette_types, order = ['diff_producer_repurchase', 'same_producer_repurchase']);

    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {(shares['user_buyer_id'].sum()):,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # class labels are shown only on the left-most panel
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

### [CHECK] How much GMV does a customer reinvest in the same producer (against customers who do not)? (by customer country, producer office, producers below 10k BRL LTV)

#### Overall

In [None]:
# Build the (producer, buyer)-level dataset used to compare the GMV of buyers who
# repurchased from the same producer against buyers who did not, and export it to CSV.
# Outputs read by later cells: aux4 (median GMV per group) and aux5 (filtered dataset).
pair_keys = ['producer_id', 'user_buyer_id']

# count purchase vol per (producer, buyer) pair; repurchase = 1 when the pair has more than one purchase
aux1 = df3.groupby(pair_keys)['purchase_id'].count().reset_index()
aux1['repurchase'] = aux1['purchase_id'].gt(1).astype('int64')
aux1.columns = ['producer_id', 'user_buyer_id', 'no_repurchase', 'repurchase']
# sum gmv per (producer, buyer) pair
aux2 = df3.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
# calc mean ticket price per producer
aux3 = df3.groupby('producer_id')['gmv_value_brl'].mean().reset_index()
aux3.columns = ['producer_id', 'mean_ticket']
# calc number of products per producer
products_per_producer = df3.groupby('producer_id')['product_id'].nunique().reset_index()
products_per_producer.columns = ['producer_id', 'num_products']

# merge
aux5 = (
    aux1.merge(aux2, how = 'left', on = pair_keys)
        .merge(aux3, how = 'left', on = ['producer_id'])
        .merge(products_per_producer, how = 'left', on = ['producer_id'])
)

# keep strictly positive GMV (required for the log below) and drop everything
# at or above the 99th percentile; the percentile is computed once and reused
aux5 = aux5[aux5['gmv_value_brl'] > 0]
gmv_p99 = np.percentile(aux5['gmv_value_brl'], 99)
print(f"99th perc. gmv was removed: {gmv_p99:,.2f} BRL")
# .copy() so that the column added below is written to an independent frame
aux5 = aux5[aux5['gmv_value_brl'] < gmv_p99].copy()

# get medians (computed before buyers with a single purchase are removed below)
aux4 = aux5.groupby('repurchase')['gmv_value_brl'].median().reset_index()
aux4['repurchase'] = np.where(aux4['repurchase'] == 1, 'repurchased', 'did not repurchase')

aux5['ln'] = np.log(aux5['gmv_value_brl'])
# attach the buyer-level repurchase class and drop buyers classified as 'no_repurchase'
aux5 = aux5.merge(df3[['repurchase_same_producer', 'user_buyer_id']].drop_duplicates(), how = 'left', on = 'user_buyer_id')
aux5 = aux5[aux5['repurchase_same_producer'] != 'no_repurchase']
aux5.to_csv('data/gmv_sum_repurchase_by_producer.csv')
# # https://www.statsmodels.org/stable/examples/notebooks/generated/mixed_lm_example.html
# md = smf.mixedlm("np.log(gmv_value_brl) ~ repurchase", aux5, groups=aux5["producer_id"])
# mdf = md.fit(method=["lbfgs"])
# print(mdf.summary())

In [None]:
# sns.boxplot(data = aux5[aux5['gmv_value_brl'] < 3000], x = 'repurchase', y = 'gmv_value_brl');

In [None]:
# get correct % changes: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# A dummy coefficient b on a log-scale outcome is converted to (exp(b) - 1) * 100 %.
# NOTE(review): 0.65 is hard-coded; presumably the 'repurchase' coefficient of the
# regression fitted on the exported dataset - confirm.
repurchase_coef = 0.65
relative_gain = math.exp(repurchase_coef) - 1
perc_change = relative_gain * 100
factor = (perc_change / 100) + 1

print(f"Repurchase buyers increase gmv by: {perc_change:.2f}% ({factor:.2f}x)")

In [None]:
# gmv
# Descriptive statistics of the GMV column of aux5, computed separately for rows
# flagged as repurchase (flag == 1) and for all remaining rows.
aux6 = aux5.copy()
aux6['repurchase'] = np.where(aux6['repurchase'] == 1, 'repurchased', 'did not repurchase')
single_purchase = aux6['repurchase'] == 'did not repurchase'

notrep = aux6.loc[single_purchase, ['gmv_value_brl']]
notrep.columns = ['did not repurchase']
rep = aux6.loc[~single_purchase, ['gmv_value_brl']]
rep.columns = ['repurchase']

# one row of statistics per group, stacked for display
pd.concat([get_descriptive_statistics(group) for group in (notrep, rep)])

In [None]:
# min_ci, max_ci = stats.t.interval(0.95, len(notrep)-1, loc=np.mean(notrep), scale=stats.sem(notrep))
# print(f"CI - did not repurchase: {min_ci[0]} {max_ci[0]} ")
# min_ci, max_ci = stats.t.interval(0.95, len(rep)-1, loc=np.mean(rep), scale=stats.sem(rep))
# print(f"CI - repurchase: {min_ci[0]} {max_ci[0]} ")

In [None]:
# Horizontal bar chart of the median GMV stored in aux4, one bar per repurchase group.
# NOTE(review): title and axis label say USD while the plotted column is named
# 'gmv_value_brl' - confirm which currency the values are in.
plt.figure(figsize = (10,10))
sns.set_style('white')
splot = sns.barplot(
    data = aux4,
    x = 'gmv_value_brl',
    y = 'repurchase',
    order = ['repurchased', 'did not repurchase'],
    palette = palette_types,
)

# write each bar's value just to the right of the bar end
for bar in splot.patches:
    bar_end = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    splot.annotate(f"{bar_end:.2f}", xy = (bar_end, bar_middle),
                   xytext = (5, 0), textcoords = 'offset points', ha = 'left', va = 'center')

splot.set_title("Median GMV (USD) ")
plt.yticks(fontsize = 16)
splot.set_xlabel("USD")
splot.set_ylabel("")
splot.set_xlim(0, 500)
plt.show()

#### Customer country

In [None]:
# Median GMV per (producer, buyer) pair, repurchase vs. single purchase,
# drawn as one panel for each of the 10 customer countries with most purchases.
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
country_counts = country_counts.sort_values('purchase_id', ascending=False)
top10 = country_counts['user_country'].head(10).tolist()
plt.figure(figsize = (30,10))

pair_keys = ['producer_id', 'user_buyer_id']
for n, d in enumerate(top10):
    plt.subplot(2,5,n+1)
    country_df = df3[df3['user_country'] == d]

    # purchases per (producer, buyer) pair inside the country; more than one means a repurchase
    pair_counts = country_df.groupby(pair_keys)['purchase_id'].count().reset_index()
    pair_counts['same_prod'] = np.where(pair_counts['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # total GMV of each pair
    pair_gmv = country_df.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    pairs = pair_counts.merge(pair_gmv, how = 'left', on = pair_keys)
    # median GMV of each group
    medians = pairs.groupby('same_prod')['gmv_value_brl'].median().reset_index()

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(data = medians, x = 'gmv_value_brl', y = 'same_prod',
                        order = ['repurchased', 'did not repurchase'], palette = palette_types)
    for bar in splot.patches:
        bar_end = bar.get_width()
        splot.annotate(f"{bar_end:.2f}", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                       xytext = (5, 0), textcoords = 'offset points', ha = 'left', va = 'center')

    splot.set_title(f"{d}")
    plt.yticks(fontsize = 16)
    splot.set_xlabel("Median USD")
    splot.set_ylabel("")
    splot.set_xlim(0, 800)
    # group labels only on the first panel of each row (panels 0 and 5)
    if n != 0 and n != 5:
        splot.axes.get_yaxis().set_visible(False)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producer segment

In [None]:
#### plot median
# Median GMV per (producer, buyer) pair, repurchase vs. single purchase,
# drawn as one panel per producer segment.
plt.figure(figsize = (30,10))

pair_keys = ['producer_id', 'user_buyer_id']
for n, d in enumerate(['SEED','SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(2,3,n+1)
    segment_df = df3[df3['segment'] == d]

    # purchases per (producer, buyer) pair inside the segment; more than one means a repurchase
    pair_counts = segment_df.groupby(pair_keys)['purchase_id'].count().reset_index()
    pair_counts['same_prod'] = np.where(pair_counts['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # total GMV of each pair
    pair_gmv = segment_df.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    pairs = pair_counts.merge(pair_gmv, how = 'left', on = pair_keys)
    # median GMV of each group
    medians = pairs.groupby('same_prod')['gmv_value_brl'].median().reset_index()

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(data = medians, x = 'gmv_value_brl', y = 'same_prod',
                        order = ['repurchased', 'did not repurchase'], palette = palette_types)
    for bar in splot.patches:
        bar_end = bar.get_width()
        splot.annotate(f"{bar_end:.2f}", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                       xytext = (5, 0), textcoords = 'offset points', ha = 'left', va = 'center')

    splot.set_title(f"{d}")
    plt.yticks(fontsize = 16)
    splot.set_xlabel("Median USD")
    splot.set_ylabel("")
    splot.set_xlim(0, 1200)
    # group labels only on the first panel of each row of the 2x3 grid (panels 0 and 3)
    splot.axes.get_yaxis().set_visible(n in (0, 3))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
#### REGRESSION
# For each producer segment, build the (producer, buyer)-level dataset (purchase count,
# 0/1 repurchase flag, summed GMV, producer mean ticket, producer product count, ln(GMV),
# buyer repurchase class) and export it to data/gmv_sum_repurchase_by_producer_<segment>.csv.
# All quantities are recomputed using only the purchases of the current segment.
pair_keys = ['producer_id', 'user_buyer_id']

for d in ['SEED','SMALL', 'MEDIUM', 'LARGE']:
    print(d)
    # purchases of the segment, without the dataset-wide repurchase columns (recomputed below)
    aux1 = df3[df3['segment'] == d].drop(['repurchase', 'no_repurchase', 'repurchase_same_producer'], axis = 1)

    # count purchase vol per (producer, buyer) pair; repurchase = 1 when more than one purchase
    aux2 = aux1.groupby(pair_keys)['purchase_id'].count().reset_index()
    aux2['repurchase'] = aux2['purchase_id'].gt(1).astype('int64')
    aux2.columns = ['producer_id', 'user_buyer_id', 'no_repurchase', 'repurchase']
    # sum gmv per pair
    aux3 = aux1.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    # calc mean ticket price per producer
    aux4 = aux1.groupby('producer_id')['gmv_value_brl'].mean().reset_index()
    aux4.columns = ['producer_id', 'mean_ticket']
    # calc number of products per producer
    aux5 = aux1.groupby('producer_id')['product_id'].nunique().reset_index()
    aux5.columns = ['producer_id', 'num_products']

    # merge
    aux6 = (
        aux2.merge(aux3, how = 'left', on = pair_keys)
            .merge(aux4, how = 'left', on = ['producer_id'])
            .merge(aux5, how = 'left', on = ['producer_id'])
    )

    # keep strictly positive GMV (required for the log) and drop everything at or above
    # the 99th percentile; the percentile is computed once and reused
    aux6 = aux6[aux6['gmv_value_brl'] > 0]
    gmv_p99 = np.percentile(aux6['gmv_value_brl'], 99)
    print(f"99th perc. gmv was removed: {gmv_p99:,.2f} BRL")
    # .copy() so that the column added below is written to an independent frame
    aux6 = aux6[aux6['gmv_value_brl'] < gmv_p99].copy()
    aux6['ln'] = np.log(aux6['gmv_value_brl'])

    ### count purchases per user
    rep = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index()
    rep.columns = ['user_buyer_id', 'no_repurchase']
    rep = aux1.merge(rep, how = 'left', on = 'user_buyer_id')
    ### flag repurchase
    rep['repurchase'] = np.where(rep['no_repurchase'] != 1, 'repurchase', 'single-purchase')
    ### flag whether a user repurchased from same producer at least once
    aux7 = rep[rep['repurchase'] == 'repurchase'].groupby(pair_keys)['purchase_id'].count().reset_index()
    aux7['same_prod'] = np.where(aux7['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # get users who repurchased from same producer
    aux8 = aux7.loc[aux7['same_prod'] == 'repurchased', ['user_buyer_id']].drop_duplicates()
    aux8['repurchase_same_producer'] = 'same_producer_repurchase'
    # single-purchase users -> 'no_repurchase'; remaining repurchasers -> 'diff_producer_repurchase'
    rep = rep.merge(aux8, how = 'left', on = 'user_buyer_id')
    rep.loc[rep['repurchase'] == 'single-purchase', 'repurchase_same_producer'] = 'no_repurchase'
    rep['repurchase_same_producer'] = rep['repurchase_same_producer'].fillna('diff_producer_repurchase')

    # attach the buyer class, drop single-purchase buyers and save
    aux6 = aux6.merge(rep[['repurchase_same_producer', 'user_buyer_id']].drop_duplicates(), how = 'left', on = 'user_buyer_id')
    aux6 = aux6[aux6['repurchase_same_producer'] != 'no_repurchase']
    aux6.to_csv(f'data/gmv_sum_repurchase_by_producer_{d}.csv')

In [None]:
# get correct % changes: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# Each log-scale dummy coefficient b is converted to (exp(b) - 1) * 100 %, rounded to 2 decimals.
# NOTE(review): coefficients are hard-coded per producer segment; presumably taken from
# regressions fitted on the exported per-segment data - confirm.
segment_coefs = (('Seed', 0.52), ('Small', 0.48), ('Medium', 0.61), ('Large', 1.30))
dict_country = {label: round(((math.exp(coef) - 1) * 100), 2) for label, coef in segment_coefs}

for label, pct in dict_country.items():
    print(f"{label} : Changing from buyers who don't repurchase to repurchase buyers increase gmv by {pct}%")

# table of percentage changes plus the equivalent multiplicative factor, used for plotting
aux1 = pd.DataFrame(list(dict_country.items()), columns = ['local', '%'])
aux1['factor'] = (aux1['%'] / 100) + 1

In [None]:
# display the table of % change and multiplicative factor
aux1

In [None]:

# Horizontal bar chart of the multiplicative GMV factor in aux1, one grey bar per
# producer segment, with the factor written next to each bar and all spines/x-axis hidden.
plt.figure(figsize = (10,15))
sns.set_style('white')

text_color = '#707780'
segment_palette = dict.fromkeys(['Seed', 'Small', 'Medium', 'Large'], '#9EA4AC')
splot = sns.barplot(data = aux1, x = 'factor', y = 'local',
                    palette = segment_palette, order = ['Large', 'Medium', 'Small', 'Seed'])

# annotate each bar with its factor, e.g. " 1.68x"
for bar in splot.patches:
    bar_end = bar.get_width()
    splot.annotate(f" {bar_end:.2f}x", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                   xytext = (5, 0), textcoords = 'offset points',
                   ha = 'left', va = 'center', fontsize = 50, color = text_color)

splot.set_title("How much GMV do buyers who repurchase\nfrom the same producer generate\nover buyers who do not?\n(by producer segment)",
                color = text_color, fontsize = 40, loc = 'left', pad = 30)

plt.yticks(fontsize = 50)
splot.set_xlabel("")
splot.set_ylabel("")
splot.set_xlim(1, 6)
splot.tick_params(labelsize = 30)
splot.yaxis.label.set_color(text_color)
splot.spines['left'].set_color(text_color)
splot.tick_params(axis = 'y', colors = text_color, labelsize = 50, pad = 30)
# remove the frame and the x-axis; the values are read from the annotations
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Producer office

In [None]:
#### plot median
# Median GMV per (producer, buyer) pair, repurchase vs. single purchase,
# drawn as one panel per producer office.

# offices ordered by number of purchases (descending)
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
office = office_counts.sort_values('purchase_id', ascending=False)['user_office_name'].tolist()

plt.figure(figsize = (30,10))

pair_keys = ['producer_id', 'user_buyer_id']
for n, d in enumerate(office):
    # NOTE(review): the 2x3 grid has room for at most 6 offices
    plt.subplot(2,3,n+1)
    office_df = df3[df3['user_office_name'] == d]

    # purchases per (producer, buyer) pair inside the office; more than one means a repurchase
    pair_counts = office_df.groupby(pair_keys)['purchase_id'].count().reset_index()
    pair_counts['same_prod'] = np.where(pair_counts['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # total GMV of each pair
    pair_gmv = office_df.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    pairs = pair_counts.merge(pair_gmv, how = 'left', on = pair_keys)
    # median GMV of each group
    medians = pairs.groupby('same_prod')['gmv_value_brl'].median().reset_index()

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(data = medians, x = 'gmv_value_brl', y = 'same_prod',
                        order = ['repurchased', 'did not repurchase'], palette = palette_types)
    for bar in splot.patches:
        bar_end = bar.get_width()
        splot.annotate(f"{bar_end:.2f}", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                       xytext = (5, 0), textcoords = 'offset points', ha = 'left', va = 'center')

    splot.set_title(f"{d}")
    plt.yticks(fontsize = 16)
    splot.set_xlabel("Median USD")
    splot.set_ylabel("")
    splot.set_xlim(0, 1200)
    # group labels only on the first panel of each row (panels 0 and 3)
    splot.axes.get_yaxis().set_visible(n in (0, 3))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:



# For each producer office, build the (producer, buyer)-level dataset (purchase count,
# 0/1 repurchase flag, summed GMV, producer mean ticket, producer product count, ln(GMV),
# buyer repurchase class) and export it to data/gmv_sum_repurchase_by_producer_<office>.csv.
# All quantities are recomputed using only the purchases of the current office.

# offices ordered by number of purchases (descending)
office = list(df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index().sort_values('purchase_id', ascending=False)['user_office_name'])
pair_keys = ['producer_id', 'user_buyer_id']

for d in office:
    print(d)
    # purchases of the office, without the dataset-wide repurchase columns (recomputed below)
    aux1 = df3[df3['user_office_name'] == d].drop(['repurchase', 'no_repurchase', 'repurchase_same_producer'], axis = 1)

    # count purchase vol per (producer, buyer) pair; repurchase = 1 when more than one purchase
    aux2 = aux1.groupby(pair_keys)['purchase_id'].count().reset_index()
    aux2['repurchase'] = aux2['purchase_id'].gt(1).astype('int64')
    aux2.columns = ['producer_id', 'user_buyer_id', 'no_repurchase', 'repurchase']
    # sum gmv per pair
    aux3 = aux1.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    # calc mean ticket price per producer
    aux4 = aux1.groupby('producer_id')['gmv_value_brl'].mean().reset_index()
    aux4.columns = ['producer_id', 'mean_ticket']
    # calc number of products per producer
    aux5 = aux1.groupby('producer_id')['product_id'].nunique().reset_index()
    aux5.columns = ['producer_id', 'num_products']

    # merge
    aux6 = (
        aux2.merge(aux3, how = 'left', on = pair_keys)
            .merge(aux4, how = 'left', on = ['producer_id'])
            .merge(aux5, how = 'left', on = ['producer_id'])
    )

    # keep strictly positive GMV (required for the log) and drop everything at or above
    # the 99th percentile; the percentile is computed once and reused
    aux6 = aux6[aux6['gmv_value_brl'] > 0]
    gmv_p99 = np.percentile(aux6['gmv_value_brl'], 99)
    print(f"99th perc. gmv was removed: {gmv_p99:,.2f} BRL")
    # .copy() so that the column added below is written to an independent frame
    aux6 = aux6[aux6['gmv_value_brl'] < gmv_p99].copy()
    aux6['ln'] = np.log(aux6['gmv_value_brl'])

    ### count purchases per user
    rep = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index()
    rep.columns = ['user_buyer_id', 'no_repurchase']
    rep = aux1.merge(rep, how = 'left', on = 'user_buyer_id')
    ### flag repurchase
    rep['repurchase'] = np.where(rep['no_repurchase'] != 1, 'repurchase', 'single-purchase')
    ### flag whether a user repurchased from same producer at least once
    aux7 = rep[rep['repurchase'] == 'repurchase'].groupby(pair_keys)['purchase_id'].count().reset_index()
    aux7['same_prod'] = np.where(aux7['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # get users who repurchased from same producer
    aux8 = aux7.loc[aux7['same_prod'] == 'repurchased', ['user_buyer_id']].drop_duplicates()
    aux8['repurchase_same_producer'] = 'same_producer_repurchase'
    # single-purchase users -> 'no_repurchase'; remaining repurchasers -> 'diff_producer_repurchase'
    rep = rep.merge(aux8, how = 'left', on = 'user_buyer_id')
    rep.loc[rep['repurchase'] == 'single-purchase', 'repurchase_same_producer'] = 'no_repurchase'
    rep['repurchase_same_producer'] = rep['repurchase_same_producer'].fillna('diff_producer_repurchase')

    # attach the buyer class, drop single-purchase buyers and save
    aux6 = aux6.merge(rep[['repurchase_same_producer', 'user_buyer_id']].drop_duplicates(), how = 'left', on = 'user_buyer_id')
    aux6 = aux6[aux6['repurchase_same_producer'] != 'no_repurchase']
    aux6.to_csv(f'data/gmv_sum_repurchase_by_producer_{d}.csv')

In [None]:
# get correct % changes: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# Each log-scale dummy coefficient b is converted to (exp(b) - 1) * 100 %, rounded to 2 decimals.
# NOTE(review): coefficients are hard-coded per producer office (plus the global one);
# presumably taken from regressions fitted on the exported data - confirm.
office_coefs = (
    ('Global', 0.65),
    ('Brazil', 0.61),
    ('Colombia', 0.55),
    ('Mexico', 0.58),
    ('Spain', 0.66),
    ('USA', 0.52),
    ('Amsterdam', 0.98),
)
dict_country = {label: round(((math.exp(coef) - 1) * 100), 2) for label, coef in office_coefs}

for label, pct in dict_country.items():
    print(f"{label} : Changing from buyers who don't repurchase to repurchase buyers increase gmv by {pct}%")


In [None]:
# plot perc changes
# Horizontal bar chart of the multiplicative GMV factor per producer office:
# 'Global' is highlighted and listed first, the offices follow by descending factor.
aux1 = pd.DataFrame(list(dict_country.items()), columns = ['local', '%'])
aux1['factor'] = (aux1['%'] / 100) + 1

ranked = list(aux1.sort_values('factor', ascending=False)['local'])
list_order = ['Global'] + [name for name in ranked if name != 'Global']

plt.figure(figsize = (10,15))
sns.set_style('white')

text_color = '#707780'
office_palette = dict.fromkeys(['Brazil', 'Colombia', 'Mexico', 'Spain', 'USA', 'Amsterdam'], '#9EA4AC')
office_palette['Global'] = '#EF4E23'
splot = sns.barplot(data = aux1, x = 'factor', y = 'local',
                    palette = office_palette, order = list_order)

# annotate each bar with its factor, e.g. " 1.92x"
for bar in splot.patches:
    bar_end = bar.get_width()
    splot.annotate(f" {bar_end:.2f}x", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                   xytext = (5, 0), textcoords = 'offset points',
                   ha = 'left', va = 'center', fontsize = 50, color = text_color)

splot.set_title("How much GMV do buyers who repurchase\nfrom the same producer generate\nover buyers who do not?\n(by producer office)",
                color = text_color, fontsize = 40, loc = 'left', pad = 30)

plt.yticks(fontsize = 50)
splot.set_xlabel("")
splot.set_ylabel("")
splot.set_xlim(0, 6)
splot.tick_params(labelsize = 30)
splot.yaxis.label.set_color(text_color)
splot.spines['left'].set_color(text_color)
splot.tick_params(axis = 'y', colors = text_color, labelsize = 50, pad = 30)
# remove the frame and the x-axis; the values are read from the annotations
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Producers < 10k BRL LTV

- For this analysis, medians were calculated separately for each producer GMV LTV cohort. Therefore, "repurchased" means a repurchase from the same producer within a cohort, and "did not repurchase" means a single purchase within the cohort; repurchases across different cohorts are not considered.

In [None]:
# Median GMV per (producer, buyer) pair, repurchase vs. single purchase,
# drawn as one panel per producer GMV LTV cohort ('below' / 'above').
plt.figure(figsize = (30,10))

pair_keys = ['producer_id', 'user_buyer_id']
for n, d in enumerate(['below', 'above']):
    plt.subplot(1,2,n+1)
    cohort_df = df3[df3['is_below_10_entire'] == d]

    # purchases per (producer, buyer) pair inside the cohort; more than one means a repurchase
    pair_counts = cohort_df.groupby(pair_keys)['purchase_id'].count().reset_index()
    pair_counts['same_prod'] = np.where(pair_counts['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # total GMV of each pair
    pair_gmv = cohort_df.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    pairs = pair_counts.merge(pair_gmv, how = 'left', on = pair_keys)
    # median GMV of each group
    medians = pairs.groupby('same_prod')['gmv_value_brl'].median().reset_index()

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(data = medians, x = 'gmv_value_brl', y = 'same_prod',
                        order = ['repurchased', 'did not repurchase'], palette = palette_types)
    for bar in splot.patches:
        bar_end = bar.get_width()
        splot.annotate(f"{bar_end:.2f}", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                       xytext = (5, 0), textcoords = 'offset points', ha = 'left', va = 'center')

    splot.set_title(f"{d}")
    plt.yticks(fontsize = 16)
    splot.set_xlabel("Median USD")
    splot.set_ylabel("")
    splot.set_xlim(0, 400)
    # group labels only on the left-most panel
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

In [None]:
#### REGRESSION
# For each producer GMV LTV cohort, build the (producer, buyer)-level dataset (purchase
# count, 0/1 repurchase flag, summed GMV, producer mean ticket, producer product count,
# ln(GMV), buyer repurchase class) and export it to
# data/gmv_sum_repurchase_by_producer_PROD_LTV_<cohort>.csv.
# All quantities are recomputed using only the purchases of the current cohort.
pair_keys = ['producer_id', 'user_buyer_id']

for d in ['below',  'above']:
    print(d)
    # purchases of the cohort, without the dataset-wide repurchase columns (recomputed below)
    aux1 = df3[df3['is_below_10_entire'] == d].drop(['repurchase', 'no_repurchase', 'repurchase_same_producer'], axis = 1)

    # count purchase vol per (producer, buyer) pair; repurchase = 1 when more than one purchase
    aux2 = aux1.groupby(pair_keys)['purchase_id'].count().reset_index()
    aux2['repurchase'] = aux2['purchase_id'].gt(1).astype('int64')
    aux2.columns = ['producer_id', 'user_buyer_id', 'no_repurchase', 'repurchase']
    # sum gmv per pair
    aux3 = aux1.groupby(pair_keys)['gmv_value_brl'].sum().reset_index()
    # calc mean ticket price per producer
    aux4 = aux1.groupby('producer_id')['gmv_value_brl'].mean().reset_index()
    aux4.columns = ['producer_id', 'mean_ticket']
    # calc number of products per producer
    aux5 = aux1.groupby('producer_id')['product_id'].nunique().reset_index()
    aux5.columns = ['producer_id', 'num_products']

    # merge
    aux6 = (
        aux2.merge(aux3, how = 'left', on = pair_keys)
            .merge(aux4, how = 'left', on = ['producer_id'])
            .merge(aux5, how = 'left', on = ['producer_id'])
    )

    # keep strictly positive GMV (required for the log) and drop everything at or above
    # the 99th percentile; the percentile is computed once and reused
    aux6 = aux6[aux6['gmv_value_brl'] > 0]
    gmv_p99 = np.percentile(aux6['gmv_value_brl'], 99)
    print(f"99th perc. gmv was removed: {gmv_p99:,.2f} BRL")
    # .copy() so that the column added below is written to an independent frame
    aux6 = aux6[aux6['gmv_value_brl'] < gmv_p99].copy()
    aux6['ln'] = np.log(aux6['gmv_value_brl'])

    ### count purchases per user
    rep = aux1.groupby('user_buyer_id')['purchase_id'].count().reset_index()
    rep.columns = ['user_buyer_id', 'no_repurchase']
    rep = aux1.merge(rep, how = 'left', on = 'user_buyer_id')
    ### flag repurchase
    rep['repurchase'] = np.where(rep['no_repurchase'] != 1, 'repurchase', 'single-purchase')
    ### flag whether a user repurchased from same producer at least once
    aux7 = rep[rep['repurchase'] == 'repurchase'].groupby(pair_keys)['purchase_id'].count().reset_index()
    aux7['same_prod'] = np.where(aux7['purchase_id'] > 1, 'repurchased', 'did not repurchase')
    # get users who repurchased from same producer
    aux8 = aux7.loc[aux7['same_prod'] == 'repurchased', ['user_buyer_id']].drop_duplicates()
    aux8['repurchase_same_producer'] = 'same_producer_repurchase'
    # single-purchase users -> 'no_repurchase'; remaining repurchasers -> 'diff_producer_repurchase'
    rep = rep.merge(aux8, how = 'left', on = 'user_buyer_id')
    rep.loc[rep['repurchase'] == 'single-purchase', 'repurchase_same_producer'] = 'no_repurchase'
    rep['repurchase_same_producer'] = rep['repurchase_same_producer'].fillna('diff_producer_repurchase')

    # attach the buyer class, drop single-purchase buyers and save
    aux6 = aux6.merge(rep[['repurchase_same_producer', 'user_buyer_id']].drop_duplicates(), how = 'left', on = 'user_buyer_id')
    aux6 = aux6[aux6['repurchase_same_producer'] != 'no_repurchase']
    aux6.to_csv(f'data/gmv_sum_repurchase_by_producer_PROD_LTV_{d}.csv')

In [None]:
# get correct % changes: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# Each log-scale dummy coefficient b is converted to (exp(b) - 1) * 100 %, rounded to 2 decimals.
# NOTE(review): coefficients are hard-coded per producer LTV cohort; presumably taken from
# regressions fitted on the exported per-cohort data - confirm.
cohort_coefs = (('below', 0.39), ('above', 0.65))
dict_country = {label: round(((math.exp(coef) - 1) * 100), 2) for label, coef in cohort_coefs}

for label, pct in dict_country.items():
    print(f"{label} : Changing from buyers who don't repurchase to repurchase buyers increase gmv by {pct}%")

# table of percentage changes plus the equivalent multiplicative factor, used for plotting
aux1 = pd.DataFrame(list(dict_country.items()), columns = ['local', '%'])
aux1['factor'] = (aux1['%'] / 100) + 1

In [None]:

# Horizontal bar chart of the multiplicative GMV factor in aux1 for the two producer
# LTV cohorts ('below' highlighted), with all spines and the x-axis hidden.
plt.figure(figsize = (10,15))
sns.set_style('white')

text_color = '#707780'
cohort_palette = {'below': '#EF4E23', 'above': '#9EA4AC'}
splot = sns.barplot(data = aux1, x = 'factor', y = 'local',
                    palette = cohort_palette, order = ['above', 'below'])

# annotate each bar with its factor, e.g. " 1.48x"
for bar in splot.patches:
    bar_end = bar.get_width()
    splot.annotate(f" {bar_end:.2f}x", xy = (bar_end, bar.get_y() + bar.get_height() / 2),
                   xytext = (5, 0), textcoords = 'offset points',
                   ha = 'left', va = 'center', fontsize = 50, color = text_color)

splot.set_title("How much GMV do buyers who repurchase\nfrom the same producer generate\nover buyers who do not?",
                color = text_color, fontsize = 40, loc = 'left', pad = 30)

plt.yticks(fontsize = 50)
splot.set_xlabel("")
splot.set_ylabel("")
splot.set_xlim(1, 4)
splot.tick_params(labelsize = 30)
splot.yaxis.label.set_color(text_color)
splot.spines['left'].set_color(text_color)
splot.tick_params(axis = 'y', colors = text_color, labelsize = 50, pad = 30)
# remove the frame and the x-axis; the values are read from the annotations
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

### How many customers purchase a 2nd time from a different producer given that 1st purchase was from a producer with a single product? (by customer country, producer office, producers below 10k BRL LTV)

#### Overall

In [None]:
# Compare the 1st and 2nd purchase of every repeat buyer:
#   aux3 -> one row per 2nd purchase, with the producer of the buyer's 1st purchase attached
#   aux4 -> count / % per "answer" (2nd purchase from a different producer AND
#           1st producer had a single product)
#   aux5 -> count / % per "same_producer" (same vs. different producer)
is_repeat_buyer = df3['repurchase'] == 'repurchase'

# 1st purchase of each repeat buyer: producer and that producer's product count
aux1 = df3.loc[
    is_repeat_buyer & (df3['order_purchase'] == 1),
    ['user_buyer_id', 'producer_id', 'num_products_producer'],
].rename(columns={'producer_id': 'first_producer_id',
                  'num_products_producer': 'first_num_products_producer'})
# 2nd purchase of each repeat buyer
aux2 = df3.loc[is_repeat_buyer & (df3['order_purchase'] == 2), ['user_buyer_id', 'producer_id']]

# attach the 1st-purchase info to every 2nd purchase
aux3 = aux2.merge(aux1, how='left', on='user_buyer_id')

# Vectorised flags (replaces a Python lambda executed once per row).
# A missing 1st purchase compares unequal and is therefore labelled 'diff', as before.
aux3['same_producer'] = np.where(aux3['first_producer_id'] == aux3['producer_id'], 'same', 'diff')

# customer repurchased from a different producer and the 1st purchase was from a single-product producer?
aux3['answer'] = np.where(
    (aux3['same_producer'] == 'diff') & (aux3['first_num_products_producer'] == 1),
    'yes',
    'no',
)

# volume and share per answer
aux4 = aux3[['answer', 'same_producer']].groupby('answer').count().reset_index()
aux4['%'] = aux4['same_producer'] / aux4['same_producer'].sum() * 100
# volume and share of buyers whose 1st and 2nd purchases were from the same / a different producer
aux5 = aux3[['user_buyer_id', 'same_producer']].groupby('same_producer').count().reset_index()
aux5['%'] = aux5['user_buyer_id'] / aux5['user_buyer_id'].sum() * 100

In [None]:
# Horizontal bars: share of 2nd purchases made from the same vs. a different producer than the 1st
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(data=aux5, x='%', y='same_producer', order=['diff', 'same'], palette=palette_types)

# print each bar's value right after its end
for bar in splot.patches:
    bar_len = bar.get_width()
    splot.annotate(
        "%.2f" % bar_len,
        xy=(bar_len, bar.get_y() + bar.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

total_users = aux5['user_buyer_id'].sum()
plt.title("1st and 2nd purchase: same producer?")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N: {total_users:,} users)")
plt.ylabel("")
# leave room to the right of a 100% bar for its annotation
plt.xlim(0, 120)
plt.show()

In [None]:
# Horizontal bars: share of 2nd purchases per "answer" (yes / no)
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(data=aux4, x='%', y='answer', order=['no', 'yes'], palette=palette_types)

# print each bar's value right after its end
for bar in splot.patches:
    bar_len = bar.get_width()
    splot.annotate(
        "%.2f" % bar_len,
        xy=(bar_len, bar.get_y() + bar.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha='left',
        va='center',
    )

total_users = aux4['same_producer'].sum()
plt.title("Did customer repurchase from a different producer when 1st purchase was from a producer with a single product?")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N: {total_users:,} users)")
plt.ylabel("")
# leave room to the right of a 100% bar for its annotation
plt.xlim(0, 120)
plt.show()

#### Customer country (1st and 2nd purchase: same producer?)

In [None]:
# For each of the 10 countries with the most purchases, plot the share of repeat buyers
# whose 2nd purchase (within that country) was from the same producer as their 1st.
# Only the quantities that reach the plot are computed; the previous per-iteration
# merges for flags that were never plotted have been dropped.
top10 = list(
    df3[['user_country', 'purchase_id']]
    .groupby('user_country').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_country'][0:10]
)
plt.figure(figsize=(35, 10))

for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    purchases = df3[df3['user_country'] == d]

    # purchases per buyer inside this country; any count other than exactly 1 marks a repeat buyer
    purchases_per_buyer = purchases.groupby('user_buyer_id')['purchase_id'].count()
    is_repeat_buyer = purchases['user_buyer_id'].map(purchases_per_buyer) != 1

    # chronological position of each purchase within its buyer (ties broken by row order)
    purchase_rank = purchases.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

    # producer of the 1st purchase, attached to every 2nd purchase
    first_purchase = (
        purchases.loc[is_repeat_buyer & (purchase_rank == 1), ['user_buyer_id', 'producer_id']]
        .rename(columns={'producer_id': 'first_producer_id'})
    )
    second_purchase = purchases.loc[is_repeat_buyer & (purchase_rank == 2), ['user_buyer_id', 'producer_id']]
    pairs = second_purchase.merge(first_purchase, how='left', on='user_buyer_id')

    # vectorised flag; a missing 1st purchase compares unequal and is labelled 'diff'
    pairs['same_producer'] = np.where(pairs['first_producer_id'] == pairs['producer_id'], 'same', 'diff')

    # volume and share of buyers per flag
    same_share = pairs[['user_buyer_id', 'same_producer']].groupby('same_producer').count().reset_index()
    same_share['%'] = same_share['user_buyer_id'] / same_share['user_buyer_id'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='same_producer', x='%', data=same_share, palette=palette_types, order=['same', 'diff'])
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | (N: {same_share['user_buyer_id'].sum():,} users)")
    plt.yticks(fontsize=16)
    plt.xlabel(f"%")
    plt.ylabel("")
    plt.xlim(0, 120)
    # keep the category labels only on the first panel of each row
    if n not in (0, 5):
        splot.axes.get_yaxis().set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.5)
plt.show()

#### Customer country (answer)

In [None]:
# For each of the 10 countries with the most purchases, plot the share of 2nd purchases
# that went to a different producer after a 1st purchase from a single-product producer.
# Only the quantities that reach the plot are computed; the previous per-iteration
# merges for flags that were never plotted have been dropped.
top10 = list(
    df3[['user_country', 'purchase_id']]
    .groupby('user_country').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_country'][0:10]
)
plt.figure(figsize=(35, 10))

for n, d in enumerate(top10):
    plt.subplot(2, 5, n + 1)
    purchases = df3[df3['user_country'] == d]

    # purchases per buyer inside this country; any count other than exactly 1 marks a repeat buyer
    purchases_per_buyer = purchases.groupby('user_buyer_id')['purchase_id'].count()
    is_repeat_buyer = purchases['user_buyer_id'].map(purchases_per_buyer) != 1

    # chronological position of each purchase within its buyer (ties broken by row order)
    purchase_rank = purchases.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

    # distinct products per producer, counted within this country's purchases only
    # NOTE(review): the LTV-segment cells count products over all of df3 -- confirm the subset is intended here
    products_per_producer = purchases.groupby('producer_id')['product_id'].nunique()

    # producer (and its product count) of the 1st purchase, attached to every 2nd purchase
    first_purchase = (
        purchases.loc[is_repeat_buyer & (purchase_rank == 1), ['user_buyer_id', 'producer_id']]
        .rename(columns={'producer_id': 'first_producer_id'})
    )
    first_purchase['first_num_products_producer'] = first_purchase['first_producer_id'].map(products_per_producer)
    second_purchase = purchases.loc[is_repeat_buyer & (purchase_rank == 2), ['user_buyer_id', 'producer_id']]
    pairs = second_purchase.merge(first_purchase, how='left', on='user_buyer_id')

    # vectorised flags; a missing 1st purchase compares unequal and is labelled 'diff'
    pairs['same_producer'] = np.where(pairs['first_producer_id'] == pairs['producer_id'], 'same', 'diff')
    # customer repurchased from a different producer and the 1st purchase was from a single-product producer?
    pairs['answer'] = np.where(
        (pairs['same_producer'] == 'diff') & (pairs['first_num_products_producer'] == 1), 'yes', 'no'
    )

    # volume and share per answer
    answer_share = pairs[['answer', 'same_producer']].groupby('answer').count().reset_index()
    answer_share['%'] = answer_share['same_producer'] / answer_share['same_producer'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='answer', x='%', data=answer_share, palette=palette_types, order=['no', 'yes'])
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | (N: {answer_share['same_producer'].sum():,} users)")
    plt.yticks(fontsize=16)
    plt.xlabel(f"% ")
    plt.ylabel("")
    plt.xlim(0, 140)
    # keep the category labels only on the first panel of each row
    if n not in (0, 5):
        splot.axes.get_yaxis().set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.5)
plt.show()

#### Producer office (1st and 2nd purchase: same producer?)

- For this analysis, 1st and 2nd purchases can be from any producer within the same office. 
- Premise: if a 2nd purchase was from a producer from a different office, this purchase will be discarded. If the 3rd+ purchase comes from the same producer office, then it will be considered as the 2nd purchase.

In [None]:
# For each office (ordered by purchase volume), plot the share of repeat buyers whose
# 2nd purchase within that office was from the same producer as their 1st.
# Only the quantities that reach the plot are computed; the previous per-iteration
# merges for flags that were never plotted have been dropped.
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

# 3 panels per row; at least 2 rows (the original layout), more when there are over 6 offices
# so that plt.subplot never receives an out-of-range panel number
n_cols = 3
n_rows = max(2, math.ceil(len(office) / n_cols))
plt.figure(figsize=(30, 5 * n_rows))

for n, d in enumerate(office):
    plt.subplot(n_rows, n_cols, n + 1)
    purchases = df3[df3['user_office_name'] == d]

    # purchases per buyer inside this office; any count other than exactly 1 marks a repeat buyer
    purchases_per_buyer = purchases.groupby('user_buyer_id')['purchase_id'].count()
    is_repeat_buyer = purchases['user_buyer_id'].map(purchases_per_buyer) != 1

    # chronological position of each purchase within its buyer, ranked inside this office only
    # (ties broken by row order)
    purchase_rank = purchases.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

    # producer of the 1st purchase, attached to every 2nd purchase
    first_purchase = (
        purchases.loc[is_repeat_buyer & (purchase_rank == 1), ['user_buyer_id', 'producer_id']]
        .rename(columns={'producer_id': 'first_producer_id'})
    )
    second_purchase = purchases.loc[is_repeat_buyer & (purchase_rank == 2), ['user_buyer_id', 'producer_id']]
    pairs = second_purchase.merge(first_purchase, how='left', on='user_buyer_id')

    # vectorised flag; a missing 1st purchase compares unequal and is labelled 'diff'
    pairs['same_producer'] = np.where(pairs['first_producer_id'] == pairs['producer_id'], 'same', 'diff')

    # volume and share of buyers per flag
    same_share = pairs[['user_buyer_id', 'same_producer']].groupby('same_producer').count().reset_index()
    same_share['%'] = same_share['user_buyer_id'] / same_share['user_buyer_id'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='same_producer', x='%', data=same_share, palette=palette_types, order=['same', 'diff'])
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | (N: {same_share['user_buyer_id'].sum():,} users)")
    plt.yticks(fontsize=16)
    plt.xlabel(f"% ")
    plt.ylabel("")
    plt.xlim(0, 140)
    # keep the category labels only on the first panel of each row
    splot.axes.get_yaxis().set_visible(n % n_cols == 0)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producer office (answer)

- For this analysis, 1st and 2nd purchases can be from any producer within the same office.
- Premise: if a 2nd purchase was from a producer from a different office, this purchase will be discarded. If the 3rd+ purchase comes from the same producer office, then it will be considered as the 2nd purchase.

In [None]:
# For each office (ordered by purchase volume), plot the share of 2nd purchases that went
# to a different producer after a 1st purchase from a single-product producer.
# Only the quantities that reach the plot are computed; the previous per-iteration
# merges for flags that were never plotted have been dropped.
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

# 3 panels per row; at least 2 rows (the original layout), more when there are over 6 offices
# so that plt.subplot never receives an out-of-range panel number
n_cols = 3
n_rows = max(2, math.ceil(len(office) / n_cols))
plt.figure(figsize=(30, 5 * n_rows))

for n, d in enumerate(office):
    plt.subplot(n_rows, n_cols, n + 1)
    purchases = df3[df3['user_office_name'] == d]

    # purchases per buyer inside this office; any count other than exactly 1 marks a repeat buyer
    purchases_per_buyer = purchases.groupby('user_buyer_id')['purchase_id'].count()
    is_repeat_buyer = purchases['user_buyer_id'].map(purchases_per_buyer) != 1

    # chronological position of each purchase within its buyer, ranked inside this office only
    # (ties broken by row order)
    purchase_rank = purchases.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

    # distinct products per producer, counted within this office's purchases only
    # NOTE(review): the LTV-segment cells count products over all of df3 -- confirm the subset is intended here
    products_per_producer = purchases.groupby('producer_id')['product_id'].nunique()

    # producer (and its product count) of the 1st purchase, attached to every 2nd purchase
    first_purchase = (
        purchases.loc[is_repeat_buyer & (purchase_rank == 1), ['user_buyer_id', 'producer_id']]
        .rename(columns={'producer_id': 'first_producer_id'})
    )
    first_purchase['first_num_products_producer'] = first_purchase['first_producer_id'].map(products_per_producer)
    second_purchase = purchases.loc[is_repeat_buyer & (purchase_rank == 2), ['user_buyer_id', 'producer_id']]
    pairs = second_purchase.merge(first_purchase, how='left', on='user_buyer_id')

    # vectorised flags; a missing 1st purchase compares unequal and is labelled 'diff'
    pairs['same_producer'] = np.where(pairs['first_producer_id'] == pairs['producer_id'], 'same', 'diff')
    # customer repurchased from a different producer and the 1st purchase was from a single-product producer?
    pairs['answer'] = np.where(
        (pairs['same_producer'] == 'diff') & (pairs['first_num_products_producer'] == 1), 'yes', 'no'
    )

    # volume and share per answer
    answer_share = pairs[['answer', 'same_producer']].groupby('answer').count().reset_index()
    answer_share['%'] = answer_share['same_producer'] / answer_share['same_producer'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='answer', x='%', data=answer_share, palette=palette_types, order=['no', 'yes'])
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | (N: {answer_share['same_producer'].sum():,} users)")
    plt.yticks(fontsize=16)
    plt.xlabel(f"% ")
    plt.ylabel("")
    plt.xlim(0, 120)
    # keep the category labels only on the first panel of each row
    splot.axes.get_yaxis().set_visible(n % n_cols == 0)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producers < 10k BRL LTV (1st and 2nd purchase: same producer?)

- For this analysis, only the 1st purchase has to be from a producer in the given segment; the 2nd purchase can be from any producer (regardless of whether that producer is below or above the GMV LTV threshold). Purchases after the 2nd (3rd, 4th, ...) are not considered.

In [None]:
# For buyers whose 1st purchase was from a producer in each LTV segment ('below' / 'above'),
# plot the share whose 2nd purchase was from the same producer as the 1st.
# Only the quantities that reach the plot are computed; the previous per-iteration merges
# and the per-iteration df3 groupby for values that were never plotted have been dropped.
plt.figure(figsize=(30, 10))

for n, d in enumerate(['below', 'above']):
    plt.subplot(1, 2, n + 1)
    # buyers whose first purchase (order_purchase == 1) is flagged with segment d ...
    segment_buyers = df3[(df3['is_below_10_entire'] == d) & (df3['order_purchase'] == 1)][['user_buyer_id']]
    # ... and every purchase those buyers made, in any segment
    purchases = df3.merge(segment_buyers, how='inner', on='user_buyer_id')

    # purchases per buyer; more than one marks a repeat buyer
    purchases_per_buyer = purchases.groupby('user_buyer_id')['purchase_id'].count()
    is_repeat_buyer = purchases['user_buyer_id'].map(purchases_per_buyer) > 1

    # chronological position of each purchase within its buyer (ties broken by row order)
    purchase_rank = purchases.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

    # producer of the 1st purchase (restricted to segment d), attached to every 2nd purchase
    first_purchase = (
        purchases.loc[
            is_repeat_buyer & (purchase_rank == 1) & (purchases['is_below_10_entire'] == d),
            ['user_buyer_id', 'producer_id'],
        ]
        .rename(columns={'producer_id': 'first_producer_id'})
    )
    second_purchase = purchases.loc[is_repeat_buyer & (purchase_rank == 2), ['user_buyer_id', 'producer_id']]
    pairs = second_purchase.merge(first_purchase, how='left', on='user_buyer_id')

    # vectorised flag; a missing 1st purchase compares unequal and is labelled 'diff'
    pairs['same_producer'] = np.where(pairs['first_producer_id'] == pairs['producer_id'], 'same', 'diff')

    # volume and share of buyers per flag
    same_share = pairs[['user_buyer_id', 'same_producer']].groupby('same_producer').count().reset_index()
    same_share['%'] = same_share['user_buyer_id'] / same_share['user_buyer_id'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='same_producer', x='%', data=same_share, palette=palette_types, order=['same', 'diff'])
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | (N: {same_share['user_buyer_id'].sum():,} users)")
    plt.yticks(fontsize=16)
    plt.xlabel(f"% ")
    plt.ylabel("")
    plt.xlim(0, 140)
    # single row of panels: only the first one keeps its category labels
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.2)
plt.show()

####  Producers < 10k BRL LTV (answer)

- For this analysis, only the 1st purchase has to be from a producer in the given segment; the 2nd purchase can be from any producer (regardless of whether that producer is below or above the GMV LTV threshold). Purchases after the 2nd (3rd, 4th, ...) are not considered.

In [None]:
# For buyers whose 1st purchase was from a producer in each LTV segment ('below' / 'above'),
# plot the share of 2nd purchases that went to a different producer after a 1st purchase
# from a single-product producer.
# Only the quantities that reach the plot are computed; the previous per-iteration
# merges for flags that were never plotted have been dropped.
plt.figure(figsize=(30, 10))

# distinct products per producer over all of df3; identical for both segments, so computed once
products_per_producer = df3.groupby('producer_id')['product_id'].nunique()

for n, d in enumerate(['below', 'above']):
    plt.subplot(1, 2, n + 1)
    # buyers whose first purchase (order_purchase == 1) is flagged with segment d ...
    segment_buyers = df3[(df3['is_below_10_entire'] == d) & (df3['order_purchase'] == 1)][['user_buyer_id']]
    # ... and every purchase those buyers made, in any segment
    purchases = df3.merge(segment_buyers, how='inner', on='user_buyer_id')

    # purchases per buyer; more than one marks a repeat buyer
    purchases_per_buyer = purchases.groupby('user_buyer_id')['purchase_id'].count()
    is_repeat_buyer = purchases['user_buyer_id'].map(purchases_per_buyer) > 1

    # chronological position of each purchase within its buyer (ties broken by row order)
    purchase_rank = purchases.groupby("user_buyer_id")["purchase_release_datetime"].rank(method="first", ascending=True)

    # producer (and its product count) of the 1st purchase, restricted to segment d
    first_purchase = (
        purchases.loc[
            is_repeat_buyer & (purchase_rank == 1) & (purchases['is_below_10_entire'] == d),
            ['user_buyer_id', 'producer_id'],
        ]
        .rename(columns={'producer_id': 'first_producer_id'})
    )
    first_purchase['first_num_products_producer'] = first_purchase['first_producer_id'].map(products_per_producer)
    second_purchase = purchases.loc[is_repeat_buyer & (purchase_rank == 2), ['user_buyer_id', 'producer_id']]
    pairs = second_purchase.merge(first_purchase, how='left', on='user_buyer_id')

    # vectorised flags; a missing 1st purchase compares unequal and is labelled 'diff'
    pairs['same_producer'] = np.where(pairs['first_producer_id'] == pairs['producer_id'], 'same', 'diff')
    # customer repurchased from a different producer and the 1st purchase was from a single-product producer?
    pairs['answer'] = np.where(
        (pairs['same_producer'] == 'diff') & (pairs['first_num_products_producer'] == 1), 'yes', 'no'
    )

    # volume and share per answer
    answer_share = pairs[['answer', 'same_producer']].groupby('answer').count().reset_index()
    answer_share['%'] = answer_share['same_producer'] / answer_share['same_producer'].sum() * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='answer', x='%', data=answer_share, palette=palette_types, order=['no', 'yes'])
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | (N: {answer_share['same_producer'].sum():,} users)")
    plt.yticks(fontsize=16)
    plt.xlabel(f"% ")
    plt.ylabel("")
    plt.xlim(0, 120)
    # single row of panels: only the first one keeps its category labels
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.2)
plt.show()

### What is the reorder rate within the same market category? (by customer country, producer office, producer segment, producers below 10k BRL LTV)

Reorder Rate = % of customers who repurchase

#### Overall

In [None]:
# Reorder rate per product category: % of the category's buyers with more than one
# purchase inside that category. Categories without any repeat buyer get no row.
category = list(
    df3[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Collect one single-row frame per category and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.
reorder_rows = []
for d in category:
    # purchases per buyer within this category
    purchases_per_buyer = df3[df3['category_cleaned'] == d].groupby('user_buyer_id')['purchase_id'].count()
    n_buyers = len(purchases_per_buyer)
    n_repeat_buyers = int((purchases_per_buyer > 1).sum())
    if n_repeat_buyers == 0:
        continue
    reorder_rows.append(
        pd.DataFrame({'category_cleaned': [d], '%': [n_repeat_buyers / n_buyers * 100]})
    )

if reorder_rows:
    reorder = pd.concat(reorder_rows)
else:
    # keep the expected schema so the sort below does not fail on an empty result
    reorder = pd.DataFrame(columns=['category_cleaned', '%'])

# plot graph with annotations
reorder = reorder.sort_values('%', ascending=False)

plt.figure(figsize=(15, 10))

sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='%', data=reorder, palette=palette_topics,
                    order=reorder.sort_values('%', ascending=False)['category_cleaned'])

for p in splot.patches:
    splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points', ha='left', va='center')
plt.title(f"Product Category: Reorder Rate (%)")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 80)
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
# Share (%) of each category's GMV that comes from buyers who purchased more
# than once within that category.

# categories ordered from most to fewest purchases
category = list(
    df3[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Collect one record per category and build the frame once: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0.
share_rows = []
for d in category:
    rows = df3[df3['category_cleaned'] == d]
    # number of purchases each buyer made in this category
    purchases_per_buyer = rows.groupby('user_buyer_id')['purchase_id'].count()
    # flag every purchase row by its buyer's behaviour; rows without a buyer
    # id map to NaN and therefore count as 'single-purchase'
    row_flag = np.where(
        rows['user_buyer_id'].map(purchases_per_buyer) > 1,
        'repurchase',
        'single-purchase',
    )
    gmv_by_flag = rows['gmv_value_brl'].groupby(row_flag).sum()
    if 'repurchase' not in gmv_by_flag.index:
        # no repeat buyers: the category gets no row, hence no bar
        continue
    share_rows.append({
        'category_cleaned': d,
        'gmv_share_repurchase': gmv_by_flag['repurchase'] / gmv_by_flag.sum() * 100,
        'gmv_value_brl_repurchase': gmv_by_flag['repurchase'],
    })

share_gmv = pd.DataFrame(
    share_rows,
    columns=['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase'],
)

# plot graph with annotations, highest share on top
share_gmv = share_gmv.sort_values('gmv_share_repurchase', ascending=False)
plt.figure(figsize=(15, 10))

sns.set_style('white')
splot = sns.barplot(
    y='category_cleaned',
    x='gmv_share_repurchase',
    data=share_gmv,
    palette=palette_topics,
    order=share_gmv.sort_values('gmv_share_repurchase', ascending=False)['category_cleaned'],
)

# write the value at the end of each bar
for p in splot.patches:
    splot.annotate("%.2f" % p.get_width(),
                   xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center')
plt.title("Product Category: % GMV captured by users who repurchase (%)")
plt.yticks(fontsize=16)
plt.xlabel("%")
plt.ylabel("")
plt.xlim(0, 80)
# the bars are annotated, so the x axis itself is hidden
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
# Median total GMV (BRL) per buyer, restricted to buyers who purchased more
# than once within the category and whose total GMV is positive.

# categories ordered from most to fewest purchases
category = list(
    df3[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Collect one record per category and build the frame once: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0.
median_rows = []
for d in category:
    by_buyer = df3[df3['category_cleaned'] == d].groupby('user_buyer_id')
    purchases_per_buyer = by_buyer['purchase_id'].count()
    spend_per_buyer = by_buyer['gmv_value_brl'].sum()
    # both series are indexed by buyer id, so the masks line up
    repeat_spend = spend_per_buyer[(purchases_per_buyer > 1) & (spend_per_buyer > 0)]
    if repeat_spend.empty:
        # no repeat buyer with positive GMV: the category gets no row
        continue
    median_rows.append({
        'category_cleaned': d,
        'median_gmv_repurchase': repeat_spend.median(),
    })

median_gmv = pd.DataFrame(median_rows, columns=['category_cleaned', 'median_gmv_repurchase'])

# plot graph with annotations, highest median on top
median_gmv = median_gmv.sort_values('median_gmv_repurchase', ascending=False)
plt.figure(figsize=(15, 10))

sns.set_style('white')
splot = sns.barplot(
    y='category_cleaned',
    x='median_gmv_repurchase',
    data=median_gmv,
    palette=palette_topics,
    order=median_gmv.sort_values('median_gmv_repurchase', ascending=False)['category_cleaned'],
)

# write the value at the end of each bar
for p in splot.patches:
    splot.annotate("%.2f" % p.get_width(),
                   xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center')
# the plotted value is a GMV amount, not a percentage
plt.title("Product Category: Median GMV by users who repurchase (BRL)")
plt.yticks(fontsize=16)
plt.xlabel("BRL")
plt.ylabel("")
# the bars are annotated, so the x axis itself is hidden
splot.axes.get_xaxis().set_visible(False)
plt.show()

In [None]:
# Overall median of the total GMV per buyer, computed category by category
# over every buyer whose total GMV in the category is positive.
purchases_by_category = (
    df3[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)
)
category = list(purchases_by_category['category_cleaned'])

overall_gmv = pd.DataFrame()
for d in category:
    # total GMV per buyer within this category
    in_category = df3['category_cleaned'] == d
    spend_per_buyer = (
        df3.loc[in_category, ['user_buyer_id', 'gmv_value_brl']]
        .groupby(['user_buyer_id'])
        .sum()
        .reset_index()
    )
    # keep only buyers with a positive total before taking the median
    positive_spend = spend_per_buyer[spend_per_buyer['gmv_value_brl'] > 0]
    category_row = pd.DataFrame({
        'category_cleaned': [d],
        'overall_median_gmv': positive_spend['gmv_value_brl'].median(),
    })
    overall_gmv = pd.concat([overall_gmv, category_row])

In [None]:
# Build the main per-category dataset: attach the GMV share, the repeat-buyer
# median and the overall median to the reorder-rate frame.
for metrics in (share_gmv, median_gmv, overall_gmv):
    reorder = reorder.merge(metrics, how='left', on='category_cleaned')
reorder = reorder.sort_values('%', ascending=False)

# 1 when repeat buyers' median GMV is below the overall median, else 0
reorder['median_lower'] = reorder.apply(
    lambda row: 1 if row['median_gmv_repurchase'] < row['overall_median_gmv'] else 0,
    axis=1,
)
# relative difference (%) of the repeat-buyer median versus the overall median
median_gap = reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']
reorder['diff_median'] = median_gap / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

In [None]:
# correlation between the repeat-buyer GMV share and the median difference
reorder.loc[:, ['gmv_share_repurchase', 'diff_median']].corr()

In [None]:
# scatter plot with a fitted regression line for the same two metrics
sns.lmplot(data=reorder, y='diff_median', x='gmv_share_repurchase');

In [None]:
# # set palette
# palette_categories_analysis = list()
# for n in list(reorder['category_cleaned']):
#     if int(reorder[reorder['category_cleaned'] == n]['median_lower']) == 1:
#         palette_categories_analysis = palette_categories_analysis + ['r']
#     else:
#         palette_categories_analysis = palette_categories_analysis + ['g']


In [None]:
# one identical orange entry per category, so every bar gets the same colour
palette_categories_analysis = ['#EF4E23' for _ in list(reorder['category_cleaned'])]

In [None]:
# Four side-by-side panels per category, all listed by descending reorder rate:
# reorder rate | GMV share from repeat buyers | overall median GMV |
# repeat-buyer median GMV (with its relative difference to the overall median).
fonttitle = 45
annotate_size = 45
plt.figure(figsize=(35, 40))

# every panel uses the same category order
panel_order = list(reorder.sort_values('%', ascending=False)['category_cleaned'])
# one value per bar of the last panel, in the row order of `reorder`
median_diffs = list(reorder['diff_median'])


def _summary_panel(position, column, title, xmax, label_for, show_categories=False):
    """Draw one horizontal-bar panel of the summary figure and return its axes.

    position: matplotlib subplot code (e.g. 141)
    column: column of `reorder` plotted on the x axis
    title: panel title (left aligned)
    xmax: upper limit of the x axis
    label_for: callable (bar_index, bar_width) -> text written next to the bar
    show_categories: keep the category tick labels visible (first panel only)
    """
    grey = '#707780'
    plt.subplot(position)
    sns.set_style('white')
    ax = sns.barplot(y='category_cleaned', x=column, data=reorder,
                     palette=palette_categories_analysis, order=panel_order)

    for i, p in enumerate(ax.patches):
        ax.annotate(label_for(i, p.get_width()),
                    xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                    xytext=(5, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=annotate_size, color=grey)
    ax.set_title(title, color=grey, fontsize=fonttitle, loc='left', pad=40)

    plt.ylabel("")
    plt.xlim(0, xmax)
    ax.yaxis.label.set_color(grey)
    ax.spines['left'].set_color(grey)
    if show_categories:
        ax.tick_params(axis='y', colors=grey, labelsize=50, pad=30)
    else:
        ax.axes.get_yaxis().set_visible(False)
    # bars are annotated, so frame and x axis are removed
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.axes.get_xaxis().set_visible(False)
    return ax


splot = _summary_panel(
    141, '%', "Reorder Rate (%)\n" + " \n" + " ", 50,
    lambda i, width: " %.1f" % width + '%',
    show_categories=True,
)

splot = _summary_panel(
    142, 'gmv_share_repurchase',
    "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 50,
    lambda i, width: " %.1f" % width + '%',
)

splot = _summary_panel(
    143, 'overall_median_gmv',
    "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 600,
    lambda i, width: " %.0f" % width,
)

# `diff_median` is a relative difference in percent (as the title says),
# not percentage points, so it is labelled with '%'
splot = _summary_panel(
    144, 'median_gmv_repurchase',
    "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)", 600,
    lambda i, width: " %.0f" % width + ' ' + "(%.1f%%)" % median_diffs[i],
)

plt.subplots_adjust(wspace=0.05)
plt.show()

#### Customer Country

In [None]:
# Reorder rate per category for the ten buyer countries with most purchases
# (one small panel per country, SPARKLE category excluded).
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# categories ordered from most to fewest purchases
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)
# get top 10 countries (buyer country)
top10 = list(
    prov[['user_country', 'purchase_id']]
    .groupby('user_country')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['user_country'][0:10]
)

plt.figure(figsize=(20, 20))
for e, f in enumerate(top10):
    plt.subplot(2, 5, e + 1)
    aux0 = prov[prov['user_country'] == f]

    # Records are collected in plain lists and turned into a frame once:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # Categories without repeat buyers go last, as zero rows.
    with_repeat, without_repeat = [], []
    for d in category:
        # number of purchases each buyer made in this category and country
        purchases_per_buyer = (
            aux0.loc[aux0['category_cleaned'] == d]
            .groupby('user_buyer_id')['purchase_id']
            .count()
        )
        repeat_buyers = int((purchases_per_buyer > 1).sum())
        if repeat_buyers:
            with_repeat.append({
                'category_cleaned': d,
                '%': repeat_buyers / len(purchases_per_buyer) * 100,
            })
        else:
            without_repeat.append({'category_cleaned': d, '%': 0.0})
    reorder = pd.DataFrame(with_repeat + without_repeat, columns=['category_cleaned', '%'])

    # colours are assigned by rank of the reorder rate within the country;
    # this requires palette_topics to hold exactly one more entry than `category`
    reorder = reorder.sort_values('%', ascending=False)
    reorder['hex'] = palette_topics[:-1]
    # back to the common category order so every panel lists the same rows
    aux1 = pd.DataFrame(category, columns=['category_cleaned'])
    aux1 = aux1.merge(reorder, how='left', on='category_cleaned')

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux1,
                        palette=list(aux1['hex']), order=category)
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(),
                       xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center')
    plt.title(f"{f}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # category labels only on the first panel of each row of the 2x5 grid
    splot.axes.get_yaxis().set_visible(e in (0, 5))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producer Office

- For this analysis, repurchases can be related to any producer within the same office. 
- Premise: if a repurchase was related to a producer from a different office, this purchase will be discarded. 

In [None]:
# Share (%) of purchases per category for each producer office
# (one panel per office, SPARKLE category excluded).
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# categories ordered from most to fewest purchases
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)
# offices ordered from most to fewest purchases (taken from the unfiltered df3)
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

# the 2x3 grid holds at most six offices
plt.figure(figsize=(15, 20))
for e, f in enumerate(office):
    plt.subplot(2, 3, e + 1)
    aux0 = prov[prov['user_office_name'] == f]
    aux1 = aux0[['purchase_id', 'category_cleaned']].groupby('category_cleaned').count().reset_index()
    aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100

    # add zero rows for categories the office never sold; done with a single
    # pd.concat because DataFrame.append was removed in pandas 2.0
    present = set(aux1['category_cleaned'])
    missing = [g for g in category if g not in present]
    if missing:
        filler = pd.DataFrame({'category_cleaned': missing, 'purchase_id': 0, '%': 0.0})
        aux1 = pd.concat([aux1, filler], ignore_index=True)

    # colours are assigned by rank of the share within the office;
    # this requires palette_topics to hold exactly one more entry than `category`
    aux1 = aux1.sort_values('%', ascending=False)
    aux1['hex'] = palette_topics[:-1]
    # back to the common category order so every panel lists the same rows
    aux2 = pd.DataFrame(category, columns=['category_cleaned'])
    aux2 = aux2.merge(aux1, how='left', on='category_cleaned')

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux2,
                        palette=list(aux2['hex']), order=category)
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width() + '%',
                       xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', color='#707780')
    # the title shows the office name and its number of purchases
    plt.title(f"{f} | {aux1['purchase_id'].sum():,}", color='#707780')
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 50)
    # category labels only on the first panel of each row of the 2x3 grid
    splot.axes.get_yaxis().set_visible(e in (0, 3))
    # only the fifth panel keeps its x axis, carrying the shared label
    if e == 4:
        plt.xlabel("Num of Purchases (%)")
    else:
        splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.1)
plt.show()

In [None]:
# Reorder rate per category for each producer office
# (one panel per office, SPARKLE category excluded).
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# categories ordered from most to fewest purchases
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)
# offices ordered from most to fewest purchases (taken from the unfiltered df3)
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

# the 2x3 grid holds at most six offices
plt.figure(figsize=(15, 20))
for e, f in enumerate(office):
    plt.subplot(2, 3, e + 1)
    aux0 = prov[prov['user_office_name'] == f]

    # Records are collected in plain lists and turned into a frame once:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # Categories without repeat buyers go last, as zero rows.
    with_repeat, without_repeat = [], []
    for d in category:
        # number of purchases each buyer made in this category and office
        purchases_per_buyer = (
            aux0.loc[aux0['category_cleaned'] == d]
            .groupby('user_buyer_id')['purchase_id']
            .count()
        )
        repeat_buyers = int((purchases_per_buyer > 1).sum())
        if repeat_buyers:
            with_repeat.append({
                'category_cleaned': d,
                '%': repeat_buyers / len(purchases_per_buyer) * 100,
            })
        else:
            without_repeat.append({'category_cleaned': d, '%': 0.0})
    reorder = pd.DataFrame(with_repeat + without_repeat, columns=['category_cleaned', '%'])

    reorder = reorder.sort_values('%', ascending=False)
    # NOTE(review): this rank-based colour column is not used by the barplot
    # below, which passes palette_topics directly - confirm which is intended
    reorder['hex'] = palette_topics[:-1]
    # back to the common category order so every panel lists the same rows
    aux1 = pd.DataFrame(category, columns=['category_cleaned'])
    aux1 = aux1.merge(reorder, how='left', on='category_cleaned')

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux1,
                        palette=palette_topics, order=category)
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width() + '%',
                       xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', color='#707780')
    plt.title(f"{f}", color='#707780')
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 20)
    # category labels only on the first panel of each row of the 2x3 grid
    splot.axes.get_yaxis().set_visible(e in (0, 3))
    # only the fifth panel keeps its x axis
    if e != 4:
        splot.axes.get_xaxis().set_visible(False)

    # grey frame and tick labels
    splot.yaxis.label.set_color('#707780')
    for side in ('left', 'right', 'top', 'bottom'):
        splot.spines[side].set_color('#707780')
    splot.tick_params(axis='y', colors='#707780', labelsize=16, pad=30)

plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.1)
plt.show()

##### Producer Office - Brazil

In [None]:
# Per-category summary for a single producer office (SPARKLE excluded):
# reorder rate, GMV share from repeat buyers, overall median GMV and the
# median total GMV of repeat buyers, drawn as four side-by-side panels.
country = 'BRAZIL'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == country)]
# categories ordered from most to fewest purchases
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Records are collected in plain lists and turned into frames once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
reorder_rows, share_rows, median_rows, overall_rows = [], [], [], []

for d in category:
    rows = prov[prov['category_cleaned'] == d]

    ### global median
    # NOTE(review): this is the median over individual purchases, whereas the
    # "Overall" section uses the median of per-buyer totals (positive only) and
    # `median_gmv_repurchase` below is also per buyer - confirm the intended base
    overall_rows.append({
        'category_cleaned': d,
        'overall_median_gmv': rows['gmv_value_brl'].median(),
    })

    ### count purchases per user (rows without a buyer id are ignored)
    by_buyer = rows.groupby('user_buyer_id')
    purchases_per_buyer = by_buyer['purchase_id'].count()
    is_repeat_buyer = purchases_per_buyer > 1

    ### calculate gmv captured by users who repurchase
    # purchases without a buyer id map to NaN and count as 'single-purchase'
    row_flag = np.where(
        rows['user_buyer_id'].map(purchases_per_buyer) > 1,
        'repurchase',
        'single-purchase',
    )
    gmv_by_flag = rows['gmv_value_brl'].groupby(row_flag).sum()
    if 'repurchase' in gmv_by_flag.index:
        share_rows.append({
            'category_cleaned': d,
            'gmv_share_repurchase': gmv_by_flag['repurchase'] / gmv_by_flag.sum() * 100,
            'gmv_value_brl_repurchase': gmv_by_flag['repurchase'],
        })

    ### median total gmv of repeat buyers with a positive total
    spend_per_buyer = by_buyer['gmv_value_brl'].sum()
    repeat_spend = spend_per_buyer[is_repeat_buyer & (spend_per_buyer > 0)]
    if not repeat_spend.empty:
        median_rows.append({
            'category_cleaned': d,
            'median_gmv_repurchase': repeat_spend.median(),
        })

    ### reorder rate: share of distinct buyers with more than one purchase
    repeat_buyers = int(is_repeat_buyer.sum())
    if repeat_buyers:
        reorder_rows.append({
            'category_cleaned': d,
            '%': repeat_buyers / len(purchases_per_buyer) * 100,
        })

reorder = pd.DataFrame(reorder_rows, columns=['category_cleaned', '%'])
share_gmv = pd.DataFrame(
    share_rows,
    columns=['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase'],
)
median_gmv = pd.DataFrame(median_rows, columns=['category_cleaned', 'median_gmv_repurchase'])
overall_gmv = pd.DataFrame(overall_rows, columns=['category_cleaned', 'overall_median_gmv'])

# sort df
reorder = reorder.sort_values('%', ascending=False)
share_gmv = share_gmv.sort_values('gmv_share_repurchase', ascending=False)
median_gmv = median_gmv.sort_values('median_gmv_repurchase', ascending=False)

# set main dataset
for metrics in (share_gmv, median_gmv, overall_gmv):
    reorder = reorder.merge(metrics, how='left', on='category_cleaned')
reorder = reorder.sort_values('%', ascending=False)
# 1 when repeat buyers' median GMV is below the overall median, else 0
reorder['median_lower'] = reorder.apply(
    lambda x: 1 if x['median_gmv_repurchase'] < x['overall_median_gmv'] else 0, axis=1)
# relative difference (%) of the repeat-buyer median versus the overall median
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = abs(reorder['diff_median'])

# set palette: the same orange for every category
palette_categories_analysis = ['#EF4E23'] * len(reorder['category_cleaned'])

fonttitle = 45
annotate_size = 45
plt.figure(figsize=(35, 40))

# every panel uses the same category order
panel_order = list(reorder.sort_values('%', ascending=False)['category_cleaned'])
# one value per bar of the last panel, in the row order of `reorder`
median_diffs = list(reorder['diff_median'])


def _summary_panel(position, column, title, xmax, label_for, show_categories=False):
    """Draw one horizontal-bar panel of the summary figure and return its axes.

    position: matplotlib subplot code (e.g. 141)
    column: column of `reorder` plotted on the x axis
    title: panel title (left aligned)
    xmax: upper limit of the x axis
    label_for: callable (bar_index, bar_width) -> text written next to the bar
    show_categories: keep the category tick labels visible (first panel only)
    """
    grey = '#707780'
    plt.subplot(position)
    sns.set_style('white')
    ax = sns.barplot(y='category_cleaned', x=column, data=reorder,
                     palette=palette_categories_analysis, order=panel_order)

    for i, p in enumerate(ax.patches):
        ax.annotate(label_for(i, p.get_width()),
                    xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                    xytext=(5, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=annotate_size, color=grey)
    ax.set_title(title, color=grey, fontsize=fonttitle, loc='left', pad=40)

    plt.ylabel("")
    plt.xlim(0, xmax)
    ax.yaxis.label.set_color(grey)
    ax.spines['left'].set_color(grey)
    if show_categories:
        ax.tick_params(axis='y', colors=grey, labelsize=50, pad=30)
    else:
        ax.axes.get_yaxis().set_visible(False)
    # bars are annotated, so frame and x axis are removed
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.axes.get_xaxis().set_visible(False)
    return ax


splot = _summary_panel(
    141, '%', "Reorder Rate (%)\n" + " \n" + " ", 60,
    lambda i, width: " %.1f" % width + '%',
    show_categories=True,
)

splot = _summary_panel(
    142, 'gmv_share_repurchase',
    "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60,
    lambda i, width: " %.1f" % width + '%',
)

splot = _summary_panel(
    143, 'overall_median_gmv',
    "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 2000,
    lambda i, width: " %.0f" % width,
)

# `diff_median` is a relative difference in percent (as the title says),
# not percentage points, so it is labelled with '%'
splot = _summary_panel(
    144, 'median_gmv_repurchase',
    "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)", 2000,
    lambda i, width: " %.0f" % width + ' ' + "(%.1f%%)" % median_diffs[i],
)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer Office - Colombia

In [None]:
# Reorder analysis per product category for a single producer office.
# For every category (SPARKLE excluded) the cell derives:
#   '%'                    share of buyers with more than one purchase row in the category
#   gmv_share_repurchase   share of the category GMV coming from those buyers
#   median_gmv_repurchase  median of the per-buyer summed GMV among those buyers
#   overall_median_gmv     median GMV over all purchase rows of the category
# and draws the four metrics as side-by-side horizontal bar charts.
country = 'COLOMBIA'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == country)]

# categories ordered by number of purchases, largest first
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Per-category partial results are collected in lists and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0.
reorder_parts = []
share_parts = []
median_parts = []
overall_parts = []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### median GMV over every purchase row of the category
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    ### flag repurchase (more than one purchase in this category)
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(
        lambda x: 'repurchase' if x > 1 else 'single-purchase')

    ### calculate gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # .copy() so the column assignment below acts on an independent frame
    share = share[share['repurchase_temp'] == 'repurchase'].copy()
    share['category_cleaned'] = d
    share.columns = ['repurchase', 'gmv_value_brl_repurchase', 'gmv_share_repurchase', 'category_cleaned']
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_parts.append(share)

    ### median of the per-buyer total GMV (buyers with non-positive totals are dropped)
    med = aux1[['user_buyer_id', 'repurchase_temp',
                'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = med[med['repurchase_temp'] == 'repurchase'].copy()
    med.columns = ['repurchase', 'median_gmv_repurchase']
    med['category_cleaned'] = d
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_parts.append(med)

    # get vol of users per number of purchases
    rr = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(
        ['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)
    # get share of users who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2[rr2['repurchase_temp'] == 'repurchase'][['%']].copy()
    rr3['category_cleaned'] = d
    rr3 = rr3[['category_cleaned', '%']]
    reorder_parts.append(rr3)

# assemble and sort the per-metric frames
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset: one row per category, highest reorder rate first
reorder = (
    reorder
    .merge(share_gmv, how='left', on='category_cleaned')
    .merge(median_gmv, how='left', on='category_cleaned')
    .merge(overall_gmv, how='left', on='category_cleaned')
    .sort_values('%', ascending=False)
)
# 1 when repurchasing buyers have a lower median than the category overall
# (not used by the plots below; available for optional red/green colouring)
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
# NOTE(review): median_gmv_repurchase is a median of per-buyer totals while
# overall_median_gmv is a median over purchase rows, so this relative
# difference compares two different bases - confirm this is intended.
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette: every bar uses the same colour
palette_categories_analysis = ['#EF4E23'] * len(reorder)

# `reorder` is already sorted by '%', so its row order is the bar order; using
# it directly keeps the diff_median labels of the last panel aligned with the bars.
category_order = list(reorder['category_cleaned'])

grey = '#707780'
fonttitle = 45
annotate_size = 45


def _annotate_bars(ax, labels, fontsize, color):
    """Write one label just right of each horizontal bar of `ax`, in bar order."""
    for label, bar in zip(labels, ax.patches):
        ax.annotate(label,
                    xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                    xytext=(5, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=fontsize, color=color)


def _style_panel(ax, title, x_max, fontsize, color, show_categories=False):
    """Set the panel title and x range and hide the frame and x axis.

    The y axis (category names) stays visible only when `show_categories` is True.
    """
    ax.set_title(title, color=color, fontsize=fontsize, loc='left', pad=40)
    ax.set_ylabel("")
    ax.set_xlim(0, x_max)
    ax.yaxis.label.set_color(color)
    ax.spines['left'].set_color(color)
    if show_categories:
        ax.tick_params(axis='y', colors=color, labelsize=50, pad=30)
    else:
        ax.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().set_visible(False)


plt.figure(figsize=(35, 40))

# panel 1: reorder rate (the only panel that shows the category names)
plt.subplot(141)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='%', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.1f" % bar.get_width() + '%' for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Reorder Rate (%)\n" + " \n" + " ", 60, fonttitle, grey,
             show_categories=True)

# panel 2: share of GMV generated by buyers who repurchase
plt.subplot(142)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='gmv_share_repurchase', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.1f" % bar.get_width() + '%' for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60,
             fonttitle, grey)

# panel 3: overall median GMV of the category
# NOTE(review): the titles of panels 3 and 4 say "Avg." but the plotted values are medians.
plt.subplot(143)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='overall_median_gmv', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.0f" % bar.get_width() for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 2000, fonttitle, grey)

# panel 4: median GMV of repurchasing buyers, with the relative difference to panel 3
# NOTE(review): diff_median is a relative difference in percent, yet the label
# says "p.p" (percentage points) - confirm the intended unit.
plt.subplot(144)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='median_gmv_repurchase', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot,
               [" %.0f" % bar.get_width() + ' ' + "(%.1f p.p)" % diff
                for diff, bar in zip(reorder['diff_median'], splot.patches)],
               annotate_size, grey)
_style_panel(splot,
             "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)",
             2000, fonttitle, grey)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer Office - Mexico

In [None]:
# Reorder analysis per product category for a single producer office.
# For every category (SPARKLE excluded) the cell derives:
#   '%'                    share of buyers with more than one purchase row in the category
#   gmv_share_repurchase   share of the category GMV coming from those buyers
#   median_gmv_repurchase  median of the per-buyer summed GMV among those buyers
#   overall_median_gmv     median GMV over all purchase rows of the category
# and draws the four metrics as side-by-side horizontal bar charts.
country = 'MEXICO'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == country)]

# categories ordered by number of purchases, largest first
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Per-category partial results are collected in lists and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0.
reorder_parts = []
share_parts = []
median_parts = []
overall_parts = []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### median GMV over every purchase row of the category
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    ### flag repurchase (more than one purchase in this category)
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(
        lambda x: 'repurchase' if x > 1 else 'single-purchase')

    ### calculate gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # .copy() so the column assignment below acts on an independent frame
    share = share[share['repurchase_temp'] == 'repurchase'].copy()
    share['category_cleaned'] = d
    share.columns = ['repurchase', 'gmv_value_brl_repurchase', 'gmv_share_repurchase', 'category_cleaned']
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_parts.append(share)

    ### median of the per-buyer total GMV (buyers with non-positive totals are dropped)
    med = aux1[['user_buyer_id', 'repurchase_temp',
                'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = med[med['repurchase_temp'] == 'repurchase'].copy()
    med.columns = ['repurchase', 'median_gmv_repurchase']
    med['category_cleaned'] = d
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_parts.append(med)

    # get vol of users per number of purchases
    rr = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(
        ['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)
    # get share of users who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2[rr2['repurchase_temp'] == 'repurchase'][['%']].copy()
    rr3['category_cleaned'] = d
    rr3 = rr3[['category_cleaned', '%']]
    reorder_parts.append(rr3)

# assemble and sort the per-metric frames
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset: one row per category, highest reorder rate first
reorder = (
    reorder
    .merge(share_gmv, how='left', on='category_cleaned')
    .merge(median_gmv, how='left', on='category_cleaned')
    .merge(overall_gmv, how='left', on='category_cleaned')
    .sort_values('%', ascending=False)
)
# 1 when repurchasing buyers have a lower median than the category overall
# (not used by the plots below; available for optional red/green colouring)
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
# NOTE(review): median_gmv_repurchase is a median of per-buyer totals while
# overall_median_gmv is a median over purchase rows, so this relative
# difference compares two different bases - confirm this is intended.
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette: every bar uses the same colour
palette_categories_analysis = ['#EF4E23'] * len(reorder)

# `reorder` is already sorted by '%', so its row order is the bar order; using
# it directly keeps the diff_median labels of the last panel aligned with the bars.
category_order = list(reorder['category_cleaned'])

grey = '#707780'
fonttitle = 45
annotate_size = 45


def _annotate_bars(ax, labels, fontsize, color):
    """Write one label just right of each horizontal bar of `ax`, in bar order."""
    for label, bar in zip(labels, ax.patches):
        ax.annotate(label,
                    xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                    xytext=(5, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=fontsize, color=color)


def _style_panel(ax, title, x_max, fontsize, color, show_categories=False):
    """Set the panel title and x range and hide the frame and x axis.

    The y axis (category names) stays visible only when `show_categories` is True.
    """
    ax.set_title(title, color=color, fontsize=fontsize, loc='left', pad=40)
    ax.set_ylabel("")
    ax.set_xlim(0, x_max)
    ax.yaxis.label.set_color(color)
    ax.spines['left'].set_color(color)
    if show_categories:
        ax.tick_params(axis='y', colors=color, labelsize=50, pad=30)
    else:
        ax.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().set_visible(False)


plt.figure(figsize=(35, 40))

# panel 1: reorder rate (the only panel that shows the category names)
plt.subplot(141)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='%', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.1f" % bar.get_width() + '%' for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Reorder Rate (%)\n" + " \n" + " ", 60, fonttitle, grey,
             show_categories=True)

# panel 2: share of GMV generated by buyers who repurchase
plt.subplot(142)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='gmv_share_repurchase', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.1f" % bar.get_width() + '%' for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60,
             fonttitle, grey)

# panel 3: overall median GMV of the category
# NOTE(review): the titles of panels 3 and 4 say "Avg." but the plotted values are medians.
plt.subplot(143)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='overall_median_gmv', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.0f" % bar.get_width() for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 2000, fonttitle, grey)

# panel 4: median GMV of repurchasing buyers, with the relative difference to panel 3
# NOTE(review): diff_median is a relative difference in percent, yet the label
# says "p.p" (percentage points) - confirm the intended unit.
plt.subplot(144)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='median_gmv_repurchase', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot,
               [" %.0f" % bar.get_width() + ' ' + "(%.1f p.p)" % diff
                for diff, bar in zip(reorder['diff_median'], splot.patches)],
               annotate_size, grey)
_style_panel(splot,
             "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)",
             2000, fonttitle, grey)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer Office - Spain

In [None]:
# Reorder analysis per product category for a single producer office.
# For every category (SPARKLE excluded) the cell derives:
#   '%'                    share of buyers with more than one purchase row in the category
#   gmv_share_repurchase   share of the category GMV coming from those buyers
#   median_gmv_repurchase  median of the per-buyer summed GMV among those buyers
#   overall_median_gmv     median GMV over all purchase rows of the category
# and draws the four metrics as side-by-side horizontal bar charts.
country = 'SPAIN'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == country)]

# categories ordered by number of purchases, largest first
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned')
    .count()
    .reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Per-category partial results are collected in lists and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0.
reorder_parts = []
share_parts = []
median_parts = []
overall_parts = []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### median GMV over every purchase row of the category
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    ### flag repurchase (more than one purchase in this category)
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(
        lambda x: 'repurchase' if x > 1 else 'single-purchase')

    ### calculate gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # .copy() so the column assignment below acts on an independent frame
    share = share[share['repurchase_temp'] == 'repurchase'].copy()
    share['category_cleaned'] = d
    share.columns = ['repurchase', 'gmv_value_brl_repurchase', 'gmv_share_repurchase', 'category_cleaned']
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_parts.append(share)

    ### median of the per-buyer total GMV (buyers with non-positive totals are dropped)
    med = aux1[['user_buyer_id', 'repurchase_temp',
                'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = med[med['repurchase_temp'] == 'repurchase'].copy()
    med.columns = ['repurchase', 'median_gmv_repurchase']
    med['category_cleaned'] = d
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_parts.append(med)

    # get vol of users per number of purchases
    rr = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(
        ['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)
    # get share of users who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2[rr2['repurchase_temp'] == 'repurchase'][['%']].copy()
    rr3['category_cleaned'] = d
    rr3 = rr3[['category_cleaned', '%']]
    reorder_parts.append(rr3)

# assemble and sort the per-metric frames
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset: one row per category, highest reorder rate first
reorder = (
    reorder
    .merge(share_gmv, how='left', on='category_cleaned')
    .merge(median_gmv, how='left', on='category_cleaned')
    .merge(overall_gmv, how='left', on='category_cleaned')
    .sort_values('%', ascending=False)
)
# 1 when repurchasing buyers have a lower median than the category overall
# (not used by the plots below; available for optional red/green colouring)
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
# NOTE(review): median_gmv_repurchase is a median of per-buyer totals while
# overall_median_gmv is a median over purchase rows, so this relative
# difference compares two different bases - confirm this is intended.
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette: every bar uses the same colour
palette_categories_analysis = ['#EF4E23'] * len(reorder)

# `reorder` is already sorted by '%', so its row order is the bar order; using
# it directly keeps the diff_median labels of the last panel aligned with the bars.
category_order = list(reorder['category_cleaned'])

grey = '#707780'
fonttitle = 45
annotate_size = 45


def _annotate_bars(ax, labels, fontsize, color):
    """Write one label just right of each horizontal bar of `ax`, in bar order."""
    for label, bar in zip(labels, ax.patches):
        ax.annotate(label,
                    xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                    xytext=(5, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=fontsize, color=color)


def _style_panel(ax, title, x_max, fontsize, color, show_categories=False):
    """Set the panel title and x range and hide the frame and x axis.

    The y axis (category names) stays visible only when `show_categories` is True.
    """
    ax.set_title(title, color=color, fontsize=fontsize, loc='left', pad=40)
    ax.set_ylabel("")
    ax.set_xlim(0, x_max)
    ax.yaxis.label.set_color(color)
    ax.spines['left'].set_color(color)
    if show_categories:
        ax.tick_params(axis='y', colors=color, labelsize=50, pad=30)
    else:
        ax.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().set_visible(False)


plt.figure(figsize=(35, 40))

# panel 1: reorder rate (the only panel that shows the category names)
plt.subplot(141)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='%', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.1f" % bar.get_width() + '%' for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Reorder Rate (%)\n" + " \n" + " ", 60, fonttitle, grey,
             show_categories=True)

# panel 2: share of GMV generated by buyers who repurchase
plt.subplot(142)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='gmv_share_repurchase', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.1f" % bar.get_width() + '%' for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60,
             fonttitle, grey)

# panel 3: overall median GMV of the category
# NOTE(review): the titles of panels 3 and 4 say "Avg." but the plotted values are medians.
plt.subplot(143)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='overall_median_gmv', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot, [" %.0f" % bar.get_width() for bar in splot.patches],
               annotate_size, grey)
_style_panel(splot, "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 2000, fonttitle, grey)

# panel 4: median GMV of repurchasing buyers, with the relative difference to panel 3
# NOTE(review): diff_median is a relative difference in percent, yet the label
# says "p.p" (percentage points) - confirm the intended unit.
plt.subplot(144)
sns.set_style('white')
splot = sns.barplot(y='category_cleaned', x='median_gmv_repurchase', data=reorder,
                    palette=palette_categories_analysis, order=category_order)
_annotate_bars(splot,
               [" %.0f" % bar.get_width() + ' ' + "(%.1f p.p)" % diff
                for diff, bar in zip(reorder['diff_median'], splot.patches)],
               annotate_size, grey)
_style_panel(splot,
             "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)",
             2000, fonttitle, grey)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer Office - USA

In [None]:
country = 'USA'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == country)]
# calculate repurchase
category = list(prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index().sort_values('purchase_id', ascending=False)['category_cleaned'])
reorder = pd.DataFrame()
share_gmv = pd.DataFrame()
median_gmv = pd.DataFrame()
overall_gmv = pd.DataFrame()

for n, d in enumerate(category):
    ### count purchases per user
    aux1 = prov[(prov['category_cleaned'] == d)]
    ### global median
    global_med = pd.DataFrame({'category_cleaned' : [d], 'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_gmv = pd.concat([overall_gmv, global_med])
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how = 'left', on = 'user_buyer_id')
    ### flag repurchase
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(lambda x : 'repurchase' if x > 1 else 'single-purchase')    
    
    ### calculate gmv captured by users who repurchase 
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    share = share[share['repurchase_temp'] == 'repurchase']
    share['category_cleaned'] = d
    share.columns = ['repurchase', 'gmv_value_brl_repurchase', 'gmv_share_repurchase', 'category_cleaned']
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_gmv = share_gmv.append(share)

    med = aux1[['user_buyer_id', 'repurchase_temp', 
                'gmv_value_brl']].groupby(['user_buyer_id','repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = med[med['repurchase_temp'] == 'repurchase']
    med.columns = ['repurchase', 'median_gmv_repurchase']
    med['category_cleaned'] = d
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_gmv = median_gmv.append(med)
    
    # get vol of users with purchase
    rr = aux1[['no_repurchase_temp','repurchase_temp',  'user_buyer_id']].groupby(['repurchase_temp','no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)
    # get users who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2[rr2['repurchase_temp'] == 'repurchase'][['%']]
    rr3['category_cleaned'] = d
    rr3 = rr3[['category_cleaned', '%']]
    reorder = reorder.append(rr3)
    
# sort df
reorder = reorder.sort_values('%', ascending=False)
share_gmv = share_gmv.sort_values('gmv_share_repurchase', ascending=False)
median_gmv = median_gmv.sort_values('median_gmv_repurchase', ascending=False)

# set main dataset
reorder = reorder.merge(share_gmv, 
                        how = 'left', 
                        on = 'category_cleaned').merge(median_gmv, 
                                                    how = 'left', 
                                                    on = 'category_cleaned').merge(overall_gmv, 
                                                                        how = 'left', 
                                                                        on = 'category_cleaned').sort_values('%', ascending=False)
reorder['median_lower'] = reorder.apply(lambda x: 1 if x['median_gmv_repurchase'] < x['overall_median_gmv'] else 0, axis = 1)
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = abs(reorder['diff_median'])
# reorder[['gmv_share_repurchase', 'diff_median']].corr()
# sns.lmplot(x ='gmv_share_repurchase', y = 'diff_median', data = reorder);

# # set palette
# palette_categories_analysis = list()
# for n in list(reorder['category_cleaned']):
#     if int(reorder[reorder['category_cleaned'] == n]['median_lower']) == 1:
#         palette_categories_analysis = palette_categories_analysis + ['r']
#     else:
#         palette_categories_analysis = palette_categories_analysis + ['g']

# set palette
palette_categories_analysis = list()
for n in list(reorder['category_cleaned']):
        palette_categories_analysis = palette_categories_analysis + ['#EF4E23']
        
fonttitle = 45
annotate_size = 45
plt.figure(figsize = (35,40))
plt.subplot(141)
# plot graph with annotations

# '#707780' (grey)

sns.set_style('white')
splot = sns.barplot(y = 'category_cleaned', x = '%', data = reorder, palette = palette_categories_analysis, order = reorder.sort_values('%', ascending=False)['category_cleaned']);

for p in splot.patches:
    splot.annotate(" %.1f" % p.get_width() + '%', xy=(p.get_width(), p.get_y()+p.get_height()/2),
                   xytext=(5,0), textcoords='offset points', 
                   ha='left', va='center', fontsize = annotate_size, color = '#707780')
splot.set_title(f"Reorder Rate (%)\n" + " \n" + " ", color =  '#707780' , fontsize = fonttitle, loc = 'left', pad=40)

# plt.yticks(fontsize =20)
plt.ylabel("")
plt.xlim(0, 60)
# splot.tick_params(labelsize=15)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
splot.tick_params(axis='y', colors='#707780',labelsize=50, pad = 30)
#     splot.axes.get_yaxis().set_visible(False)
splot.spines['right'].set_visible(False)
splot.spines['top'].set_visible(False)
splot.spines['left'].set_visible(False)
splot.spines['bottom'].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
# plt.xlabel(f"{aux3['user_buyer_id'].sum():,}", color = '#707780', fontsize = 30)

plt.subplot(142)
sns.set_style('white')
splot = sns.barplot(y = 'category_cleaned', x = 'gmv_share_repurchase', data = reorder, 
                    palette = palette_categories_analysis, 
                    order = list(reorder.sort_values('%', ascending=False)['category_cleaned']));

for p in splot.patches:
    splot.annotate(" %.1f" % p.get_width() + '%', xy=(p.get_width(), p.get_y()+p.get_height()/2),
                   xytext=(5,0), textcoords='offset points', 
                   ha='left', va='center', fontsize = annotate_size, color = '#707780')
splot.set_title(f"Generated GMV (%)\n" + "from buyers\n" + "who repurchase " , color =  '#707780' , fontsize = fonttitle, loc = 'left', pad=40)

# plt.yticks(fontsize =20)
plt.ylabel("")
plt.xlim(0, 60)
# splot.tick_params(labelsize=15)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
# splot.tick_params(axis='y', colors='#707780',labelsize=40, pad = 30)
splot.axes.get_yaxis().set_visible(False)
splot.spines['right'].set_visible(False)
splot.spines['top'].set_visible(False)
splot.spines['left'].set_visible(False)
splot.spines['bottom'].set_visible(False)
splot.axes.get_xaxis().set_visible(False)


plt.subplot(143)
sns.set_style('white')
splot = sns.barplot(y = 'category_cleaned', x = 'overall_median_gmv', data = reorder, 
                    palette = palette_categories_analysis, 
                    order = list(reorder.sort_values('%', ascending=False)['category_cleaned']));

for p in splot.patches:
    splot.annotate(" %.0f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                   xytext=(5,0), textcoords='offset points', 
                   ha='left', va='center', fontsize = annotate_size, color = '#707780')
splot.set_title(f"Global Avg. Ticket\n" + "Price (BRL)\n" + " " , color =  '#707780' , fontsize = fonttitle, loc = 'left', pad=40)

# plt.yticks(fontsize =20)
plt.ylabel("")
plt.xlim(0, 2000)
# splot.tick_params(labelsize=15)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
# splot.tick_params(axis='y', colors='#707780',labelsize=40, pad = 30)
splot.axes.get_yaxis().set_visible(False)
splot.spines['right'].set_visible(False)
splot.spines['top'].set_visible(False)
splot.spines['left'].set_visible(False)
splot.spines['bottom'].set_visible(False)
splot.axes.get_xaxis().set_visible(False)

plt.subplot(144)
sns.set_style('white')
splot = sns.barplot(y = 'category_cleaned', x = 'median_gmv_repurchase', data = reorder, 
                    palette = palette_categories_analysis, 
                    order = list(reorder.sort_values('%', ascending=False)['category_cleaned']));

for n, p in zip(reorder['diff_median'], splot.patches):
    splot.annotate(" %.0f" % p.get_width() + ' ' + "(%.1f p.p)" % n, xy=(p.get_width(), 
                                                                      p.get_y()+p.get_height()/2),xytext=(5,0), 
    textcoords='offset points', ha='left', va='center', fontsize = annotate_size, color = '#707780')

splot.set_title(f"Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)" , color =  '#707780' , fontsize = fonttitle, loc = 'left', pad=40)

# plt.yticks(fontsize =20)
plt.ylabel("")
plt.xlim(0, 2000)
# splot.tick_params(labelsize=15)
splot.yaxis.label.set_color('#707780')
splot.spines['left'].set_color('#707780')
# splot.tick_params(axis='y', colors='#707780',labelsize=40, pad = 30)
splot.axes.get_yaxis().set_visible(False)
splot.spines['right'].set_visible(False)
splot.spines['top'].set_visible(False)
splot.spines['left'].set_visible(False)
splot.spines['bottom'].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
# plt.subplots_adjust(hspace=0.2)
# plt.subplots_adjust(hspace=0.2)
plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer Office - AMSTERDAM

In [None]:
# Repurchase analysis per category, restricted to one producer office.
# For every category (excluding 'SPARKLE') this cell computes:
#   - '%'                     : share of unique buyers with more than one purchase
#   - 'gmv_share_repurchase'  : share of the category GMV generated by those buyers
#   - 'overall_median_gmv'    : median GMV over all purchase rows of the category
#   - 'median_gmv_repurchase' : median of the per-buyer GMV total among repeat buyers
# and draws the four metrics as side-by-side horizontal bar charts.
country = 'AMSTERDAM'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == country)]

# categories ordered by number of purchases (most purchased first)
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Per-category partial results are collected in lists and concatenated once.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used throughout.
reorder_parts, share_parts, median_parts, overall_parts = [], [], [], []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### global median (over every purchase row of the category)
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### flag repurchase (a missing count compares False and is flagged single-purchase)
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] > 1, 'repurchase', 'single-purchase')

    ### gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    share = (share[share['repurchase_temp'] == 'repurchase']
             .rename(columns={'gmv_value_brl': 'gmv_value_brl_repurchase'})
             .assign(category_cleaned=d))
    share_parts.append(share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']])

    ### median of the per-buyer GMV total (buyers with non-positive totals are ignored)
    med = aux1[['user_buyer_id', 'repurchase_temp', 'gmv_value_brl']] \
        .groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = (med[med['repurchase_temp'] == 'repurchase']
           .rename(columns={'gmv_value_brl': 'median_gmv_repurchase'})
           .assign(category_cleaned=d))
    median_parts.append(med[['category_cleaned', 'median_gmv_repurchase']])

    ### share of unique buyers who repurchase
    # Every buyer carries a single purchase count, so counting unique buyers per
    # flag equals the original "nunique per (flag, count) then sum per flag".
    rr2 = aux1.groupby('repurchase_temp')['user_buyer_id'].nunique().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2.loc[rr2['repurchase_temp'] == 'repurchase', ['%']].assign(category_cleaned=d)
    reorder_parts.append(rr3[['category_cleaned', '%']])

# sort df
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset (categories without any repeat buyer are absent from `reorder`)
reorder = (reorder
           .merge(share_gmv, how='left', on='category_cleaned')
           .merge(median_gmv, how='left', on='category_cleaned')
           .merge(overall_gmv, how='left', on='category_cleaned')
           .sort_values('%', ascending=False))
# comparisons against NaN evaluate to False, i.e. 0, exactly like the row-wise version
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette (one colour for every bar)
palette_categories_analysis = ['#EF4E23'] * len(reorder)

fonttitle = 45
annotate_size = 45
grey = '#707780'

# Bars follow the row order of `reorder` (already sorted by '%', descending).
# Using one shared order keeps the zip with `diff_median` in the last panel
# aligned with the bars it annotates.
bar_order = list(reorder['category_cleaned'])

# (x column, title, x-axis upper limit, bar label format, append diff_median to label?)
# NOTE(review): the titles say "Avg." while the plotted columns are medians, and
# 'median_gmv_repurchase' is a per-buyer total whereas 'overall_median_gmv' is
# per purchase row - confirm this comparison is intended.
# NOTE(review): 'diff_median' is a relative difference in %, yet the label unit
# reads "p.p"; strings are left untouched here.
panels = [
    ('%', "Reorder Rate (%)\n" + " \n" + " ", 60, " %.1f%%", False),
    ('gmv_share_repurchase', "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60, " %.1f%%", False),
    ('overall_median_gmv', "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 2000, " %.0f", False),
    ('median_gmv_repurchase',
     "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)",
     2000, " %.0f", True),
]

plt.figure(figsize=(35, 40))
for position, (column, title, x_max, label_fmt, with_diff) in enumerate(panels, start=1):
    plt.subplot(1, 4, position)
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x=column, data=reorder,
                        palette=palette_categories_analysis, order=bar_order)

    # value label at the tip of every bar
    diffs = list(reorder['diff_median']) if with_diff else [None] * len(splot.patches)
    for p, diff in zip(splot.patches, diffs):
        label = label_fmt % p.get_width()
        if with_diff:
            label += ' ' + "(%.1f p.p)" % diff
        splot.annotate(label, xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', fontsize=annotate_size, color=grey)

    splot.set_title(title, color=grey, fontsize=fonttitle, loc='left', pad=40)
    splot.set_ylabel("")
    splot.set_xlim(0, x_max)
    splot.yaxis.label.set_color(grey)
    splot.spines['left'].set_color(grey)

    if position == 1:
        # only the first panel shows the category names
        splot.tick_params(axis='y', colors=grey, labelsize=50, pad=30)
    else:
        splot.axes.get_yaxis().set_visible(False)

    # frameless panels without an x axis: the annotations carry the values
    for side in ('right', 'top', 'left', 'bottom'):
        splot.spines[side].set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.05)
plt.show()

#### Producer Segment

- For this analysis, a repurchase may involve any producer within the same segment.
- Premise: a repurchase made from a producer in a different segment is discarded.

In [None]:
# Share of purchases per category, one panel per producer segment.
# Bar colours come from `palette_topics` (defined elsewhere in the notebook) and
# are assigned by the rank of the category inside each segment.
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# get list (categories / offices ordered by number of purchases, descending)
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

plt.figure(figsize=(20, 15))
for e, f in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1, 4, e + 1)
    aux0 = prov[prov['segment'] == f]
    aux1 = aux0[['purchase_id', 'category_cleaned']].groupby('category_cleaned').count().reset_index()
    aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100

    # Categories without purchases in this segment get an explicit zero row so
    # every panel has the same bars. pd.concat replaces the row-by-row
    # DataFrame.append, which was removed in pandas 2.0.
    present = set(aux1['category_cleaned'])
    missing = [g for g in category if g not in present]
    if missing:
        zero_rows = pd.DataFrame({'category_cleaned': missing, 'purchase_id': 0, '%': 0.0})
        aux1 = pd.concat([aux1, zero_rows], ignore_index=True)

    # colour by rank within the segment
    # presumably len(palette_topics) - 1 equals the number of categories - TODO confirm
    aux1 = aux1.sort_values('%', ascending=False)
    aux1['hex'] = palette_topics[:-1]

    # back to the global category order used on the y axis
    aux2 = pd.DataFrame(category, columns=['category_cleaned'])
    aux2 = aux2.merge(aux1, how='left', on='category_cleaned')

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux2, palette=list(aux2['hex']), order=category)
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width() + '%', xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center', color='#707780')

    # the title carries the segment name and its total number of purchases
    plt.title(f"{f} | {aux1['purchase_id'].sum():,}", color='#707780')
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 60)

    # category names only on the first panel; x axis only on the second one
    splot.axes.get_yaxis().set_visible(e == 0)
    if e == 1:
        plt.xlabel("Num of Purchases (%)")
    else:
        splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(hspace=0.1, wspace=0.3)
plt.show()



In [None]:
# Reorder rate (share of unique buyers with more than one purchase) per
# category, one panel per producer segment. Bar colours come from
# `palette_topics` (defined elsewhere in the notebook) and are assigned by the
# rank of the category inside each segment.
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# get list (categories ordered by number of purchases, descending)
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

plt.figure(figsize=(15, 15))
for e, f in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1, 4, e + 1)
    aux0 = prov[prov['segment'] == f]

    # One single-row (or empty) frame per category, concatenated after the loop.
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
    rate_parts = []
    for d in category:
        ### count purchases per user
        aux1 = aux0[aux0['category_cleaned'] == d]
        aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
        aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
        aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
        ### flag repurchase (a missing count compares False and is flagged single-purchase)
        aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] > 1, 'repurchase', 'single-purchase')

        # Unique buyers per flag. Every buyer carries a single purchase count, so
        # this equals the original "nunique per (flag, count) then sum per flag".
        aux3 = aux1.groupby('repurchase_temp')['user_buyer_id'].nunique().reset_index()
        aux3['%'] = aux3['user_buyer_id'] / aux3['user_buyer_id'].sum() * 100
        aux4 = aux3.loc[aux3['repurchase_temp'] == 'repurchase', ['%']].assign(category_cleaned=d)
        rate_parts.append(aux4[['category_cleaned', '%']])
    reorder = pd.concat(rate_parts)

    # categories without repeat buyers in this segment get an explicit 0 % row
    present = set(reorder['category_cleaned'])
    missing = [g for g in category if g not in present]
    if missing:
        zero_rows = pd.DataFrame({'category_cleaned': missing, '%': 0.0})
        reorder = pd.concat([reorder, zero_rows], ignore_index=True)

    # colour by rank within the segment
    # presumably len(palette_topics) - 1 equals the number of categories - TODO confirm
    reorder = reorder.sort_values('%', ascending=False)
    reorder['hex'] = palette_topics[:-1]

    # back to the global category order used on the y axis
    aux1 = pd.DataFrame(category, columns=['category_cleaned'])
    aux1 = aux1.merge(reorder, how='left', on='category_cleaned')

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x='%', data=aux1, palette=list(aux1['hex']), order=category)
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width() + '%', xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center', color='#707780')
    plt.title(f"{f}", color='#707780')
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 25)

    # category names only on the first panel; x axis only on the second one
    splot.axes.get_yaxis().set_visible(e == 0)
    if e != 1:
        splot.axes.get_xaxis().set_visible(False)

    # grey frame and tick labels
    splot.yaxis.label.set_color('#707780')
    for side in ('left', 'right', 'top', 'bottom'):
        splot.spines[side].set_color('#707780')
    splot.tick_params(axis='y', colors='#707780', labelsize=16, pad=30)

plt.subplots_adjust(wspace=0.1)
plt.show()

##### Producer segment - SEED

In [None]:
# Repurchase analysis per category, restricted to one producer segment.
# For every category (excluding 'SPARKLE') this cell computes:
#   - '%'                     : share of unique buyers with more than one purchase
#   - 'gmv_share_repurchase'  : share of the category GMV generated by those buyers
#   - 'overall_median_gmv'    : median GMV over all purchase rows of the category
#   - 'median_gmv_repurchase' : median of the per-buyer GMV total among repeat buyers
# and draws the four metrics as side-by-side horizontal bar charts.
segment = 'SEED'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)]

# categories ordered by number of purchases (most purchased first)
category = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned']
)

# Per-category partial results are collected in lists and concatenated once.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used throughout.
reorder_parts, share_parts, median_parts, overall_parts = [], [], [], []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### global median (over every purchase row of the category)
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### flag repurchase (a missing count compares False and is flagged single-purchase)
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] > 1, 'repurchase', 'single-purchase')

    ### gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    share = (share[share['repurchase_temp'] == 'repurchase']
             .rename(columns={'gmv_value_brl': 'gmv_value_brl_repurchase'})
             .assign(category_cleaned=d))
    share_parts.append(share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']])

    ### median of the per-buyer GMV total (buyers with non-positive totals are ignored)
    med = aux1[['user_buyer_id', 'repurchase_temp', 'gmv_value_brl']] \
        .groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = (med[med['repurchase_temp'] == 'repurchase']
           .rename(columns={'gmv_value_brl': 'median_gmv_repurchase'})
           .assign(category_cleaned=d))
    median_parts.append(med[['category_cleaned', 'median_gmv_repurchase']])

    ### share of unique buyers who repurchase
    # Every buyer carries a single purchase count, so counting unique buyers per
    # flag equals the original "nunique per (flag, count) then sum per flag".
    rr2 = aux1.groupby('repurchase_temp')['user_buyer_id'].nunique().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2.loc[rr2['repurchase_temp'] == 'repurchase', ['%']].assign(category_cleaned=d)
    reorder_parts.append(rr3[['category_cleaned', '%']])

# sort df
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset (categories without any repeat buyer are absent from `reorder`)
reorder = (reorder
           .merge(share_gmv, how='left', on='category_cleaned')
           .merge(median_gmv, how='left', on='category_cleaned')
           .merge(overall_gmv, how='left', on='category_cleaned')
           .sort_values('%', ascending=False))
# comparisons against NaN evaluate to False, i.e. 0, exactly like the row-wise version
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette (one colour for every bar)
palette_categories_analysis = ['#EF4E23'] * len(reorder)

fonttitle = 45
annotate_size = 45
grey = '#707780'

# Bars follow the row order of `reorder` (already sorted by '%', descending).
# Using one shared order keeps the zip with `diff_median` in the last panel
# aligned with the bars it annotates.
bar_order = list(reorder['category_cleaned'])

# (x column, title, x-axis upper limit, bar label format, append diff_median to label?)
# NOTE(review): the titles say "Avg." while the plotted columns are medians, and
# 'median_gmv_repurchase' is a per-buyer total whereas 'overall_median_gmv' is
# per purchase row - confirm this comparison is intended.
# NOTE(review): 'diff_median' is a relative difference in %, yet the label unit
# reads "p.p"; strings are left untouched here.
panels = [
    ('%', "Reorder Rate (%)\n" + " \n" + " ", 60, " %.1f%%", False),
    ('gmv_share_repurchase', "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60, " %.1f%%", False),
    ('overall_median_gmv', "Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 2000, " %.0f", False),
    ('median_gmv_repurchase',
     "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)",
     2000, " %.0f", True),
]

plt.figure(figsize=(35, 40))
for position, (column, title, x_max, label_fmt, with_diff) in enumerate(panels, start=1):
    plt.subplot(1, 4, position)
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x=column, data=reorder,
                        palette=palette_categories_analysis, order=bar_order)

    # value label at the tip of every bar
    diffs = list(reorder['diff_median']) if with_diff else [None] * len(splot.patches)
    for p, diff in zip(splot.patches, diffs):
        label = label_fmt % p.get_width()
        if with_diff:
            label += ' ' + "(%.1f p.p)" % diff
        splot.annotate(label, xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', fontsize=annotate_size, color=grey)

    splot.set_title(title, color=grey, fontsize=fonttitle, loc='left', pad=40)
    splot.set_ylabel("")
    splot.set_xlim(0, x_max)
    splot.yaxis.label.set_color(grey)
    splot.spines['left'].set_color(grey)

    if position == 1:
        # only the first panel shows the category names
        splot.tick_params(axis='y', colors=grey, labelsize=50, pad=30)
    else:
        splot.axes.get_yaxis().set_visible(False)

    # frameless panels without an x axis: the annotations carry the values
    for side in ('right', 'top', 'left', 'bottom'):
        splot.spines[side].set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer segment - SMALL

In [None]:
# Repurchase analysis per category for one producer segment.
# For every category (SPARKLE excluded) this cell derives
#   '%'                     - share of buyers flagged 'repurchase' (more than one purchase row)
#   'gmv_share_repurchase'  - share of the category GMV generated by those buyers
#   'overall_median_gmv'    - median gmv_value_brl over all purchase rows of the category
#   'median_gmv_repurchase' - median of the per-buyer GMV total among 'repurchase' buyers
# and draws the four metrics as side-by-side horizontal bar panels.
segment = 'SMALL'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)]

# categories ordered by number of purchases (descending)
category = list(prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index().sort_values('purchase_id', ascending=False)['category_cleaned'])

# Per-category frames are collected in lists and concatenated once after the loop.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat works on every version.
reorder_parts = []
share_parts = []
median_parts = []
overall_parts = []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### global median (over purchase rows, all buyers)
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### flag repurchase (vectorised; a missing count compares False -> 'single-purchase')
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] > 1, 'repurchase', 'single-purchase')

    ### gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # rename/assign return new frames, so nothing is written into a filtered slice
    share = (share.loc[share['repurchase_temp'] == 'repurchase']
                  .rename(columns={'gmv_value_brl': 'gmv_value_brl_repurchase'})
                  .assign(category_cleaned=d))
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_parts.append(share)

    ### median of the per-buyer GMV total, buyers with a non-positive total excluded
    med = aux1[['user_buyer_id', 'repurchase_temp',
                'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = (med.loc[med['repurchase_temp'] == 'repurchase']
              .rename(columns={'gmv_value_brl': 'median_gmv_repurchase'})
              .assign(category_cleaned=d))
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_parts.append(med)

    ### distinct buyers per (flag, purchase count); the '%' column and the sort are not used further in this cell
    rr = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)

    ### share of buyers who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2.loc[rr2['repurchase_temp'] == 'repurchase', ['%']].assign(category_cleaned=d)
    rr3 = rr3[['category_cleaned', '%']]
    reorder_parts.append(rr3)

# sort df
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset: one row per category, highest reorder rate first
reorder = (reorder
           .merge(share_gmv, how='left', on='category_cleaned')
           .merge(median_gmv, how='left', on='category_cleaned')
           .merge(overall_gmv, how='left', on='category_cleaned')
           .sort_values('%', ascending=False))
# comparisons against NaN are False, so a missing median yields 0 here
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
# NOTE(review): 'median_gmv_repurchase' is a per-buyer total while 'overall_median_gmv' is a
# per-purchase-row median - confirm that comparing the two is intended.
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette: one identical colour per plotted category
palette_categories_analysis = ['#EF4E23'] * len(reorder)

fonttitle = 45
annotate_size = 45
_grey = '#707780'

# Single source of truth for the bar order. Panel 4 pairs 'diff_median' with the bar patches
# by position, so every panel must be drawn in exactly the row order of `reorder`.
category_order = list(reorder['category_cleaned'])

# (x column, panel title, x-axis upper limit, bar-label format, label also shows diff_median)
# NOTE(review): the titles say "Avg." although the plotted values are medians, and the last
# label prints "p.p" although diff_median is a relative difference in percent - confirm wording.
panels = [
    ('%', "Reorder Rate (%)\n \n ", 60, " %.1f%%", False),
    ('gmv_share_repurchase', "Generated GMV (%)\nfrom buyers\nwho repurchase ", 60, " %.1f%%", False),
    ('overall_median_gmv', "Global Avg. Ticket\nPrice (BRL)\n ", 2000, " %.0f", False),
    ('median_gmv_repurchase',
     "Avg. Ticket Price (BRL)\nfrom buyers who repurchase\n(% diff from global ticket price)",
     2000, " %.0f (%.1f p.p)", True),
]

plt.figure(figsize=(35, 40))
for idx, (x_col, title, x_max, label_fmt, with_diff) in enumerate(panels, start=1):
    plt.subplot(1, 4, idx)
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x=x_col, data=reorder,
                        palette=palette_categories_analysis, order=category_order)

    # text written right next to the end of every bar
    if with_diff:
        labelled = [(p, label_fmt % (p.get_width(), diff))
                    for diff, p in zip(reorder['diff_median'], splot.patches)]
    else:
        labelled = [(p, label_fmt % p.get_width()) for p in splot.patches]
    for p, label in labelled:
        splot.annotate(label, xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', fontsize=annotate_size, color=_grey)

    splot.set_title(title, color=_grey, fontsize=fonttitle, loc='left', pad=40)
    plt.ylabel("")
    plt.xlim(0, x_max)
    splot.yaxis.label.set_color(_grey)
    splot.spines['left'].set_color(_grey)
    if idx == 1:
        # only the first panel carries the category names
        splot.tick_params(axis='y', colors=_grey, labelsize=50, pad=30)
    else:
        splot.axes.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        splot.spines[side].set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer segment - MEDIUM

In [None]:
# Repurchase analysis per category for one producer segment.
# For every category (SPARKLE excluded) this cell derives
#   '%'                     - share of buyers flagged 'repurchase' (more than one purchase row)
#   'gmv_share_repurchase'  - share of the category GMV generated by those buyers
#   'overall_median_gmv'    - median gmv_value_brl over all purchase rows of the category
#   'median_gmv_repurchase' - median of the per-buyer GMV total among 'repurchase' buyers
# and draws the four metrics as side-by-side horizontal bar panels.
segment = 'MEDIUM'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)]

# categories ordered by number of purchases (descending)
category = list(prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index().sort_values('purchase_id', ascending=False)['category_cleaned'])

# Per-category frames are collected in lists and concatenated once after the loop.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat works on every version.
reorder_parts = []
share_parts = []
median_parts = []
overall_parts = []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### global median (over purchase rows, all buyers)
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### flag repurchase (vectorised; a missing count compares False -> 'single-purchase')
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] > 1, 'repurchase', 'single-purchase')

    ### gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # rename/assign return new frames, so nothing is written into a filtered slice
    share = (share.loc[share['repurchase_temp'] == 'repurchase']
                  .rename(columns={'gmv_value_brl': 'gmv_value_brl_repurchase'})
                  .assign(category_cleaned=d))
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_parts.append(share)

    ### median of the per-buyer GMV total, buyers with a non-positive total excluded
    med = aux1[['user_buyer_id', 'repurchase_temp',
                'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = (med.loc[med['repurchase_temp'] == 'repurchase']
              .rename(columns={'gmv_value_brl': 'median_gmv_repurchase'})
              .assign(category_cleaned=d))
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_parts.append(med)

    ### distinct buyers per (flag, purchase count); the '%' column and the sort are not used further in this cell
    rr = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)

    ### share of buyers who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2.loc[rr2['repurchase_temp'] == 'repurchase', ['%']].assign(category_cleaned=d)
    rr3 = rr3[['category_cleaned', '%']]
    reorder_parts.append(rr3)

# sort df
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset: one row per category, highest reorder rate first
reorder = (reorder
           .merge(share_gmv, how='left', on='category_cleaned')
           .merge(median_gmv, how='left', on='category_cleaned')
           .merge(overall_gmv, how='left', on='category_cleaned')
           .sort_values('%', ascending=False))
# comparisons against NaN are False, so a missing median yields 0 here
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
# NOTE(review): 'median_gmv_repurchase' is a per-buyer total while 'overall_median_gmv' is a
# per-purchase-row median - confirm that comparing the two is intended.
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette: one identical colour per plotted category
palette_categories_analysis = ['#EF4E23'] * len(reorder)

fonttitle = 45
annotate_size = 45
_grey = '#707780'

# Single source of truth for the bar order. Panel 4 pairs 'diff_median' with the bar patches
# by position, so every panel must be drawn in exactly the row order of `reorder`.
category_order = list(reorder['category_cleaned'])

# (x column, panel title, x-axis upper limit, bar-label format, label also shows diff_median)
# NOTE(review): the titles say "Avg." although the plotted values are medians, and the last
# label prints "p.p" although diff_median is a relative difference in percent - confirm wording.
panels = [
    ('%', "Reorder Rate (%)\n \n ", 60, " %.1f%%", False),
    ('gmv_share_repurchase', "Generated GMV (%)\nfrom buyers\nwho repurchase ", 60, " %.1f%%", False),
    ('overall_median_gmv', "Global Avg. Ticket\nPrice (BRL)\n ", 2000, " %.0f", False),
    ('median_gmv_repurchase',
     "Avg. Ticket Price (BRL)\nfrom buyers who repurchase\n(% diff from global ticket price)",
     2000, " %.0f (%.1f p.p)", True),
]

plt.figure(figsize=(35, 40))
for idx, (x_col, title, x_max, label_fmt, with_diff) in enumerate(panels, start=1):
    plt.subplot(1, 4, idx)
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x=x_col, data=reorder,
                        palette=palette_categories_analysis, order=category_order)

    # text written right next to the end of every bar
    if with_diff:
        labelled = [(p, label_fmt % (p.get_width(), diff))
                    for diff, p in zip(reorder['diff_median'], splot.patches)]
    else:
        labelled = [(p, label_fmt % p.get_width()) for p in splot.patches]
    for p, label in labelled:
        splot.annotate(label, xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', fontsize=annotate_size, color=_grey)

    splot.set_title(title, color=_grey, fontsize=fonttitle, loc='left', pad=40)
    plt.ylabel("")
    plt.xlim(0, x_max)
    splot.yaxis.label.set_color(_grey)
    splot.spines['left'].set_color(_grey)
    if idx == 1:
        # only the first panel carries the category names
        splot.tick_params(axis='y', colors=_grey, labelsize=50, pad=30)
    else:
        splot.axes.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        splot.spines[side].set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.05)
plt.show()

##### Producer segment - LARGE

In [None]:
# Repurchase analysis per category for one producer segment.
# For every category (SPARKLE excluded) this cell derives
#   '%'                     - share of buyers flagged 'repurchase' (more than one purchase row)
#   'gmv_share_repurchase'  - share of the category GMV generated by those buyers
#   'overall_median_gmv'    - median gmv_value_brl over all purchase rows of the category
#   'median_gmv_repurchase' - median of the per-buyer GMV total among 'repurchase' buyers
# and draws the four metrics as side-by-side horizontal bar panels.
segment = 'LARGE'
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)]

# categories ordered by number of purchases (descending)
category = list(prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index().sort_values('purchase_id', ascending=False)['category_cleaned'])

# Per-category frames are collected in lists and concatenated once after the loop.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat works on every version.
reorder_parts = []
share_parts = []
median_parts = []
overall_parts = []

for d in category:
    aux1 = prov[prov['category_cleaned'] == d]

    ### global median (over purchase rows, all buyers)
    global_med = pd.DataFrame({'category_cleaned': [d],
                               'overall_median_gmv': aux1['gmv_value_brl'].median()})
    overall_parts.append(global_med)

    ### count purchases per user
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')

    ### flag repurchase (vectorised; a missing count compares False -> 'single-purchase')
    aux1['repurchase_temp'] = np.where(aux1['no_repurchase_temp'] > 1, 'repurchase', 'single-purchase')

    ### gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # rename/assign return new frames, so nothing is written into a filtered slice
    share = (share.loc[share['repurchase_temp'] == 'repurchase']
                  .rename(columns={'gmv_value_brl': 'gmv_value_brl_repurchase'})
                  .assign(category_cleaned=d))
    share = share[['category_cleaned', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_parts.append(share)

    ### median of the per-buyer GMV total, buyers with a non-positive total excluded
    med = aux1[['user_buyer_id', 'repurchase_temp',
                'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index()
    med = med[med['gmv_value_brl'] > 0]
    med = med[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').median().reset_index()
    med = (med.loc[med['repurchase_temp'] == 'repurchase']
              .rename(columns={'gmv_value_brl': 'median_gmv_repurchase'})
              .assign(category_cleaned=d))
    med = med[['category_cleaned', 'median_gmv_repurchase']]
    median_parts.append(med)

    ### distinct buyers per (flag, purchase count); the '%' column and the sort are not used further in this cell
    rr = aux1[['no_repurchase_temp', 'repurchase_temp', 'user_buyer_id']].groupby(['repurchase_temp', 'no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)

    ### share of buyers who repurchase
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    rr3 = rr2.loc[rr2['repurchase_temp'] == 'repurchase', ['%']].assign(category_cleaned=d)
    rr3 = rr3[['category_cleaned', '%']]
    reorder_parts.append(rr3)

# sort df
reorder = pd.concat(reorder_parts).sort_values('%', ascending=False)
share_gmv = pd.concat(share_parts).sort_values('gmv_share_repurchase', ascending=False)
median_gmv = pd.concat(median_parts).sort_values('median_gmv_repurchase', ascending=False)
overall_gmv = pd.concat(overall_parts)

# set main dataset: one row per category, highest reorder rate first
reorder = (reorder
           .merge(share_gmv, how='left', on='category_cleaned')
           .merge(median_gmv, how='left', on='category_cleaned')
           .merge(overall_gmv, how='left', on='category_cleaned')
           .sort_values('%', ascending=False))
# comparisons against NaN are False, so a missing median yields 0 here
reorder['median_lower'] = (reorder['median_gmv_repurchase'] < reorder['overall_median_gmv']).astype(int)
# NOTE(review): 'median_gmv_repurchase' is a per-buyer total while 'overall_median_gmv' is a
# per-purchase-row median - confirm that comparing the two is intended.
reorder['diff_median'] = (reorder['median_gmv_repurchase'] - reorder['overall_median_gmv']) / reorder['overall_median_gmv'] * 100
reorder['diff_median_abs'] = reorder['diff_median'].abs()

# set palette: one identical colour per plotted category
palette_categories_analysis = ['#EF4E23'] * len(reorder)

fonttitle = 45
annotate_size = 45
_grey = '#707780'

# Single source of truth for the bar order. Panel 4 pairs 'diff_median' with the bar patches
# by position, so every panel must be drawn in exactly the row order of `reorder`.
category_order = list(reorder['category_cleaned'])

# (x column, panel title, x-axis upper limit, bar-label format, label also shows diff_median)
# The two ticket panels use a 4000 BRL axis limit in this segment.
# NOTE(review): the titles say "Avg." although the plotted values are medians, and the last
# label prints "p.p" although diff_median is a relative difference in percent - confirm wording.
panels = [
    ('%', "Reorder Rate (%)\n \n ", 60, " %.1f%%", False),
    ('gmv_share_repurchase', "Generated GMV (%)\nfrom buyers\nwho repurchase ", 60, " %.1f%%", False),
    ('overall_median_gmv', "Global Avg. Ticket\nPrice (BRL)\n ", 4000, " %.0f", False),
    ('median_gmv_repurchase',
     "Avg. Ticket Price (BRL)\nfrom buyers who repurchase\n(% diff from global ticket price)",
     4000, " %.0f (%.1f p.p)", True),
]

plt.figure(figsize=(35, 40))
for idx, (x_col, title, x_max, label_fmt, with_diff) in enumerate(panels, start=1):
    plt.subplot(1, 4, idx)
    sns.set_style('white')
    splot = sns.barplot(y='category_cleaned', x=x_col, data=reorder,
                        palette=palette_categories_analysis, order=category_order)

    # text written right next to the end of every bar
    if with_diff:
        labelled = [(p, label_fmt % (p.get_width(), diff))
                    for diff, p in zip(reorder['diff_median'], splot.patches)]
    else:
        labelled = [(p, label_fmt % p.get_width()) for p in splot.patches]
    for p, label in labelled:
        splot.annotate(label, xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points',
                       ha='left', va='center', fontsize=annotate_size, color=_grey)

    splot.set_title(title, color=_grey, fontsize=fonttitle, loc='left', pad=40)
    plt.ylabel("")
    plt.xlim(0, x_max)
    splot.yaxis.label.set_color(_grey)
    splot.spines['left'].set_color(_grey)
    if idx == 1:
        # only the first panel carries the category names
        splot.tick_params(axis='y', colors=_grey, labelsize=50, pad=30)
    else:
        splot.axes.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        splot.spines[side].set_visible(False)
    splot.axes.get_xaxis().set_visible(False)

plt.subplots_adjust(wspace=0.05)
plt.show()


In [None]:
# Four side-by-side horizontal bar panels (one bar per category), all drawn in the
# same category order: descending reorder rate ('%').
plt.figure(figsize = (35,40))

label_grey = '#707780'
# Sort once and reuse: the bars are drawn in this order, so any per-row value that
# is zipped with the bars (diff_median in the last panel) must come from the SAME
# sorted frame, otherwise annotations can land on the wrong category.
ranked = reorder.sort_values('%', ascending=False)
category_order = list(ranked['category_cleaned'])


def _reorder_panel(position, value_col, title, x_max, show_categories=False):
    """
    Draw one borderless horizontal bar panel and return its axes.

    position: 3-digit subplot code handed to plt.subplot (e.g. 141)
    value_col: column of the reorder frame plotted on the x axis
    title: left-aligned panel title
    x_max: upper limit of the x axis
    show_categories: keep the category tick labels (used for the leftmost panel only)
    """
    ax = plt.subplot(position)
    sns.set_style('white')
    sns.barplot(y = 'category_cleaned', x = value_col, data = ranked,
                palette = palette_categories_analysis, order = category_order, ax = ax)
    ax.set_title(title, color = label_grey, fontsize = fonttitle, loc = 'left', pad=40)
    ax.set_ylabel("")
    ax.set_xlim(0, x_max)
    ax.yaxis.label.set_color(label_grey)
    ax.spines['left'].set_color(label_grey)
    if show_categories:
        ax.tick_params(axis='y', colors=label_grey, labelsize=50, pad = 30)
    else:
        ax.axes.get_yaxis().set_visible(False)
    # no frame and no x axis: the values are written next to the bars instead
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.axes.get_xaxis().set_visible(False)
    return ax


def _annotate_bars(ax, texts):
    """
    Write one text label just to the right of each bar, vertically centred.

    ax: axes returned by _reorder_panel
    texts: one label per bar, in the order the bars were drawn
    """
    for p, text in zip(ax.patches, texts):
        ax.annotate(text, xy=(p.get_width(), p.get_y()+p.get_height()/2),
                    xytext=(5,0), textcoords='offset points',
                    ha='left', va='center', fontsize = annotate_size, color = label_grey)


# panel 1: reorder rate, the only panel that shows the category names
splot = _reorder_panel(141, '%', f"Reorder Rate (%)\n" + " \n" + " ", 60, show_categories=True)
_annotate_bars(splot, [" %.1f" % p.get_width() + '%' for p in splot.patches])

# panel 2: share of GMV generated by buyers who repurchase
splot = _reorder_panel(142, 'gmv_share_repurchase',
                       f"Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", 60)
_annotate_bars(splot, [" %.1f" % p.get_width() + '%' for p in splot.patches])

# panel 3: overall ticket price
# NOTE(review): the title says "Avg." but the plotted column is named
# overall_median_gmv - confirm which statistic is intended.
splot = _reorder_panel(143, 'overall_median_gmv',
                       f"Global Avg. Ticket\n" + "Price (BRL)\n" + " ", 4000)
_annotate_bars(splot, [" %.0f" % p.get_width() for p in splot.patches])

# panel 4: ticket price of repurchasing buyers plus the difference to the global one
splot = _reorder_panel(144, 'median_gmv_repurchase',
                       f"Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "(% diff from global ticket price)",
                       4000)
_annotate_bars(splot, [" %.0f" % p.get_width() + ' ' + "(%.1f p.p)" % n
                       for n, p in zip(ranked['diff_median'], splot.patches)])

plt.subplots_adjust(wspace=0.05)
plt.show()

#### Producers < 10k BRL LTV

- For this analysis, repurchases can be related to any producer within the same threshold. 
- Premise: if a repurchase was related to a producer from a different threshold, this purchase will be discarded. 

In [None]:
# Share of purchases per category, drawn once for each value of `is_below_10_entire`
# ('below' / 'above'), with the SPARKLE category excluded.
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# categories ordered by total number of purchases (defines the bar order of both panels)
category = list(prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index().sort_values('purchase_id', ascending=False)['category_cleaned'])
# offices ordered by number of purchases; not used below in this cell
# NOTE(review): kept in case another cell reads `office` - confirm before removing
office = list(df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index().sort_values('purchase_id', ascending=False)['user_office_name'])


plt.figure(figsize = (20,15))
for e, f in enumerate(['below', 'above']):
    plt.subplot(1,2,e+1)
    aux0 = prov[prov['is_below_10_entire'] == f]
    aux1 = aux0[['purchase_id', 'category_cleaned']].groupby('category_cleaned').count().reset_index()
    aux1['%'] = aux1['purchase_id'] / aux1['purchase_id'].sum() * 100
    # categories with no purchase in this group still need a (zero) row so that both
    # panels show the same bars; add them in one concat (DataFrame.append no longer
    # exists in pandas >= 2.0)
    present = set(aux1['category_cleaned'])
    missing = [g for g in category if g not in present]
    if missing:
        filler = pd.DataFrame({'category_cleaned': missing, 'purchase_id': 0, '%': 0})
        aux1 = pd.concat([aux1, filler], ignore_index=True)
    # colours are assigned by rank within the group (largest share gets the first colour)
    # assumes palette_topics[:-1] has exactly one entry per category - TODO confirm
    aux1 = aux1.sort_values('%', ascending=False)
    aux1['hex'] = palette_topics[:-1]
    # re-align rows (and their colours) to the global category order used for plotting
    aux2 = pd.DataFrame(category, columns =['category_cleaned'])
    aux2 = aux2.merge(aux1, how = 'left', on = 'category_cleaned')
    sns.set_style('white')
    splot = sns.barplot(y = 'category_cleaned', x = '%', data = aux2, palette = list(aux2['hex']), order = category);
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{f} | {aux1['purchase_id'].sum():,}")
    plt.yticks(fontsize =16)
    plt.xlabel("Num of Purchases (%)")
    plt.ylabel("")
    plt.xlim(0, 100)
    # category labels only on the first (left) panel
    splot.axes.get_yaxis().set_visible(e == 0)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.3)
plt.show()

In [None]:
# Reorder rate per category (share of buyers with more than one purchase in that
# category), drawn once for each value of `is_below_10_entire` ('below' / 'above').
prov = df3[df3['category_cleaned'] != 'SPARKLE']
# categories ordered by total number of purchases (defines the bar order of both panels)
category = list(prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index().sort_values('purchase_id', ascending=False)['category_cleaned'])

plt.figure(figsize = (12,12))
for e, f in enumerate(['below', 'above']):
    plt.subplot(1,2,e+1)
    aux0 = prov[prov['is_below_10_entire'] == f]
    rates = []
    for d in category:
        # number of purchases each buyer made in this category
        purchases_per_buyer = aux0[aux0['category_cleaned'] == d].groupby('user_buyer_id')['purchase_id'].count()
        repeat_buyers = int((purchases_per_buyer > 1).sum())
        if repeat_buyers:
            rates.append({'category_cleaned': d, '%': repeat_buyers / len(purchases_per_buyer) * 100})
    # categories without any repurchasing buyer in this group get an explicit 0 row,
    # placed after the measured ones
    measured = {row['category_cleaned'] for row in rates}
    rates.extend({'category_cleaned': g, '%': 0} for g in category if g not in measured)
    # build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0)
    reorder = pd.DataFrame(rates, columns=['category_cleaned', '%'])
    # colours are assigned by rank within the group (highest rate gets the first colour)
    # assumes palette_topics[:-1] has exactly one entry per category - TODO confirm
    reorder = reorder.sort_values('%', ascending=False)
    reorder['hex'] = palette_topics[:-1]
    # re-align rows (and their colours) to the global category order used for plotting
    aux1 = pd.DataFrame(category, columns =['category_cleaned'])
    aux1 = aux1.merge(reorder, how = 'left', on = 'category_cleaned')
    sns.set_style('white')
    splot = sns.barplot(y = 'category_cleaned', x = '%', data = aux1, palette = list(aux1['hex']), order = category);
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{f}")
    plt.yticks(fontsize =16)
    plt.xlabel("Reorder Rate (%)")
    plt.ylabel("")
    plt.xlim(0, 80)
    # category labels only on the first (left) panel; the x axis is hidden on both
    splot.axes.get_yaxis().set_visible(e == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.1)
plt.show()

### (MATRIX) What are the categories with the highest reorder rate among repurchases? (by customer country, producer office, producer segment, producers below 10k BRL LTV)

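- In this analysis, the reorder rate differs slightly from the previous definition. Here we define it as the number of times a repurchase occurred for a given pair of categories (category of a purchase → category of the same buyer's next purchase), expressed as a share of all repurchases.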

#### Overall

In [None]:
# Category transition matrix over all categories: for each purchase, the category of
# the same buyer's next purchase, shown as a share of all such pairs.
# explicit copy: a column is added below and must not touch (or warn about) df3
prov = df3[df3['category_cleaned'] != 'SPARKLE'].copy()
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of (purchase, next purchase) pairs, shown in the title
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2')

fig, ax = plt.subplots(figsize=(30, 30))

# plot heatmap (pairs that never occur stay NaN and are left blank)
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# tick labels must be derived from THIS matrix; relying on yticks/xticks left over
# from another cell fails on a fresh kernel or mislabels the axes
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"Reorder Rate (%) | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 20)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

#### Overall (TOP 10 Categories - Transaction Volume)

In [None]:
# Category transition matrix restricted to the 10 most purchased categories; every
# other category is collapsed into 'Others'.
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[df3['category_cleaned'] != 'SPARKLE'].copy()
# categories ranked 11th and below by number of purchases
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of (purchase, next purchase) pairs, shown in the title
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2')
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when there are 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"Reorder Rate (%) | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Build (without plotting) the category transition matrix for the 10 most purchased
# categories; every other category is collapsed into 'Others'.
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[df3['category_cleaned'] != 'SPARKLE'].copy()
# categories ranked 11th and below by number of purchases
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of (purchase, next purchase) pairs
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2')
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when there are 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)

In [None]:

# Presentation version of the category matrix heatmap: grey title, ticks and axis
# labels. Plots whatever matrix is currently stored in `prov`
# (presumably the one built by the preceding cell - verify the execution order).
fig, ax = plt.subplots(figsize=(20, 20))

text_grey = '#707780'
colorbar_options = {"shrink": .5, 'format': PercentFormatter(1)}

# cells are annotated as percentages with one decimal
sns.heatmap(
    prov,
    annot=True,
    fmt=".1%",
    linewidths=5,
    cbar_kws=colorbar_options,
    square=True,
    ax=ax,
)

# category names are used exactly as stored (no upper-casing here)
yticks = list(prov.index)
xticks = list(prov.columns)

ax.set_title(
    f"Product Category Matrix\n" + "Reorder Rate (%)",
    color=text_grey,
    fontsize=fonttitle,
    loc='left',
    pad=60,
)

# keep the tick positions chosen by seaborn and only replace their labels
y_positions = plt.yticks()[0]
plt.yticks(y_positions, labels=yticks, rotation=0, fontsize=20, color=text_grey)
x_positions = plt.xticks()[0]
plt.xticks(x_positions, labels=xticks, rotation=90, fontsize=20, color=text_grey)

plt.ylabel('1st purchase', color=text_grey, fontsize=20)
plt.xlabel('2nd purchase', color=text_grey, fontsize=20)
plt.show()

#### Customer Country (TOP 10 Categories within TOP 5 Countries - Transaction Volume)

In [None]:
# Category transition matrix for the buyer country ranked `num` (0 = most purchases).
num = 0
# five buyer countries with the most purchases
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
top = list(country_counts.sort_values('purchase_id', ascending=False)['user_country'][0:5])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_country'] == top[num])].copy()
# categories ranked 11th and below (within this country) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the country has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Reorder Rate (%)")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the buyer country ranked `num` (0 = most purchases).
num = 1
# five buyer countries with the most purchases
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
top = list(country_counts.sort_values('purchase_id', ascending=False)['user_country'][0:5])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_country'] == top[num])].copy()
# categories ranked 11th and below (within this country) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the country has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Reorder Rate (%)")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the buyer country ranked `num` (0 = most purchases).
num = 2
# five buyer countries with the most purchases
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
top = list(country_counts.sort_values('purchase_id', ascending=False)['user_country'][0:5])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_country'] == top[num])].copy()
# categories ranked 11th and below (within this country) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the country has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Reorder Rate (%)")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the buyer country ranked `num` (0 = most purchases).
num = 3
# five buyer countries with the most purchases
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
top = list(country_counts.sort_values('purchase_id', ascending=False)['user_country'][0:5])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_country'] == top[num])].copy()
# categories ranked 11th and below (within this country) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the country has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Reorder Rate (%)")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the buyer country ranked `num` (0 = most purchases).
num = 4
# five buyer countries with the most purchases
country_counts = df3[['user_country', 'purchase_id']].groupby('user_country').count().reset_index()
top = list(country_counts.sort_values('purchase_id', ascending=False)['user_country'][0:5])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_country'] == top[num])].copy()
# categories ranked 11th and below (within this country) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the country has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Reorder Rate (%)")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

#### Producer Office (TOP 10 Categories - Transaction Volume)

- For this analysis, purchases can be related to any producer within the same office. 
- Premise: if a 2nd purchase was related to a producer from a different office, this purchase will be discarded. If the 3rd+ purchase comes from the same producer office, then it will be considered as the 2nd purchase.

In [None]:
# Category transition matrix for the producer office ranked `num` (0 = most purchases).
num = 0
# producer offices ordered by number of purchases (all offices, no top-N cut)
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
top = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == top[num])].copy()
# categories ranked 11th and below (within this office) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of pairs, shown in the title
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the office has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Repurchases: {sample_size:,}")

plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the producer office ranked `num` (0 = most purchases).
num = 1
# producer offices ordered by number of purchases (all offices, no top-N cut)
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
top = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == top[num])].copy()
# categories ranked 11th and below (within this office) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of pairs, shown in the title
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the office has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the producer office ranked `num` (0 = most purchases).
num = 2
# producer offices ordered by number of purchases (all offices, no top-N cut)
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
top = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == top[num])].copy()
# categories ranked 11th and below (within this office) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of pairs, shown in the title
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the office has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
# Category transition matrix for the producer office ranked `num` (0 = most purchases).
num = 3
# producer offices ordered by number of purchases (all offices, no top-N cut)
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
top = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
# explicit copy: `category_cleaned` is rewritten below and must not touch df3
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == top[num])].copy()
# categories ranked 11th and below (within this office) are collapsed into 'Others'
category_counts = prov[['category_cleaned', 'purchase_id']].groupby('category_cleaned').count().reset_index()
other_cat = list(category_counts.sort_values('purchase_id', ascending=False)['category_cleaned'][10::])
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'
# category of the buyer's next purchase in chronological order (NaN for the last one)
prov['category_cleaned2'] = prov.sort_values('purchase_release_datetime').groupby('user_buyer_id')['category_cleaned'].shift(-1)
prov = prov[prov['category_cleaned2'].notnull()]
prov = prov[['category_cleaned', 'category_cleaned2', 'purchase_id']].groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
# share of each category pair among all (purchase, next purchase) pairs
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
# total number of pairs, shown in the title
sample_size = prov['purchase_id'].sum()
prov = prov.drop('purchase_id', axis = 1).pivot_table(index = 'category_cleaned', values = '%', columns = 'category_cleaned2').fillna(0)
# move 'Others' to the end of both axes; guarded because the bucket does not exist
# when the office has 10 or fewer categories
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# NOTE(review): rows are re-indexed with the COLUMN labels, so a category that only
# ever appears as a next purchase becomes an all-NaN row - confirm this is intended
prov = prov.reindex(index = first_purchase_cat)
fig, ax = plt.subplots(figsize=(20, 20))

# plot heatmap
sns.heatmap(prov,annot=True, fmt=".1%", 
           linewidths=5,
           cbar_kws={"shrink": .5, 'format': PercentFormatter(1)}, square=True, ax=ax)
# ticks
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize = 15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize = 15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
num = 4
# Values of `user_office_name` ordered by purchase-row count (largest first);
# `num` selects which entry of that ranking this cell plots.
top = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)
# Rows of the selected office, excluding the SPARKLE category.
# .copy() gives an independent frame, so the relabelling below never writes
# into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == top[num])].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
num = 5
# Values of `user_office_name` ordered by purchase-row count (largest first);
# `num` selects which entry of that ranking this cell plots.
top = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)
# Rows of the selected office, excluding the SPARKLE category.
# .copy() gives an independent frame, so the relabelling below never writes
# into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['user_office_name'] == top[num])].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{top[num]} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

#### Producer Segment (TOP 10 Categories - Transaction Volume)

- For this analysis, 1st and 2nd purchases can be related to any producer within the same segment. 
- Premise: if a 2nd purchase was related to a producer from a different segment, this purchase will be discarded. If the 3rd+ purchase comes from the same producer segment, then it will be considered as the 2nd purchase.

In [None]:
segment = 'SEED'
# Rows of the selected `segment` value, excluding the SPARKLE category.
# .copy() gives an independent frame, so the relabelling below never writes
# into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{segment} | Repurchases: {sample_size:,}")

plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
segment = 'SMALL'
# Rows of the selected `segment` value, excluding the SPARKLE category.
# .copy() gives an independent frame, so the relabelling below never writes
# into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{segment} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
segment = 'MEDIUM'
# Rows of the selected `segment` value, excluding the SPARKLE category.
# .copy() gives an independent frame, so the relabelling below never writes
# into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{segment} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
segment = 'LARGE'
# Rows of the selected `segment` value, excluding the SPARKLE category.
# .copy() gives an independent frame, so the relabelling below never writes
# into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['segment'] == segment)].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{segment} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

#### Producers < 10k BRL LTV

- For this analysis, 1st and 2nd purchases can be related to any producer within the same threshold. 
- Premise: if a 2nd purchase was related to a producer from a different threshold, this purchase will be discarded. If the 3rd+ purchase comes from the same producer threshold, then it will be considered as the 2nd purchase.

In [None]:
threshold = 'below'

# Rows whose `is_below_10_entire` equals `threshold`, excluding the SPARKLE
# category. .copy() gives an independent frame, so the relabelling below
# never writes into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['is_below_10_entire'] == threshold)].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{threshold} | Repurchases: {sample_size:,}")

plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

In [None]:
threshold = 'above'

# Rows whose `is_below_10_entire` equals `threshold`, excluding the SPARKLE
# category. .copy() gives an independent frame, so the relabelling below
# never writes into a slice of df3.
prov = df3[(df3['category_cleaned'] != 'SPARKLE') & (df3['is_below_10_entire'] == threshold)].copy()
# Categories ranked after the 10 most frequent ones are folded into 'Others'.
other_cat = list(
    prov[['category_cleaned', 'purchase_id']]
    .groupby('category_cleaned').count().reset_index()
    .sort_values('purchase_id', ascending=False)['category_cleaned'][10:]
)
prov.loc[prov['category_cleaned'].isin(other_cat), 'category_cleaned'] = 'Others'

# Pair each purchase with the category of the same buyer's next purchase in
# time: shift(-1) runs inside each buyer group of the time-sorted frame and
# the result is aligned back on the row index.
# NOTE(review): alignment assumes df3 has a unique index - confirm.
prov['category_cleaned2'] = (
    prov.sort_values('purchase_release_datetime')
    .groupby('user_buyer_id')['category_cleaned']
    .shift(-1)
)
# A buyer's last purchase has no successor and is dropped.
prov = prov[prov['category_cleaned2'].notnull()]
# Count rows per (category, next category) pair and express them as a share
# of all pairs.
prov = (
    prov[['category_cleaned', 'category_cleaned2', 'purchase_id']]
    .groupby(['category_cleaned', 'category_cleaned2']).count().reset_index()
)
prov['%'] = prov['purchase_id'] / prov['purchase_id'].sum()
sample_size = prov['purchase_id'].sum()
prov = (
    prov.drop('purchase_id', axis=1)
    .pivot_table(index='category_cleaned', values='%', columns='category_cleaned2')
    .fillna(0)
)
# Put 'Others' last on both axes. The guard covers the case where no
# 'Others' column exists (e.g. 10 or fewer categories).
first_purchase_cat = list(prov.columns)
if 'Others' in first_purchase_cat:
    first_purchase_cat.remove('Others')
    first_purchase_cat.append('Others')
prov = prov[first_purchase_cat]
# Rows follow the column order; a category that only ever appears as a next
# purchase gets a row of zeros instead of NaN, consistent with fillna(0).
prov = prov.reindex(index=first_purchase_cat, fill_value=0)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    prov, annot=True, fmt=".1%", linewidths=5,
    cbar_kws={"shrink": .5, 'format': PercentFormatter(1)},
    square=True, ax=ax,
)
# Upper-case tick labels, placed on the tick positions seaborn created.
yticks = [i.upper() for i in prov.index]
xticks = [i.upper() for i in prov.columns]
plt.title(f"{threshold} | Repurchases: {sample_size:,}")
plt.yticks(plt.yticks()[0], labels=yticks, rotation=0, fontsize=15)
plt.xticks(plt.xticks()[0], labels=xticks, rotation=90, fontsize=15)
plt.ylabel('1st purchase')
plt.xlabel('2nd purchase')

plt.show()

### How many customers repurchase within the same product category (but from different producers)? (by customer country, producer office, producers below 10k BRL LTV)

- In this analysis, we considered only 1st and 2nd purchases from a customer. 

#### Overall

In [None]:
# Rows flagged 'repurchase' in df3, restricted to the columns used below.
aux1 = df3.loc[
    df3['repurchase'] == 'repurchase',
    ['user_buyer_id', 'category', 'producer_id', 'purchase_id', 'product_id', 'order_purchase'],
]
# Purchases ranked 1 and 2 per buyer; the rank-2 columns get a "2" suffix
# so both can sit side by side after the merge.
aux2 = aux1.loc[aux1['order_purchase'] == 1, ['user_buyer_id', 'producer_id', 'category']]
aux3 = aux1.loc[aux1['order_purchase'] == 2, ['user_buyer_id', 'producer_id', 'category']].rename(
    columns={'producer_id': 'producer_id2', 'category': 'category2'}
)
aux4 = aux3.merge(aux2, how='left', on='user_buyer_id')

# Row-wise labels: does purchase 2 share the category / the producer of purchase 1?
aux4['same_cat'] = aux4.apply(
    lambda row: 'diff_cat' if row['category'] != row['category2'] else 'same_cat',
    axis=1,
)
aux4['same_prod'] = aux4.apply(
    lambda row: 'diff_prod' if row['producer_id'] != row['producer_id2'] else 'same_prod',
    axis=1,
)
# 'answer' only distinguishes producers inside the same category; every
# other row is labelled 'NA'.
aux4['answer'] = aux4.apply(
    lambda row: 'NA' if row['same_cat'] != 'same_cat'
    else ('same_producer_same_cat' if row['same_prod'] == 'same_prod' else 'diff_producer_same_cat'),
    axis=1,
)

# Share of rows per same_cat label.
aux5 = aux4.groupby('same_cat')[['user_buyer_id']].count().reset_index()
aux5['%'] = aux5['user_buyer_id'] / aux5['user_buyer_id'].sum() * 100
# Share of rows per answer label, among same-category rows only.
aux6 = aux4.loc[aux4['same_cat'] == 'same_cat'].groupby('answer')[['user_buyer_id']].count().reset_index()
aux6['%'] = aux6['user_buyer_id'] / aux6['user_buyer_id'].sum() * 100

# Horizontal bars annotated with their value at the bar end.
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(y='same_cat', x='%', data=aux5, palette=palette_types, order=['diff_cat', 'same_cat'])

for p in splot.patches:
    bar_len = p.get_width()
    splot.annotate(
        f"{bar_len:.2f}",
        xy=(bar_len, p.get_y() + p.get_height() / 2),
        xytext=(5, 0), textcoords='offset points', ha='left', va='center',
    )
plt.title("Users who repurchased: same vs different category (%)")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N = {(aux5['user_buyer_id'].sum()):,})")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

In [None]:
# Horizontal bar chart of the `aux6` shares (same vs different producer within
# the same category), each bar annotated with its value.
plt.figure(figsize=(10, 10))
sns.set_style('white')
splot = sns.barplot(
    y='answer', x='%', data=aux6, palette=palette_types,
    order=['diff_producer_same_cat', 'same_producer_same_cat'],
)

for p in splot.patches:
    bar_len = p.get_width()
    # Label sits 5 points to the right of the bar end, vertically centred.
    splot.annotate(
        f"{bar_len:.2f}",
        xy=(bar_len, p.get_y() + p.get_height() / 2),
        xytext=(5, 0), textcoords='offset points', ha='left', va='center',
    )
plt.title("Customers who repurchase within the same product category (but different producers) (%)")
plt.yticks(fontsize=16)
plt.xlabel(f"% (N = {(aux6['user_buyer_id'].sum()):,})")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

#### Producer Office (Users who repurchased: same vs different category)

- For this analysis, repurchases can be related to any producer within the same office. 
- Premise: if a repurchase was related to a producer from a different office, this purchase will be discarded. 

In [None]:
# Values of `user_office_name` ordered by purchase-row count (largest first).
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

# Panel grid: 3 per row and never fewer than the original 2 rows. Rows are
# added when there are more than 6 offices so plt.subplot() cannot be asked
# for a panel outside the grid.
n_cols = 3
n_rows = max(2, math.ceil(len(office) / n_cols))

plt.figure(figsize=(20, 15))
for e, f in enumerate(office):
    plt.subplot(n_rows, n_cols, e + 1)
    # Rows of this office, filtered once and reused for both frames below.
    office_rows = df3[df3['user_office_name'] == f]
    aux1 = office_rows[['user_buyer_id', 'category', 'producer_id', 'purchase_id', 'product_id', 'purchase_order_datetime']]
    # Number of purchase rows per buyer inside this office.
    aux2 = office_rows[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # Buyers whose count differs from 1 are flagged 'repurchase' and kept.
    aux1['repurchase'] = aux1['no_repurchase'].apply(lambda x: 'repurchase' if x != 1 else 'single-purchase')
    # .copy() so the rank column is added to an independent frame.
    aux1 = aux1[aux1['repurchase'] == 'repurchase'].copy()

    # Chronological rank of each purchase within a buyer; method="first"
    # breaks ties by row order, so every buyer has exactly one rank 1 and 2.
    aux1['order_purchase_temp'] = aux1.groupby("user_buyer_id")["purchase_order_datetime"].rank(method="first", ascending=True)

    # First and second purchase of every buyer, joined side by side.
    aux2 = aux1[aux1['order_purchase_temp'] == 1][['user_buyer_id', 'producer_id', 'category']]
    aux3 = aux1[aux1['order_purchase_temp'] == 2][['user_buyer_id', 'producer_id', 'category']]
    aux3.columns = ['user_buyer_id', 'producer_id2', 'category2']
    aux4 = aux3.merge(aux2, how='left', on='user_buyer_id')
    aux4['same_cat'] = aux4.apply(lambda x: 'same_cat' if x['category'] == x['category2'] else 'diff_cat', axis=1)
    aux4['same_prod'] = aux4.apply(lambda x: 'same_prod' if x['producer_id'] == x['producer_id2'] else 'diff_prod', axis=1)
    aux4['answer'] = aux4.apply(
        lambda x: 'diff_producer_same_cat' if (x['same_cat'] == 'same_cat') & (x['same_prod'] == 'diff_prod')
        else 'same_producer_same_cat' if (x['same_cat'] == 'same_cat') & (x['same_prod'] == 'same_prod')
        else 'NA',
        axis=1,
    )
    # Share of buyers per same_cat label (this is what the panel plots).
    aux5 = aux4[['user_buyer_id', 'same_cat']].groupby('same_cat').count().reset_index()
    aux5['%'] = aux5['user_buyer_id'] / aux5['user_buyer_id'].sum() * 100
    # Share per answer label among same-category buyers (not plotted in this cell).
    aux6 = aux4[aux4['same_cat'] == 'same_cat'][['user_buyer_id', 'answer']].groupby('answer').count().reset_index()
    aux6['%'] = aux6['user_buyer_id'] / aux6['user_buyer_id'].sum() * 100

    sns.set_style('white')
    splot = sns.barplot(y='same_cat', x='%', data=aux5, palette=palette_types, order=['diff_cat', 'same_cat'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}", xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{f} | N: {(millify(aux5['user_buyer_id'].sum()))}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # Only the first panel of each row shows the y axis; the x axis is hidden everywhere.
    splot.axes.get_yaxis().set_visible(e % n_cols == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producer Office (Answer)

- For this analysis, repurchases can be related to any producer within the same office. 
- Premise: if a repurchase was related to a producer from a different office, this purchase will be discarded. 

In [None]:
# Bar colours keyed by the 'different' / 'same' answer labels.
# NOTE(review): both labels map to the same grey (#9EA4AC); the earlier inline
# comment read "grey, orange", so 'same' may have been meant to be orange - confirm.
palette_diff_same_prod = {'different': '#9EA4AC', 'same': '#9EA4AC'}


In [None]:
# Values of `user_office_name` ordered by purchase-row count (largest first).
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)
# Empty frame; it is not read anywhere in this cell.
repurchase = pd.DataFrame()

# Panel grid: 2 per row and never fewer than the original 3 rows. Rows are
# added when there are more than 6 offices so plt.subplot() cannot be asked
# for a panel outside the grid.
n_cols = 2
n_rows = max(3, math.ceil(len(office) / n_cols))

plt.figure(figsize=(15, 20))
for e, f in enumerate(office):
    plt.subplot(n_rows, n_cols, e + 1)
    # Rows of this office, filtered once and reused for both frames below.
    office_rows = df3[df3['user_office_name'] == f]
    aux1 = office_rows[['user_buyer_id', 'category', 'producer_id', 'purchase_id', 'product_id', 'purchase_order_datetime']]
    # Number of purchase rows per buyer inside this office.
    aux2 = office_rows[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # Buyers whose count differs from 1 are flagged 'repurchase' and kept.
    aux1['repurchase'] = aux1['no_repurchase'].apply(lambda x: 'repurchase' if x != 1 else 'single-purchase')
    # .copy() so the rank column is added to an independent frame.
    aux1 = aux1[aux1['repurchase'] == 'repurchase'].copy()

    # Chronological rank of each purchase within a buyer; method="first"
    # breaks ties by row order, so every buyer has exactly one rank 1 and 2.
    aux1['order_purchase_temp'] = aux1.groupby("user_buyer_id")["purchase_order_datetime"].rank(method="first", ascending=True)

    # First and second purchase of every buyer, joined side by side.
    aux2 = aux1[aux1['order_purchase_temp'] == 1][['user_buyer_id', 'producer_id', 'category']]
    aux3 = aux1[aux1['order_purchase_temp'] == 2][['user_buyer_id', 'producer_id', 'category']]
    aux3.columns = ['user_buyer_id', 'producer_id2', 'category2']
    aux4 = aux3.merge(aux2, how='left', on='user_buyer_id')
    aux4['same_cat'] = aux4.apply(lambda x: 'same_cat' if x['category'] == x['category2'] else 'diff_cat', axis=1)
    aux4['same_prod'] = aux4.apply(lambda x: 'same_prod' if x['producer_id'] == x['producer_id2'] else 'diff_prod', axis=1)
    # Within the same category: 'different' vs 'same' producer; otherwise 'NA'.
    aux4['answer'] = aux4.apply(
        lambda x: 'different' if (x['same_cat'] == 'same_cat') & (x['same_prod'] == 'diff_prod')
        else 'same' if (x['same_cat'] == 'same_cat') & (x['same_prod'] == 'same_prod')
        else 'NA',
        axis=1,
    )
    # Share of buyers per same_cat label (not plotted in this cell).
    aux5 = aux4[['user_buyer_id', 'same_cat']].groupby('same_cat').count().reset_index()
    aux5['%'] = aux5['user_buyer_id'] / aux5['user_buyer_id'].sum() * 100
    # Share per answer label among same-category buyers (this is what the panel plots).
    aux6 = aux4[aux4['same_cat'] == 'same_cat'][['user_buyer_id', 'answer']].groupby('answer').count().reset_index()
    aux6['%'] = aux6['user_buyer_id'] / aux6['user_buyer_id'].sum() * 100

    sns.set_style('white')
    splot = sns.barplot(y='answer', x='%', data=aux6, palette=palette_diff_same_prod,
                        order=['different', 'same'])

    for p in splot.patches:
        splot.annotate(f"{p.get_width():.2f}" + '%', xy=(p.get_width(), p.get_y() + p.get_height() / 2),
                       xytext=(5, 0), textcoords='offset points', ha='left', va='center', color='#707780')
    plt.title(f"{f} | Buyers: {(aux6['user_buyer_id'].sum()):,}", color='#707780')
    plt.yticks(fontsize=30)
    plt.xlabel("")
    plt.ylabel("")
    # x range goes past 100 so the value label of a full-width bar still fits.
    plt.xlim(0, 120)
    # Only the first panel of each row shows the y axis; the x axis is hidden everywhere.
    splot.axes.get_yaxis().set_visible(e % n_cols == 0)
    splot.axes.get_xaxis().set_visible(False)

    # Grey styling for the axis label, frame and tick labels.
    splot.yaxis.label.set_color('#707780')
    for side in ('left', 'right', 'top', 'bottom'):
        splot.spines[side].set_color('#707780')
    splot.tick_params(axis='y', colors='#707780', labelsize=30, pad=20)
plt.subplots_adjust(hspace=0.15)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producers < 10k BRL LTV (Users who repurchased: same vs different category)

- For this analysis, repurchases can be related to any producer within the same LTV threshold group (below or above 10k BRL). 
- Premise: if a repurchase was related to a producer from a different threshold group, this purchase will be discarded. 

In [None]:
# Values of `user_office_name` ordered by purchase-row count (largest first).
# Not read in this cell; the panels below iterate over 'below' / 'above'.
office = list(
    df3[['user_office_name', 'purchase_id']]
    .groupby('user_office_name').count().reset_index()
    .sort_values('purchase_id', ascending=False)['user_office_name']
)

plt.figure(figsize=(20, 10))
for e, f in enumerate(['below', 'above']):
    plt.subplot(1, 2, e + 1)
    # Rows whose `is_below_10_entire` equals the current label.
    in_group = df3['is_below_10_entire'] == f
    aux1 = df3.loc[in_group, ['user_buyer_id', 'category', 'producer_id', 'purchase_id', 'product_id', 'purchase_order_datetime']]
    # Number of purchase rows per buyer inside this group.
    aux2 = df3.loc[in_group, ['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase']
    aux1 = aux1.merge(aux2, how='left', on='user_buyer_id')
    # Buyers whose count differs from 1 are flagged 'repurchase' and kept.
    aux1['repurchase'] = aux1['no_repurchase'].apply(
        lambda cnt: 'single-purchase' if cnt == 1 else 'repurchase'
    )
    aux1 = aux1[aux1['repurchase'] == 'repurchase'].copy()

    # Chronological rank of each purchase within a buyer (ties broken by row order).
    aux1['order_purchase_temp'] = (
        aux1.groupby("user_buyer_id")["purchase_order_datetime"]
        .rank(method="first", ascending=True)
    )

    # First and second purchase of every buyer, joined side by side.
    aux2 = aux1.loc[aux1['order_purchase_temp'] == 1, ['user_buyer_id', 'producer_id', 'category']]
    aux3 = aux1.loc[aux1['order_purchase_temp'] == 2, ['user_buyer_id', 'producer_id', 'category']].rename(
        columns={'producer_id': 'producer_id2', 'category': 'category2'}
    )
    aux4 = aux3.merge(aux2, how='left', on='user_buyer_id')
    aux4['same_cat'] = aux4.apply(
        lambda row: 'diff_cat' if row['category'] != row['category2'] else 'same_cat', axis=1
    )
    aux4['same_prod'] = aux4.apply(
        lambda row: 'diff_prod' if row['producer_id'] != row['producer_id2'] else 'same_prod', axis=1
    )
    # Producer comparison only applies inside the same category; else 'NA'.
    aux4['answer'] = aux4.apply(
        lambda row: 'NA' if row['same_cat'] != 'same_cat'
        else ('same_producer_same_cat' if row['same_prod'] == 'same_prod' else 'diff_producer_same_cat'),
        axis=1,
    )
    # Share of buyers per same_cat label (this is what the panel plots).
    aux5 = aux4.groupby('same_cat')[['user_buyer_id']].count().reset_index()
    aux5['%'] = aux5['user_buyer_id'] / aux5['user_buyer_id'].sum() * 100
    # Share per answer label among same-category buyers (not plotted in this cell).
    aux6 = aux4.loc[aux4['same_cat'] == 'same_cat'].groupby('answer')[['user_buyer_id']].count().reset_index()
    aux6['%'] = aux6['user_buyer_id'] / aux6['user_buyer_id'].sum() * 100

    sns.set_style('white')
    splot = sns.barplot(y='same_cat', x='%', data=aux5, palette=palette_types, order=['diff_cat', 'same_cat'])

    for p in splot.patches:
        bar_len = p.get_width()
        splot.annotate(
            f"{bar_len:.2f}",
            xy=(bar_len, p.get_y() + p.get_height() / 2),
            xytext=(5, 0), textcoords='offset points', ha='left', va='center',
        )
    plt.title(f"{f} | N: {(millify(aux5['user_buyer_id'].sum()))}")
    plt.yticks(fontsize=16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # Only the left panel shows the y axis; the x axis is hidden on both.
    splot.axes.get_yaxis().set_visible(e == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producers < 10k BRL LTV (Answer)

- For this analysis, repurchases can be related to any producer within the same segment. 
- Premise: if a repurchase was related to a producer from a different segment, this purchase will be discarded. 

In [None]:
# Offices ranked by number of purchases, busiest first.
# NOTE(review): `office` is not read by the chart below - confirm whether another cell still needs it.
office = list(df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index().sort_values('purchase_id', ascending=False)['user_office_name'])

# One panel per LTV group: among buyers whose 1st and 2nd purchase share a category,
# how many went back to the same producer vs. a different one.
plt.figure(figsize = (20,10))
for e, f in enumerate([ 'below', 'above']):
    plt.subplot(1,2,e+1)
    # set df - filter the LTV group once and reuse it for both selections
    group_purchases = df3[df3['is_below_10_entire'] == f]
    aux1 = group_purchases[['user_buyer_id', 'category', 'producer_id', 'purchase_id', 'product_id', 'purchase_order_datetime']]
    # count purchases per user
    aux2 = group_purchases[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase']
    aux1 = aux1.merge(aux2, how = 'left', on = 'user_buyer_id')
    # flag repurchase (vectorized instead of a per-row lambda)
    aux1['repurchase'] = np.where(aux1['no_repurchase'] != 1, 'repurchase', 'single-purchase')
    # filter users; .copy() so the rank column below is written to an independent frame
    aux1 = aux1[aux1['repurchase'] == 'repurchase'].copy()

    # get rank of purchases (1 = oldest purchase of the buyer)
    aux1['order_purchase_temp'] = aux1.groupby("user_buyer_id")["purchase_order_datetime"].rank(method="first", ascending=True)

    # get first and 2nd purchase of every user
    aux2 = aux1[aux1['order_purchase_temp'] == 1][['user_buyer_id', 'producer_id', 'category']]
    aux3 = aux1[aux1['order_purchase_temp'] == 2][['user_buyer_id', 'producer_id', 'category']].copy()
    aux3.columns = ['user_buyer_id', 'producer_id2', 'category2']
    aux4 = aux3.merge(aux2, how = 'left', on = 'user_buyer_id')

    # column-wise comparisons: a missing value on either side compares as "different",
    # exactly like the scalar == used by the former row-wise lambdas
    same_category = aux4['category'] == aux4['category2']
    same_producer = aux4['producer_id'] == aux4['producer_id2']
    aux4['same_cat'] = np.where(same_category, 'same_cat', 'diff_cat')
    aux4['same_prod'] = np.where(same_producer, 'same_prod', 'diff_prod')
    aux4['answer'] = np.select(
        [same_category & ~same_producer, same_category & same_producer],
        ['diff_producer_same_cat', 'same_producer_same_cat'],
        default = 'NA')

    # group by filter - users who repurchase at same categories (not plotted in this cell)
    aux5 = aux4[['user_buyer_id', 'same_cat']].groupby('same_cat').count().reset_index()
    aux5['%'] = aux5['user_buyer_id'] / aux5['user_buyer_id'].sum() * 100
    # group by filter - hypothesis answer (same-category repurchases only)
    aux6 = aux4[aux4['same_cat'] == 'same_cat'][['user_buyer_id', 'answer']].groupby('answer').count().reset_index()
    aux6['%'] = aux6['user_buyer_id'] / aux6['user_buyer_id'].sum() * 100
    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'answer', x = '%', data = aux6, palette = palette_types, order = ['diff_producer_same_cat', 'same_producer_same_cat']);

    # write the percentage next to the end of every bar
    for p in splot.patches:
        splot.annotate("%.2f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{f} | N: {(millify(aux6['user_buyer_id'].sum()))}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # only the first panel keeps its category labels; the x axis is hidden everywhere
    splot.axes.get_yaxis().set_visible(e == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.1)
plt.show()

### Do customers who repurchase a 2nd time complete the 1st purchase (Club)? What is the completion rate (% of modules completed)?

- Time is not accounted for in this analysis: progress rates are a snapshot taken at the time the query was performed.
- Only repurchases in which the 1st transaction was a club product were considered.
- If a user has completed a course, progress is considered to be 100% (even if the producer adds more modules, which affects the progress rate).
- If a user has progressed 85% or more, the course is considered completed.
- Repurchases with too many NAs across club data were discarded.
- Duplicate repurchases (same product_id) were removed.

#### Data Cleaning

In [None]:
# Club purchases only, narrowed to the fields the completion analysis works with.
club_fields = ['user_buyer_id', 'product_id', 'purchase_id', 'join_date', 'progress_club',
               'is_course_complete', 'completion_course_date', 'join_course_date',
               'membership_status', 'last_access', 'purchase_order_datetime']
aux1 = df3.loc[df3['is_club'] == 'club', club_fields]
# quick NA check (kept disabled): aux1.isna().sum() / len(aux1) * 100

In [None]:
# Rename the club fields. The assignment is positional, so it relies on the
# column order aux1 was selected with.
aux1.columns = ['user_buyer_id', 'product_id', 'purchase_id', 'join_date_elastic',
       'progress_club_elastic', 'is_course_complete_club', 'completion_course_date', 'join_course_date',
       'membership_status_club', 'last_access', 'purchase_order_datetime']

# transform ms to datetime - each converted field is stored under a '_club' suffix
for ms_field in ['join_course_date', 'completion_course_date', 'last_access']:
    aux1[ms_field + '_club'] = pd.to_datetime(aux1[ms_field], unit='ms')

# aux1['join_date_elastic'] = pd.to_datetime(aux1['join_date_elastic'].str.replace('T', ' ').str.replace('Z', ' '), format="%Y-%m-%d %H:%M:%S")
# join_date_elastic stays text: the 'T' and 'Z' markers are swapped for blanks
join_text = aux1['join_date_elastic'].str.replace('T', ' ')
aux1['join_date_elastic'] = join_text.str.replace('Z', ' ')

# the raw millisecond fields are no longer needed
aux1 = aux1.drop(columns = ['join_course_date', 'completion_course_date', 'last_access'])

# impute join_date_elastic from the course join date where only the latter is known
needs_join_date = aux1['join_date_elastic'].isna() & aux1['join_course_date_club'].notnull()
aux1.loc[needs_join_date, 'join_date_elastic'] = aux1['join_course_date_club']


In [None]:
# Progress distribution of purchases already flagged as completed
# (supports the assumption that ~85% progress counts as a completed course).
completed_with_progress = (aux1['is_course_complete_club'] == 1) & aux1['progress_club_elastic'].notnull()
get_descriptive_statistics(aux1.loc[completed_with_progress, ['progress_club_elastic']])

In [None]:
# Progress distribution of purchases flagged as NOT completed.
# Author's reading of the output: 95% percentile is 76% completion rate.
incomplete_with_progress = (aux1['is_course_complete_club'] == 0) & aux1['progress_club_elastic'].notnull()
get_descriptive_statistics(aux1.loc[incomplete_with_progress, ['progress_club_elastic']])

In [None]:
# Clean the club data (aux1) and build aux4: every transaction of the buyers that
# still have a usable club purchase, with join date / progress / completion flag
# refreshed from the cleaned club fields.
print(f"Total rows: {len(aux1):,}")
# if user has completed at least 85%, it will be considered completed course
# NOTE(review): the threshold is not a fixed 85 - it is the 25th percentile of progress
# among rows already flagged as completed; confirm it lands near 85 on the current data.
perc = np.percentile(aux1.loc[(aux1['is_course_complete_club'] == 1) & (aux1['progress_club_elastic'].notnull()), :][['progress_club_elastic']], 25)
# progress at or above the threshold -> flagged as completed
aux1.loc[(aux1['progress_club_elastic'] >= perc) & (aux1['progress_club_elastic'].notnull()), 'is_course_complete_club'] = 1
# flag missing but progress known and below the threshold -> flagged as incomplete
aux1.loc[(aux1['is_course_complete_club'].isna()) & (aux1['progress_club_elastic'].notnull()) & (aux1['progress_club_elastic'] < perc), 'is_course_complete_club'] = 0

# if user has completed course, progress will be 100%
# aux1.loc[(aux1['is_course_complete_club'] == 1), 'progress_club_elastic'] = 100
# incomplete courses with unknown progress are counted as 0 progress
aux1.loc[(aux1['is_course_complete_club'] == 0) & (aux1['progress_club_elastic'].isna()), 'progress_club_elastic'] = 0
# a recorded completion date forces progress to 100
aux1.loc[(aux1['completion_course_date_club'].notnull()), 'progress_club_elastic'] = 100


# calc impact - remove entries without join date
impact_df = len(aux1[(aux1['join_date_elastic'].isna())])
print(f"{impact_df:,} rows ({(impact_df/len(aux1) * 100):.2f}%) will be removed (due to many NAs)")

# remove entries without join date
# `transactions` keeps the purchase ids being dropped so they can also be removed from aux4 below
transactions = aux1[(aux1['join_date_elastic'].isna())][['purchase_id']].drop_duplicates()
transactions['remove'] = 1
aux1 = aux1[(aux1['join_date_elastic'].notnull())]

# remove duplicate transactions (same product)
# aux2: user-product pairs that appear in more than one club purchase
aux2 = aux1[['user_buyer_id', 'product_id', 'purchase_id']].groupby(['user_buyer_id', 'product_id']).count().reset_index()
aux2 = aux2[aux2['purchase_id'] > 1][['user_buyer_id', 'product_id']].drop_duplicates()
aux2['remove'] = 1
# the share is measured against every user-product pair in df3, not only the club ones
print(f"% of user-product with duplicates : {(len(aux2) / len(df3[['user_buyer_id', 'product_id']].drop_duplicates()) * 100):.2f}%" )
aux3 = aux1.merge(aux2, how = 'left', on = ['user_buyer_id', 'product_id'])
# `transactions1` keeps the purchase ids of the duplicated pairs, for the same clean-up on aux4
transactions1 = aux3[aux3['remove'] == 1][['purchase_id']].drop_duplicates()
transactions1['remove'] = 1
aux3 = aux3[aux3['remove'] != 1].drop('remove', axis = 1)

# get final df for analysis
aux4 = df3.merge(aux3[['user_buyer_id']].drop_duplicates(), how = 'inner', on = ['user_buyer_id']) # get only users of interest
aux4 = aux4.merge(transactions, how = 'left', on = 'purchase_id')
aux4 = aux4[aux4['remove'] != 1].drop('remove', axis = 1) # remove transactions with null join date
aux4 = aux4.merge(transactions1, how = 'left', on = 'purchase_id')
aux4 = aux4[aux4['remove'] != 1].drop('remove', axis = 1) # remove transactions with duplicates

# update new fields
# aux3 is reduced to one row per user-product and its columns get a '_temp' suffix
# so they can sit next to the df3 originals after the merge
aux3 = aux3[['user_buyer_id','product_id',  'join_date_elastic',
       'progress_club_elastic', 'is_course_complete_club']].drop_duplicates()
aux3.columns = ['user_buyer_id','product_id',  'join_date_elastic_temp',
       'progress_club_elastic_temp', 'is_course_complete_club_temp']
aux4 = aux4.merge(aux3, how = 'left', on = ['user_buyer_id','product_id']) # update fields
# overwrite the df3 values only where a cleaned value exists
aux4.loc[aux4['join_date_elastic_temp'].notnull(), 'join_date'] = aux4['join_date_elastic_temp']
aux4.loc[aux4['progress_club_elastic_temp'].notnull(), 'progress_club'] = aux4['progress_club_elastic_temp']
aux4.loc[aux4['is_course_complete_club_temp'].notnull(), 'is_course_complete'] = aux4['is_course_complete_club_temp']
# len(aux3) counts de-duplicated user-product rows, compared against all rows of df3
print(f"% of valid Club transactions over all df: {len(aux3):,} ({(len(aux3) / len(df3) * 100):.2f}%)" )


#### Overall

In [None]:
# Share (%) of each is_club value in the cleaned transactions, missing values included.
aux4['is_club'].value_counts(normalize=True, dropna=False).mul(100)

In [None]:
# set df - pair every purchase with the same buyer's next purchase (by order datetime)
aux5 = aux4.copy()
by_buyer = aux5.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux5['purchase_id2'] = by_buyer['purchase_id'].shift(-1)
aux5['is_club2'] = by_buyer['is_club'].shift(-1)

# keep only purchases that were followed by another one
has_next_purchase = aux5['purchase_id2'].notnull()
aux5 = aux5[has_next_purchase]
aux5['is_course_complete'] = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'})


club_first = aux5[aux5['is_club'] == 'club']
print(f"Repurchases in which 1st transaction was a club product: {len(club_first):,}")


In [None]:
# aux6['is_course_complete'].value_counts(dropna=False)
# aux6[aux6['user_buyer_id'] == 50079206]
# aux5[aux5['user_buyer_id'] == 50079206].sort_values('purchase_order_datetime')
# df3[df3['user_buyer_id'] == 50079206].sort_values('purchase_order_datetime')

In [None]:
# set df - answer 1st Q: club purchases that were followed by another purchase
club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
             'join_date', 'completion_course_date', 'progress_club', 'is_course_complete']
aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]

# count and share of repurchases per completion status
aux7 = aux6[['is_course_complete', 'purchase_id']].groupby(['is_course_complete']).count().reset_index()
repurchase_total = aux7['purchase_id'].sum()
aux7['%'] = aux7['purchase_id'] / repurchase_total * 100
# sample-size table; the per-office cell appends its own rows to it
n_transaction = pd.DataFrame({'local': 'Global', 'n_transaction': [repurchase_total]})

# plot graph with annotations
plt.figure(figsize = (10,10))
sns.set_style('white')
splot = sns.barplot(y = 'is_course_complete', x = '%', data = aux7, palette = palette_types, order = ['incomplete', 'completed']);

# write the percentage next to the end of every bar
for bar in splot.patches:
    bar_len = bar.get_width()
    splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                   xytext=(5,0), textcoords='offset points', ha='left', va='center')
plt.title("How many repurchases had a previous completed club course? (%)")
plt.yticks(fontsize =16)
plt.xlabel(f"% (Repurchases = {repurchase_total:,})")
plt.ylabel("")
plt.xlim(0, 100)
plt.show()

#### Overall (Completion Rate)

In [None]:
# Descriptive statistics of the progress rate of club purchases that preceded a repurchase.
get_descriptive_statistics(aux6.loc[:, ['progress_club']])

In [None]:
# Histogram of the progress rate of club purchases that preceded a repurchase.
aux6.hist(column = ['progress_club']);

In [None]:
# calculate bins - 5-point wide progress buckets covering 0 to 100
bins = list(np.arange(0,105, 5))

aux6['bin'] = pd.cut(aux6['progress_club'], bins = bins, include_lowest=True)
aux7 = aux6[['bin', 'purchase_id']].groupby(['bin']).count().reset_index()
bin_total = aux7['purchase_id'].sum()
aux7['%'] = aux7['purchase_id'] / bin_total * 100

# plot graph with annotations
label_grey = '#707780'
plt.figure(figsize = (15,25))
sns.set_style('white')
splot = sns.barplot(y = 'bin', x = '%', data = aux7, palette = palette_bins);

# write the percentage next to the end of every bar
for bar in splot.patches:
    bar_len = bar.get_width()
    splot.annotate(f" {bar_len:.2f}%", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                   xytext=(5,0), textcoords='offset points',
                   ha='left', va='center', fontsize = 50, color = label_grey)

title_text = "\n".join(["Progress Rate (Club)", "Transactions before repurchase (%)", f"N: {bin_total:,}", ""])
splot.set_title(title_text, color = label_grey, fontsize = 40, loc = 'left', pad=40)

plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 40)
# grey y labels, no frame, no x axis
splot.yaxis.label.set_color(label_grey)
splot.spines['left'].set_color(label_grey)
splot.tick_params(axis='y', colors=label_grey, labelsize=50, pad = 40)
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)

plt.show()

#### Producer Office

In [None]:
# get producer office - offices ordered by purchase volume, largest first
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
country_list = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
plt.figure(figsize = (30,10))

# one panel per office on a 2 x 3 grid
for n, d in enumerate(country_list):
    plt.subplot(2,3,n+1)
    # set df - pair every purchase of this office with the buyer's next purchase
    office_rows = aux4[aux4['user_office_name'] == d]
    by_buyer = office_rows.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux5 = office_rows.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1),
                              is_club2 = by_buyer['is_club'].shift(-1))
    aux5 = aux5[aux5['purchase_id2'].notnull()]
    aux5 = aux5.assign(is_course_complete = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'}))

    # set df - answer 1st Q
    club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
                 'join_date', 'progress_club', 'is_course_complete']
    aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]
    aux7 = aux6[['is_course_complete', 'purchase_id']].groupby(['is_course_complete']).count().reset_index()
    office_total = aux7['purchase_id'].sum()
    aux7['%'] = aux7['purchase_id'] / office_total * 100
    # record the sample size of this office next to the rows already in n_transaction
    n_transaction = pd.concat([n_transaction, pd.DataFrame({'local': d, 'n_transaction': [office_total]})])

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'is_course_complete', x = '%', data = aux7, palette = palette_types, order = ['incomplete', 'completed']);

    # write the percentage next to the end of every bar
    for bar in splot.patches:
        bar_len = bar.get_width()
        splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {office_total:,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # only the first panel of each grid row (n = 0 and n = 3) keeps its y axis
    splot.axes.get_yaxis().set_visible(n in (0, 3))
    if n == 4:
        plt.xlabel("%")
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.1)
plt.show()

In [None]:
def _completion_breakdown(purchases):
    """
    Count club purchases that were followed by another purchase, split by course status.

    purchases: transactions carrying user_buyer_id, purchase_id, purchase_order_datetime,
               is_club and is_course_complete (0 / 1), e.g. aux4 or a slice of it
    Returns a dataframe with one row per status ('incomplete' / 'completed'),
    its number of purchases and its share in '%'.
    """
    by_buyer = purchases.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    paired = purchases.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1))
    # keep club purchases that have a later purchase by the same buyer
    followed = paired[paired['purchase_id2'].notnull() & (paired['is_club'] == 'club')]
    status = followed['is_course_complete'].map({0: 'incomplete', 1: 'completed'})
    breakdown = followed[['purchase_id']].assign(is_course_complete = status)
    breakdown = breakdown[['is_course_complete', 'purchase_id']].groupby(['is_course_complete']).count().reset_index()
    breakdown['%'] = breakdown['purchase_id'] / breakdown['purchase_id'].sum() * 100
    return breakdown


def _completed_pct(breakdown):
    """
    Return the rounded share (%) of the 'completed' status in a completion breakdown.
    A breakdown without a 'completed' row has no completed course, i.e. 0%.
    """
    completed = breakdown.loc[breakdown['is_course_complete'] == 'completed', '%']
    return round(completed.iloc[0], 2) if len(completed) else 0.0


# get producer office - offices ordered by purchase volume, largest first
country_list = list(df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index().sort_values('purchase_id', ascending=False)['user_office_name'])

# completed share of every office; `compl` keeps the 'completed' rows as before
pct_by_office = {}
completed_rows = []
for d in country_list:
    aux7 = _completion_breakdown(aux4[aux4['user_office_name'] == d])
    aux8 = aux7[aux7['is_course_complete'] == 'completed'].copy()
    aux8['local'] = d
    completed_rows.append(aux8)
    pct_by_office[d] = _completed_pct(aux7)
compl = pd.concat(completed_rows) if completed_rows else pd.DataFrame()

# get correct % changes: https://davegiles.blogspot.com/2011/03/dummies-for-dummies.html
# The Global share is computed from all offices together (same pipeline as the
# "Overall" chart) instead of being typed in by hand, so it follows the data.
dict_country = {'Global': _completed_pct(_completion_breakdown(aux4))}
for label in ['Brazil', 'Colombia', 'Mexico', 'Spain', 'USA', 'Amsterdam']:
    # office names are upper-case in the data; an office absent from it raises KeyError
    dict_country[label] = pct_by_office[label.upper()]

# plot perc changes - Global first, offices by descending share
aux1 = pd.DataFrame(list(dict_country.items()), columns = ['local', '%'])
list_order = list(aux1.sort_values('%', ascending=False)['local'])
list_order.remove('Global')
list_order.insert(0, 'Global')
# get sample size
n_transaction['local'] = n_transaction['local'].str.capitalize()
n_transaction.loc[n_transaction['local'] == 'Usa', 'local'] = 'USA'
# drop_duplicates: re-running the office grid cell appends the same rows again,
# which would otherwise multiply the rows of aux1
aux1 = aux1.merge(n_transaction.drop_duplicates('local'), how = 'left', on = 'local')
n_transaction

In [None]:
# show the per-office table (local, %, n_transaction) that feeds the chart below
aux1

In [None]:
plt.figure(figsize = (10,15))
# plot graph with annotations - Global highlighted, every office in grey
label_grey = '#707780'
bar_names = ['Global', 'Brazil', 'Colombia', 'Mexico', 'Spain', 'USA', 'Amsterdam']
bar_colors = {name: ('#EF4E23' if name == 'Global' else '#9EA4AC') for name in bar_names}
sns.set_style('white')
splot = sns.barplot(y = 'local', x = '%', data = aux1, palette = bar_colors, order = list_order);

# Sample-size labels: name -> (xy, xytext). The coordinates are fixed by hand,
# so they only line up with the bars while list_order keeps the order they were set for.
n_label_spots = {
    'Global':    ((21, 0),   (36, 0.1)),
    'Spain':     ((31, 1.0), (47, 1.1)),
    'USA':       ((31, 2.0), (47, 2.1)),
    'Mexico':    ((28, 3.0), (45, 3.1)),
    'Colombia':  ((22, 4.0), (37, 4.1)),
    'Brazil':    ((21, 5.0), (37, 5.1)),
    'Amsterdam': ((16, 6.0), (33, 6.1)),
}
for n in list(aux1['local']):
    if n not in n_label_spots:
        continue
    anchor, text_spot = n_label_spots[n]
    sample_size = int(list(aux1[aux1['local'] == n]['n_transaction'])[0])
    splot.annotate(f"(N: {sample_size:,})",
                   xy = anchor, xytext = text_spot, color = label_grey, fontsize = 25)

# write the percentage next to the end of every bar
for bar in splot.patches:
    bar_len = bar.get_width()
    splot.annotate(f" {bar_len:.2f}%", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                   xytext=(5,0), textcoords='offset points',
                   ha='left', va='center', fontsize = 50, color = label_grey)


title_text = "\n".join(["How many repurchases", "had a previous completed club course? (%)", "(by producer office)"])
splot.set_title(title_text, color = label_grey, fontsize = 40, loc = 'left', pad=40)

plt.yticks(fontsize =50)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 40)
# grey y labels, no frame, no x axis
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color(label_grey)
splot.spines['left'].set_color(label_grey)
splot.tick_params(axis='y', colors=label_grey, labelsize=50, pad = 30)
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Producer Office (Completion Rate)

In [None]:
# get producer office - offices ordered by purchase volume, largest first
office_counts = df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index()
country_list = list(office_counts.sort_values('purchase_id', ascending=False)['user_office_name'])
plt.figure(figsize = (15,20))

# one panel per office on a 2 x 3 grid: progress-rate buckets of club purchases before a repurchase
for n, d in enumerate(country_list):
    plt.subplot(2,3,n+1)
    # set df - pair every purchase of this office with the buyer's next purchase
    office_rows = aux4[aux4['user_office_name'] == d]
    by_buyer = office_rows.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux5 = office_rows.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1),
                              is_club2 = by_buyer['is_club'].shift(-1))
    aux5 = aux5[aux5['purchase_id2'].notnull()]
    aux5 = aux5.assign(is_course_complete = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'}))

    # set df - answer 1st Q
    club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
                 'join_date', 'progress_club', 'is_course_complete']
    aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]
    # calculate bins - 5-point wide progress buckets covering 0 to 100
    bins = list(np.arange(0,105, 5))
    aux6 = aux6.assign(bin = pd.cut(aux6['progress_club'], bins = bins, include_lowest=True))
    aux7 = aux6[['bin', 'purchase_id']].groupby(['bin']).count().reset_index()
    office_total = aux7['purchase_id'].sum()
    aux7['%'] = aux7['purchase_id'] / office_total * 100
    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'bin', x = '%', data = aux7, palette = palette_bins);

    # write the percentage next to the end of every bar
    for bar in splot.patches:
        bar_len = bar.get_width()
        splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {office_total:,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # only the first panel of each grid row (n = 0 and n = 3) keeps its y axis
    splot.axes.get_yaxis().set_visible(n in (0, 3))
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(hspace=0.1)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producer Segment

In [None]:
# one panel per producer segment: completed vs. incomplete club course before a repurchase
plt.figure(figsize = (30,10))

for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1,4,n+1)
    # set df - pair every purchase of this segment with the buyer's next purchase
    segment_rows = aux4[aux4['segment'] == d]
    by_buyer = segment_rows.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux5 = segment_rows.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1),
                               is_club2 = by_buyer['is_club'].shift(-1))
    aux5 = aux5[aux5['purchase_id2'].notnull()]
    aux5 = aux5.assign(is_course_complete = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'}))

    # set df - answer 1st Q
    club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
                 'join_date', 'progress_club', 'is_course_complete']
    aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]
    aux7 = aux6[['is_course_complete', 'purchase_id']].groupby(['is_course_complete']).count().reset_index()
    segment_total = aux7['purchase_id'].sum()
    aux7['%'] = aux7['purchase_id'] / segment_total * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'is_course_complete', x = '%', data = aux7, palette = palette_types, order = ['incomplete', 'completed']);

    # write the percentage next to the end of every bar
    for bar in splot.patches:
        bar_len = bar.get_width()
        splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {segment_total:,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # only the first panel keeps its y axis; the x axis is hidden everywhere
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producer segment (Completion Rate)

In [None]:
# one panel per producer segment: progress-rate buckets of club purchases before a repurchase
plt.figure(figsize = (15,15))

for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1,4,n+1)
    # set df - pair every purchase of this segment with the buyer's next purchase
    segment_rows = aux4[aux4['segment'] == d]
    by_buyer = segment_rows.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux5 = segment_rows.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1),
                               is_club2 = by_buyer['is_club'].shift(-1))
    aux5 = aux5[aux5['purchase_id2'].notnull()]
    aux5 = aux5.assign(is_course_complete = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'}))

    # set df - answer 1st Q
    club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
                 'join_date', 'progress_club', 'is_course_complete']
    aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]
    # calculate bins - 5-point wide progress buckets covering 0 to 100
    bins = list(np.arange(0,105, 5))
    aux6 = aux6.assign(bin = pd.cut(aux6['progress_club'], bins = bins, include_lowest=True))
    aux7 = aux6[['bin', 'purchase_id']].groupby(['bin']).count().reset_index()
    segment_total = aux7['purchase_id'].sum()
    aux7['%'] = aux7['purchase_id'] / segment_total * 100
    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'bin', x = '%', data = aux7, palette = palette_bins);

    # write the percentage next to the end of every bar
    for bar in splot.patches:
        bar_len = bar.get_width()
        splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {segment_total:,}", fontdict = {'fontsize': 20})
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # only the first panel keeps its y axis; the x axis is hidden everywhere
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.2)
plt.show()

#### Producers < 10k BRL LTV

In [None]:
# one panel per LTV group (below / above 10k BRL): completed vs. incomplete club course before a repurchase
plt.figure(figsize = (30,10))

for n, d in enumerate(['below', 'above']):
    plt.subplot(1,2,n+1)
    # set df - pair every purchase of this LTV group with the buyer's next purchase
    ltv_rows = aux4[aux4['is_below_10_entire'] == d]
    by_buyer = ltv_rows.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux5 = ltv_rows.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1),
                           is_club2 = by_buyer['is_club'].shift(-1))
    aux5 = aux5[aux5['purchase_id2'].notnull()]
    aux5 = aux5.assign(is_course_complete = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'}))

    # set df - answer 1st Q
    club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
                 'join_date', 'progress_club', 'is_course_complete']
    aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]
    aux7 = aux6[['is_course_complete', 'purchase_id']].groupby(['is_course_complete']).count().reset_index()
    ltv_total = aux7['purchase_id'].sum()
    aux7['%'] = aux7['purchase_id'] / ltv_total * 100

    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'is_course_complete', x = '%', data = aux7, palette = palette_types, order = ['incomplete', 'completed']);

    # write the percentage next to the end of every bar
    for bar in splot.patches:
        bar_len = bar.get_width()
        splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {ltv_total:,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 120)
    # only the first panel keeps its y axis; the x axis is hidden everywhere
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.1)
plt.show()

#### Producers < 10k BRL LTV (Completion Rate)

In [None]:
# one panel per LTV group (below / above 10k BRL): progress-rate buckets of club purchases before a repurchase
plt.figure(figsize = (10,15))

for n, d in enumerate(['below', 'above']):
    plt.subplot(1,2,n+1)
    # set df - pair every purchase of this LTV group with the buyer's next purchase
    ltv_rows = aux4[aux4['is_below_10_entire'] == d]
    by_buyer = ltv_rows.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux5 = ltv_rows.assign(purchase_id2 = by_buyer['purchase_id'].shift(-1),
                           is_club2 = by_buyer['is_club'].shift(-1))
    aux5 = aux5[aux5['purchase_id2'].notnull()]
    aux5 = aux5.assign(is_course_complete = aux5['is_course_complete'].map({0:'incomplete', 1: 'completed'}))

    # set df - answer 1st Q
    club_cols = ['user_buyer_id', 'purchase_id', 'user_office_name', 'segment', 'is_below_10_entire',
                 'join_date', 'progress_club', 'is_course_complete']
    aux6 = aux5.loc[aux5['is_club'] == 'club', club_cols]
    # calculate bins - 5-point wide progress buckets covering 0 to 100
    bins = list(np.arange(0,105, 5))
    aux6 = aux6.assign(bin = pd.cut(aux6['progress_club'], bins = bins, include_lowest=True))
    aux7 = aux6[['bin', 'purchase_id']].groupby(['bin']).count().reset_index()
    ltv_total = aux7['purchase_id'].sum()
    aux7['%'] = aux7['purchase_id'] / ltv_total * 100
    # plot graph with annotations
    sns.set_style('white')
    splot = sns.barplot(y = 'bin', x = '%', data = aux7, palette = palette_bins);

    # write the percentage next to the end of every bar
    for bar in splot.patches:
        bar_len = bar.get_width()
        splot.annotate(f"{bar_len:.2f}", xy=(bar_len, bar.get_y() + bar.get_height()/2),
                       xytext=(5,0), textcoords='offset points', ha='left', va='center')
    plt.title(f"{d} | N: {ltv_total:,}")
    plt.yticks(fontsize =16)
    plt.xlabel("")
    plt.ylabel("")
    plt.xlim(0, 100)
    # only the first panel keeps its y axis; the x axis is hidden everywhere
    splot.axes.get_yaxis().set_visible(n == 0)
    splot.axes.get_xaxis().set_visible(False)
plt.subplots_adjust(wspace=0.25)
plt.show()

### Does the reorder rate increase with the size of the producer's portfolio?

- Producers must have at least 28 buyers (25th percentile)
- Portfolios above 99th percentile were removed

#### Overall

In [None]:
## https://stats.stackexchange.com/questions/297659/interpretation-of-betareg-coef

In [None]:
# Build one row per producer (reorder rate, portfolio size, GMV aggregates,
# segment, office), filter outliers and export the result to CSV.
# flag (producer, buyer) pairs with more than one purchase
aux1 = df3.copy()
aux2 = aux1[['purchase_id', 'producer_id', 'user_buyer_id']].groupby(['producer_id', 'user_buyer_id']).count().reset_index()
aux2['repurchase'] = aux2['purchase_id'].apply(lambda x : 1 if x > 1 else 0)
# get reorder rate by producer = repeat buyers / distinct buyers * 100
# (aux3 only holds producers with at least one repeat buyer, and the left merge
# starts from aux3, so producers without repeat buyers are absent from here on)
aux3 = aux2[aux2['repurchase'] == 1][['producer_id', 'user_buyer_id']].groupby(['producer_id']).count().reset_index()
aux3.columns = ['producer_id', 'buyers_repurchase_num']
aux4 = aux1[['producer_id', 'user_buyer_id']].groupby(['producer_id']).nunique().reset_index()
aux4.columns = ['producer_id', 'total_buyers']
aux5 = aux3.merge(aux4, how = 'left', on = 'producer_id')
aux5['reorder_rate'] = aux5['buyers_repurchase_num'] / aux5['total_buyers'] * 100
# calc number of distinct products per producer
aux6 = aux1[['producer_id', 'product_id']].groupby(['producer_id']).nunique().reset_index()
aux6.columns = ['producer_id', 'portfolio_size']
aux7 = aux5.merge(aux6, how = 'left', on = 'producer_id')
# total gmv per producer
aux8 = aux1[['producer_id', 'gmv_value_brl']].groupby('producer_id').sum().reset_index()
aux9 = aux7.merge(aux8, how = 'left', on = 'producer_id')
# get producer segment and office
aux10 = aux1[['producer_id', 'segmentation_final_name' , 'user_office_name']].drop_duplicates()
aux10 = aux9.merge(aux10, how = 'left', on = 'producer_id')
aux10 = aux10[aux10['total_buyers'] > np.percentile(aux10['total_buyers'], 25)] # keep only producers above the 25th percentile of total buyers (~28)
# median gmv_value_brl per producer
aux11 = aux1[['producer_id', 'gmv_value_brl']].groupby('producer_id').median().reset_index()
aux11.columns = ['producer_id', 'median']
aux11 = aux10.merge(aux11, how = 'left', on = 'producer_id')
# calc number of club products (0 when the producer has none)
aux12 = aux1[aux1['is_club'] == 'club'][['producer_id', 'product_id']].groupby(['producer_id']).nunique().reset_index()
aux12.columns = ['producer_id', 'count_club_products']
aux12 = aux11.merge(aux12, how = 'left', on = 'producer_id')
aux12['count_club_products'] = aux12['count_club_products'].fillna(0)
# get sum and avg gmv of the buyers who repurchase, per producer
aux13 = aux1[['gmv_value_brl', 'producer_id', 'user_buyer_id']].groupby(['producer_id', 'user_buyer_id']).sum().reset_index()
aux13 = aux13.merge(aux2, how = 'left', on = ['producer_id', 'user_buyer_id'])
aux13 = aux13[aux13['repurchase'] == 1].drop(['repurchase','purchase_id'], axis = 1)
aux13.columns = ['producer_id', 'user_buyer_id', 'sum_gmv']
aux14 = aux13[['producer_id', 'sum_gmv']].groupby(['producer_id']).sum().reset_index()
aux14.columns = ['producer_id', 'sum_repurchase']
aux15 = aux13[['producer_id', 'sum_gmv']].groupby(['producer_id']).mean().reset_index()
aux15.columns = ['producer_id', 'avg_gmv_repurchase']
aux12 = aux12.merge(aux14, how = 'left', on = 'producer_id').merge(aux15, how = 'left', on = 'producer_id')
# remove outliers by hard-coded producer id (original note: producer with 780 products)
aux12 = aux12[aux12['producer_id'] != 12595061]
aux12 = aux12[aux12['producer_id'] != 9944606] # too low gmv vs portfolio
# remove outliers - 100% reorder rate, portfolio size above the 99th percentile,
# and total gmv below the 25th percentile (each cut is computed on the rows
# that survived the previous one)
aux12 = aux12[aux12['reorder_rate'] != 100]
aux12 = aux12[aux12['portfolio_size'] <= np.percentile(aux12['portfolio_size'], 99)]
aux12 = aux12[aux12['gmv_value_brl'] >= np.percentile(aux12['gmv_value_brl'], 25)]
# flag producers by total gmv: 'above' when >= 10000, otherwise 'below'
aux12['is_above_10_entire'] = aux12['gmv_value_brl'].apply(lambda x : 'above' if x >= 10000 else 'below')
# default to_csv settings: the DataFrame index is written as the first column
aux12.to_csv('data/prod_portfolio.csv')

In [None]:
# descriptive statistics need numeric columns only: leave out the id and label columns
label_columns = ['producer_id', 'is_above_10_entire', 'segmentation_final_name', 'user_office_name']
get_descriptive_statistics(aux12.drop(columns=label_columns))

In [None]:
# the 50 producers with the highest reorder rate
aux12.sort_values(by='reorder_rate', ascending=False).iloc[:50]

In [None]:
# best performing producer (additional gmv)
# NOTE(review): all three numbers are hard-coded; presumably copied by hand from
# the sorted producer table - confirm their origin before reusing this result
(103338.3000 * 1.06) / 76.0870

In [None]:
# smoothed density of the reorder rate across producers
kde_options = dict(kind="kde", bw_adjust=2, height=6, aspect=3)
sns.displot(data=aux12, x="reorder_rate", **kde_options);

In [None]:
# Scatter plot with a fitted regression line: reorder rate against portfolio size.
axis_grey = '#707780'

plt.figure(figsize=(20, 10))
sns.set_style('white')
splot = sns.regplot(x='portfolio_size', y='reorder_rate', data=aux12, color='#EF4E23')

plt.xlabel("Producer's portfolio size\n(global)", fontsize=40, labelpad=30)
plt.ylabel("Reorder rate (%)", fontsize=40, labelpad=30)

# axis labels, frame and ticks all share the same grey
for axis_label in (splot.yaxis.label, splot.xaxis.label):
    axis_label.set_color(axis_grey)
for side in ('left', 'right', 'top', 'bottom'):
    splot.spines[side].set_color(axis_grey)
for which in ('y', 'x'):
    splot.tick_params(axis=which, colors=axis_grey, labelsize=40, pad=20)
plt.show()

In [None]:
# rank (Spearman) correlation between reorder rate and portfolio size
aux12.loc[:, ['reorder_rate', 'portfolio_size']].corr(method='spearman')

In [None]:
# do larger portfolios come with more buyers? scatter plus fitted line
sns.set_style('white')
sns.regplot(x='portfolio_size', y='total_buyers', data=aux12);

In [None]:
# Regression plot for every ordered pair of producer metrics, one panel per pair.
# The grid is sized from the metric list, so editing the list cannot run past
# the last subplot (the previous hard-coded `while n <= 49` guard could).
plt.figure(figsize = (35,20))
metrics = ['buyers_repurchase_num', 'total_buyers', 'reorder_rate', 'portfolio_size', 'gmv_value_brl', 'median', 'count_club_products']
# x varies by grid row, y by grid column (same panel order as before)
metric_pairs = [(f, d) for f in metrics for d in metrics]
for n, (f, d) in enumerate(metric_pairs, start=1):
    plt.subplot(len(metrics), len(metrics), n)
    sns.regplot(x= f, y = d, data = aux12)
plt.tight_layout()
plt.show()

In [None]:
# check whether repurchase could be under club/not-club products
# purchases per (buyer, club flag) pair
aux1 = df3.groupby(['user_buyer_id', 'is_club'])['purchase_id'].count().reset_index()
# drop the pairs with exactly one purchase
aux2 = aux1.loc[aux1['purchase_id'].ne(1)]
# distinct buyers left per club flag
aux2.groupby('is_club')[['user_buyer_id']].nunique()

In [None]:
# check whether repurchase could be under diff product category
# purchases per (buyer, product category) pair
aux1 = df3.groupby(['user_buyer_id', 'product_category'])['purchase_id'].count().reset_index()
# drop the pairs with exactly one purchase
aux2 = aux1.loc[aux1['purchase_id'].ne(1)]
# distinct buyers left per category, smallest first
aux2.groupby('product_category')[['user_buyer_id']].nunique().sort_values('user_buyer_id')

In [None]:
# reorder-rate distribution split by segment, office and GMV flag: one panel each
plt.figure(figsize=(20, 10))
groupings = ['segmentation_final_name', 'user_office_name', 'is_above_10_entire']
for position, grouping in enumerate(groupings, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(data=aux12, x='reorder_rate', y=grouping)
plt.tight_layout()
plt.show()

In [None]:
# Portfolio size against repeat-buyer count and reorder rate, one grid row per
# segment. The grid is sized from the two lists instead of a hard-coded
# `while n <= 8` guard, and each segment is filtered once rather than per panel.
plt.figure(figsize = (35,20))
segments = ['SEED', 'SMALL', 'MEDIUM', 'LARGE']
targets = ['buyers_repurchase_num', 'reorder_rate']
n = 0
for d in segments:
    segment_rows = aux12[aux12['segmentation_final_name'] == d]
    for f in targets:
        n += 1
        plt.subplot(len(segments), len(targets), n)
        plt.title(d)
        sns.regplot(x= 'portfolio_size', y = f, data = segment_rows)
plt.tight_layout()
plt.show()

In [None]:
# Portfolio size against repeat-buyer count and reorder rate, one grid row per office.
# Offices are read from aux12, the frame that is actually plotted, so every panel
# has data; the grid height follows the number of offices. The previous fixed
# 6x2 grid with `while n <= 12` raised on a 7th office and redrew panels when
# there were fewer than 6.
plt.figure(figsize = (20,30))
offices = aux12['user_office_name'].dropna().unique()
targets = ['buyers_repurchase_num', 'reorder_rate']
n = 0
for d in offices:
    office_rows = aux12[aux12['user_office_name'] == d]
    for f in targets:
        n += 1
        plt.subplot(len(offices), len(targets), n)
        plt.title(d)
        sns.regplot(x= 'portfolio_size', y = f, data = office_rows)
plt.tight_layout()
plt.show()

In [None]:
# aux11[aux11['user_office_name'] == 'MEXICO'].sort_values('portfolio_size', ascending=False).head(50)

#### Producer office

In [None]:
# Per-office version of the producer table: for every office found in df3, build
# one row per producer and write it to data/prod_portfolio_<office>.csv.
# Each pass overwrites aux1..aux12, so after the loop they hold the last office's data.
for d in df3['user_office_name'].unique():
    aux1 = df3[df3['user_office_name'] == d]
    # flag (producer, buyer) pairs with more than one purchase
    aux2 = aux1[['purchase_id', 'producer_id', 'user_buyer_id']].groupby(['producer_id', 'user_buyer_id']).count().reset_index()
    aux2['repurchase'] = aux2['purchase_id'].apply(lambda x : 1 if x > 1 else 0)
    # get reorder rate by producer = repeat buyers / distinct buyers * 100
    # (the left merge starts from aux3, so producers without repeat buyers are absent)
    aux3 = aux2[aux2['repurchase'] == 1][['producer_id', 'user_buyer_id']].groupby(['producer_id']).count().reset_index()
    aux3.columns = ['producer_id', 'buyers_repurchase_num']
    aux4 = aux1[['producer_id', 'user_buyer_id']].groupby(['producer_id']).nunique().reset_index()
    aux4.columns = ['producer_id', 'total_buyers']
    aux5 = aux3.merge(aux4, how = 'left', on = 'producer_id')
    aux5['reorder_rate'] = aux5['buyers_repurchase_num'] / aux5['total_buyers'] * 100
    # calc number of distinct products per producer
    aux6 = aux1[['producer_id', 'product_id']].groupby(['producer_id']).nunique().reset_index()
    aux6.columns = ['producer_id', 'portfolio_size']
    aux7 = aux5.merge(aux6, how = 'left', on = 'producer_id')
    # total gmv per producer
    aux8 = aux1[['producer_id', 'gmv_value_brl']].groupby('producer_id').sum().reset_index()
    aux9 = aux7.merge(aux8, how = 'left', on = 'producer_id')
    # get producer segment and office
    aux10 = aux1[['producer_id', 'segmentation_final_name' , 'user_office_name']].drop_duplicates()
    aux10 = aux9.merge(aux10, how = 'left', on = 'producer_id')
    aux10 = aux10[aux10['total_buyers'] > np.percentile(aux10['total_buyers'], 25)] # keep only producers above this office's 25th percentile of total buyers
    # median gmv_value_brl per producer
    aux11 = aux1[['producer_id', 'gmv_value_brl']].groupby('producer_id').median().reset_index()
    aux11.columns = ['producer_id', 'median']
    aux11 = aux10.merge(aux11, how = 'left', on = 'producer_id')
    # calc number of club products (0 when the producer has none)
    aux12 = aux1[aux1['is_club'] == 'club'][['producer_id', 'product_id']].groupby(['producer_id']).nunique().reset_index()
    aux12.columns = ['producer_id', 'count_club_products']
    aux12 = aux11.merge(aux12, how = 'left', on = 'producer_id')
    aux12['count_club_products'] = aux12['count_club_products'].fillna(0)
    # remove outliers by hard-coded producer id (original note: producer with 780 products)
    aux12 = aux12[aux12['producer_id'] != 12595061]
    aux12 = aux12[aux12['producer_id'] != 9944606] # too low gmv vs portfolio
    # remove outliers - 100% reorder rate, portfolio size above the 99th percentile,
    # and total gmv below the 25th percentile (each cut uses the rows left by the previous one)
    aux12 = aux12[aux12['reorder_rate'] != 100]
    aux12 = aux12[aux12['portfolio_size'] <= np.percentile(aux12['portfolio_size'], 99)]
    aux12 = aux12[aux12['gmv_value_brl'] >= np.percentile(aux12['gmv_value_brl'], 25)]
    # flag producers by total gmv: 'above' when >= 10000, otherwise 'below'
    aux12['is_above_10_entire'] = aux12['gmv_value_brl'].apply(lambda x : 'above' if x >= 10000 else 'below')
    # default to_csv settings: the DataFrame index is written as the first column
    aux12.to_csv(f'data/prod_portfolio_{d}.csv')

In [None]:
# scratch calculation: natural log of 1.02
np.log(1.02)

In [None]:
# scratch calculation: exponential of 0.041927, the value used for 'Global' in dict_country
np.exp(0.041927)

In [None]:
# get effects (upper bound beta regression)
# Hard-coded slope estimates per location.
# NOTE(review): presumably the portfolio-size coefficients of the beta regressions
# fitted outside this notebook - confirm the source.
beta_slopes = {
    'Global': 0.041927,
    'Brazil': 0.039029,
    'Colombia': 0.039590,
    'Mexico': 0.052041,
    'Spain': 0.06696,
    'USA': 0.07830,
    'Amsterdam': 0.05701,
}
# slope / 4, expressed in percent (looks like the "divide by 4" upper bound for
# logit-scale coefficients - TODO confirm)
dict_country = {local: (slope / 4) * 100 for local, slope in beta_slopes.items()}

In [None]:
# display the per-location effects (in %)
dict_country

In [None]:
# Horizontal bar chart of the per-location effects in dict_country:
# 'Global' on top and highlighted, the other locations sorted by effect size.
aux1 = pd.DataFrame(dict_country.items(), columns=['local', '%'])
ranked = aux1.sort_values('%', ascending=False)['local'].tolist()
ranked.remove('Global')
list_order = ['Global', *ranked]

highlight, muted, text_grey = '#EF4E23', '#9EA4AC', '#707780'
bar_colors = {'Global': highlight,
              **dict.fromkeys(['Brazil', 'Colombia', 'Mexico', 'Spain', 'USA', 'Amsterdam'], muted)}

plt.figure(figsize=(10, 15))
sns.set_style('white')
splot = sns.barplot(data=aux1, x='%', y='local', order=list_order, palette=bar_colors);

# value label to the right of every bar
for bar in splot.patches:
    bar_end = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    splot.annotate(f" {bar_end:.2f}%", xy=(bar_end, bar_middle),
                   xytext=(5, 0), textcoords='offset points',
                   ha='left', va='center', fontsize=50, color=text_grey)

splot.set_title("For each additional product in the\n"
                "portfolio, by how much\n"
                "the reorder rates increases\n"
                "for the best-performing producer¹?",
                color=text_grey, fontsize=40, loc='left', pad=30)

plt.yticks(fontsize=50)
plt.xlabel("")
plt.ylabel("")
plt.xlim(0, 3)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color(text_grey)
splot.spines['left'].set_color(text_grey)
splot.tick_params(axis='y', colors=text_grey, labelsize=50, pad=30)
# no frame and no x axis: the bar annotations carry the values
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.axes.get_xaxis().set_visible(False)
plt.show()

#### Set df for several scenarios, plot predictions for each additional product on producer's portfolio

In [None]:
# Write one scenario file per location: portfolio sizes 1..max_num, to be scored
# outside this notebook. 'Global' goes first, then every office found in df3.
max_num = 20
scenario_files = [('Global', 'data/global_portfolio.csv')]
scenario_files += [(office, f'data/global_portfolio_{office}.csv')
                   for office in df3['user_office_name'].unique()]

for local, csv_path in scenario_files:
    df = pd.DataFrame({'local': [local] * max_num,
                       'portfolio_size': np.arange(1, max_num + 1, 1)})
    df.to_csv(csv_path, index=False)

In [None]:
# colour per location for the prediction charts
palette_countries = {
    'Global': '#EF4E23',
    'Brazil': 'silver',
    'Colombia': 'darkgrey',
    'Mexico': 'dimgray',
    'USA': 'c',
    'Amsterdam': 'darkcyan',
    'Spain': 'black',
}

# load the prediction files (global first, then one per office) and stack them
file_suffixes = ['', '_BRAZIL', '_COLOMBIA', '_MEXICO', '_USA', '_AMSTERDAM', '_SPAIN']
glob, br, col, mx, us, ams, sp = [
    pd.read_csv(f'data/global_portfolio_predict{suffix}.csv') for suffix in file_suffixes
]
pred = pd.concat([glob, br, col, mx, us, ams, sp]).reset_index(drop=True)

# office codes -> display names used as palette keys
display_names = {
    'Global': 'Global',
    'BRAZIL': 'Brazil',
    'COLOMBIA': 'Colombia',
    'MEXICO': 'Mexico',
    'USA': 'USA',
    'AMSTERDAM': 'Amsterdam',
    'SPAIN': 'Spain',
}
pred['local'] = pred['local'].map(display_names)
# 'predict' is multiplied by 100 so the charts can label it in percent
pred['predict'] *= 100


In [None]:
# Grouped bar chart: predicted reorder rate for portfolio sizes 1-10, one bar per location.
text_grey = '#707780'

plt.figure(figsize=(30, 15))
sns.set_style('whitegrid')

up_to_ten = pred[pred['portfolio_size'] <= 10]
splot = sns.barplot(x='portfolio_size', y='predict', hue='local',
                    data=up_to_ten, palette=palette_countries)
splot.set_title("For each additional product in the portfolio, by how much\n"
                "does reorder rates increase?",
                color=text_grey, fontsize=50, loc='center', pad=55)

plt.ylabel('Predicted Reorder Rate (%)', fontsize=50, labelpad=20)
plt.xlabel('Additional product', fontsize=50, labelpad=20)
plt.yticks(np.arange(0, 14, 1.0), fontsize=50)
splot.tick_params(labelsize=30)
for axis_label in (splot.yaxis.label, splot.xaxis.label):
    axis_label.set_color(text_grey)
splot.tick_params(axis='y', colors=text_grey, labelsize=50, pad=10)
splot.tick_params(axis='x', colors=text_grey, labelsize=50, pad=20)
# legend anchored just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.02, 1), fancybox=True)
plt.show()

In [None]:
# Predicted reorder rate for portfolio sizes 1-10, 'Global' rows of pred only.
plt.figure(figsize = (30,15))

sns.set_style('whitegrid')
# a single bar colour is passed via `color`; `palette` is meant to be used with `hue`
splot= sns.barplot(data = pred[(pred['portfolio_size'] <= 10) & (pred['local'] == 'Global')],
                   x = 'portfolio_size', y = 'predict', color = '#EF4E23')

# value label above every bar
for p in splot.patches:
    splot.annotate(f"{p.get_height():.2f}%", (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center', fontsize=45, rotation=0, xytext=(0, 20),
                   textcoords='offset points', color =  '#707780')

splot.set_title("Predicted Reorder Rate (%), by portfolio size (Global)", color =  '#707780' , fontsize = 50, loc = 'center', pad=55)
plt.ylabel('Predicted Reorder Rate (%)', fontsize = 50,  labelpad = 20)
plt.xlabel('Portfolio Size', fontsize = 50, labelpad = 20)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.xaxis.label.set_color('#707780')

# the y axis and the frame are hidden: the bar labels carry the values
splot.axes.get_yaxis().set_visible(False)
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.tick_params(axis='y', colors='#707780',labelsize=50, pad = 10)
splot.tick_params(axis='x', colors='#707780',labelsize=50, pad = 30)
plt.show()

In [None]:
# Predicted reorder rate for portfolio sizes 1-10 for a single location.
# Change `country` to any display name present in pred['local'].
country = 'Brazil'
plt.figure(figsize = (30,15))

sns.set_style('whitegrid')
# a single bar colour is passed via `color`; `palette` is meant to be used with `hue`
splot= sns.barplot(data = pred[(pred['portfolio_size'] <= 10) & (pred['local'] == country)],
                   x = 'portfolio_size', y = 'predict', color = '#EF4E23')

# value label above every bar
for p in splot.patches:
    splot.annotate(f"{p.get_height():.2f}%", (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center', fontsize=45, rotation=0, xytext=(0, 20),
                   textcoords='offset points', color =  '#707780')

splot.set_title(f"Predicted Reorder Rate (%), by portfolio size ({country})", color =  '#707780' , fontsize = 50, loc = 'center', pad=55)
plt.ylabel('Predicted Reorder Rate (%)', fontsize = 50,  labelpad = 20)
plt.xlabel('Portfolio Size', fontsize = 50, labelpad = 20)
splot.tick_params(labelsize=30)
splot.yaxis.label.set_color('#707780')
splot.xaxis.label.set_color('#707780')

# the y axis and the frame are hidden: the bar labels carry the values
splot.axes.get_yaxis().set_visible(False)
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.tick_params(axis='y', colors='#707780',labelsize=50, pad = 10)
splot.tick_params(axis='x', colors='#707780',labelsize=50, pad = 30)
plt.show()

### Why do Large producers generate 2x more GMV than other segments?

##### Repurchase Count is not different among segments

In [None]:
# Descriptive statistics of the number of purchases per buyer, restricted to
# buyers with more than one purchase; one statistics row per producer segment.
category = ['LARGE', 'MEDIUM', 'SMALL', 'SEED']

# purchases per buyer within each segment (collected first, concatenated once)
per_segment = []
for d in category:
    purchases = (
        temp[temp['segment'] == d]
        .groupby('user_buyer_id')['purchase_id']
        .count()
        .reset_index(drop=True)
        .to_frame('repurchase_count')
    )
    purchases['segment'] = d
    per_segment.append(purchases)

count_repurchase = pd.concat(per_segment)
# keep repeat buyers only
count_repurchase = count_repurchase[count_repurchase['repurchase_count'] > 1]

# one single-column frame per segment, named after the segment so the name
# shows up in the statistics table
pd.concat([
    get_descriptive_statistics(
        count_repurchase.loc[count_repurchase['segment'] == d, ['repurchase_count']]
        .rename(columns={'repurchase_count': d})
    )
    for d in category
])

##### Avg. Ticket Price between buyers who repurchase vs don't repurchase is not different among segments

In [None]:
# Descriptive statistics of total GMV per buyer, split by producer segment and by
# whether the buyer purchased more than once within that segment.
category = ['LARGE', 'MEDIUM', 'SMALL', 'SEED']
buyer_types = ['repurchase', 'single-purchase']

per_segment = []
for d in category:
    segment_rows = temp[temp['segment'] == d]
    # number of purchases per buyer, attached to every purchase row
    purchases_per_buyer = (
        segment_rows.groupby('user_buyer_id')['purchase_id']
        .count()
        .rename('no_repurchase_temp')
        .reset_index()
    )
    segment_rows = segment_rows.merge(purchases_per_buyer, how = 'left', on = 'user_buyer_id')
    ### flag repurchase
    segment_rows['repurchase_temp'] = np.where(segment_rows['no_repurchase_temp'] > 1,
                                               'repurchase', 'single-purchase')
    # sum gmv for each buyer
    buyer_gmv = (
        segment_rows.groupby(['user_buyer_id', 'repurchase_temp', 'no_repurchase_temp'])['gmv_value_brl']
        .sum()
        .rename('sum_gmv')
        .reset_index()
    )
    buyer_gmv['segment'] = d
    per_segment.append(buyer_gmv)

avg_ticket = pd.concat(per_segment)
# remove buyers with zero gmv (copy: a column is added to the filtered frame below)
avg_ticket = avg_ticket[avg_ticket['sum_gmv'] > 0].copy()
avg_ticket['segment_type'] = avg_ticket['segment'] + '-' + avg_ticket['repurchase_temp']
avg_ticket = avg_ticket.reset_index(drop=True)

# one single-column frame per (segment, buyer type), named e.g. 'LARGE-repurchase'
# so the name shows up in the statistics table
segment_types = [f'{d}-{buyer_type}' for d in category for buyer_type in buyer_types]
pd.concat([
    get_descriptive_statistics(
        avg_ticket.loc[avg_ticket['segment_type'] == label, ['sum_gmv']]
        .rename(columns={'sum_gmv': label})
    )
    for label in segment_types
])

In [None]:
# # start = round(np.percentile(avg_ticket[['sum_gmv']], 25), 2)
# splot = sns.kdeplot(data = avg_ticket[avg_ticket['segment'] == 'LARGE'], x = 'sum_gmv', 
#         hue = 'repurchase_temp')
# plt.title(f'LARGE')
# plt.ylabel("")
# # plt.legend("")
# sns.move_legend(splot,  "upper left", bbox_to_anchor=(1, 1))
# plt.subplots_adjust(hspace=0.5)
# plt.subplots_adjust(wspace=1)
# plt.show()

In [None]:
# aux1 = avg_ticket.copy()
# # <= np.percentile(avg_ticket['sum_gmv'], 95)
# aux2 = aux1[aux1['segment_type'] == 'LARGE-repurchase']
# aux2 = aux2[aux2['sum_gmv'] <= np.percentile(aux2['sum_gmv'], 95)]
# aux3 = aux1[aux1['segment_type'] == 'LARGE-single-purchase']
# aux3 = aux3[aux3['sum_gmv'] <= np.percentile(aux3['sum_gmv'], 95)]

# aux4 = aux1[aux1['segment_type'] == 'MEDIUM-repurchase']
# aux4 = aux4[aux4['sum_gmv'] <= np.percentile(aux4['sum_gmv'], 95)]
# aux5 = aux1[aux1['segment_type'] == 'MEDIUM-single-purchase']
# aux5 = aux5[aux5['sum_gmv'] <= np.percentile(aux5['sum_gmv'], 95)]

# aux6 = aux1[aux1['segment_type'] == 'SMALL-repurchase']
# aux6 = aux6[aux6['sum_gmv'] <= np.percentile(aux6['sum_gmv'], 95)]
# aux7 = aux1[aux1['segment_type'] == 'SMALL-single-purchase']
# aux7 = aux7[aux7['sum_gmv'] <= np.percentile(aux7['sum_gmv'], 95)]
            
# aux8 = aux1[aux1['segment_type'] == 'SEED-repurchase']
# aux8 = aux8[aux8['sum_gmv'] <= np.percentile(aux8['sum_gmv'], 95)]
# aux9 = aux1[aux1['segment_type'] == 'SEED-single-purchase']
# aux9 = aux9[aux9['sum_gmv'] <= np.percentile(aux9['sum_gmv'], 95)]
# aux10 = pd.concat([aux2,aux3,aux4,aux5,aux6,aux7,aux8,aux9])
# get_descriptive_statistics(aux10[['gmv_value_brl']])

In [None]:
# purchases with positive GMV, trimmed strictly below their own 95th percentile
ff = df3.loc[df3['gmv_value_brl'] > 0, ['segment', 'gmv_value_brl']]
gmv_p95 = np.percentile(ff['gmv_value_brl'], 95)
ff = ff.loc[ff['gmv_value_brl'] < gmv_p95]

In [None]:
# plt.figure(figsize = (30,15))
# for d, n in enumerate(['LARGE', 'MEDIUM', 'SMALL', 'SEED']):
#     plt.subplot(4, 1, d+1)
#     sns.histplot(data = aux10[aux10['segment'] == n], x = 'sum_gmv', 
#                  palette = {'single-purchase': 'b', 'repurchase': 'darkorange'},
#                 hue = 'repurchase_temp', 
#                 element = 'step')
#     plt.title(f'{n}')
#     plt.ylabel('')
#     if d != 3:
#         plt.legend('')
# plt.show()

In [None]:
# # show boxplot
# sns.boxplot(data = avg_ticket, x = 'sum_gmv', y = 'segment_type');

##### Generate Table

In [None]:
# Build `reorder`, the per-segment summary table behind the charts that follow:
# reorder rate, share of GMV from repeat buyers, median GMV per buyer
# (repeat vs single-purchase), mean purchase count of repeat buyers and the
# overall median GMV. Rows are accumulated with pd.concat throughout;
# DataFrame.append is deprecated and was removed in pandas 2.0.
category = ['LARGE', 'MEDIUM', 'SMALL', 'SEED']
reorder = pd.DataFrame()
share_gmv = pd.DataFrame()
avg_gmv = pd.DataFrame()
avg_gmv_not = pd.DataFrame()
overall_gmv = pd.DataFrame()
avg_repurchase = pd.DataFrame()
avg_ticket = pd.DataFrame()  # reset here and not filled again in this cell

for d in category:
    aux1 = df3[(df3['segment'] == d)]
    ### median gmv over all purchase rows of the segment (the column is named 'overall_avg_gmv')
    global_med = pd.DataFrame({'segment' : [d], 'overall_avg_gmv': aux1['gmv_value_brl'].median()})
    overall_gmv = pd.concat([overall_gmv, global_med])
    ### flag repurchase: buyers with more than one purchase in this segment
    aux2 = aux1[['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    aux2.columns = ['user_buyer_id', 'no_repurchase_temp']
    aux1 = aux1.merge(aux2, how = 'left', on = 'user_buyer_id')
    aux1['repurchase_temp'] = aux1['no_repurchase_temp'].apply(lambda x : 'repurchase' if x > 1 else 'single-purchase')

    ### calculate gmv captured by users who repurchase
    share = aux1[['repurchase_temp', 'gmv_value_brl']].groupby('repurchase_temp').sum().reset_index()
    share['gmv_share_repurchase'] = share['gmv_value_brl'] / share['gmv_value_brl'].sum() * 100
    # copy: a column is added to the filtered frame on the next line
    share = share[share['repurchase_temp'] == 'repurchase'].copy()
    share['segment'] = d
    share.columns = ['repurchase', 'gmv_value_brl_repurchase', 'gmv_share_repurchase', 'segment']
    share = share[['segment', 'gmv_share_repurchase', 'gmv_value_brl_repurchase']]
    share_gmv = pd.concat([share_gmv, share])

    ## calculate median gmv from users who repurchase vs don't repurchase
    # sum gmv for each buyer
    aux2 = aux1[['user_buyer_id', 'repurchase_temp', 'gmv_value_brl']].groupby(['user_buyer_id', 'repurchase_temp']).sum().reset_index().drop('user_buyer_id', axis = 1)
    aux2.columns = ['repurchase_temp', 'sum_gmv']
    # remove buyers with zero gmv
    aux2 = aux2[aux2['sum_gmv'] > 0]
    avg_gmv = pd.concat([avg_gmv, pd.DataFrame({'segment' : [d], 'avg_gmv_repurchase': aux2[(aux2['repurchase_temp'] == 'repurchase')]['sum_gmv'].median()})])
    avg_gmv_not = pd.concat([avg_gmv_not, pd.DataFrame({'segment' : [d], 'avg_gmv_repurchase_not': aux2[(aux2['repurchase_temp'] == 'single-purchase')]['sum_gmv'].median()})])

    ## calculate mean purchase count of the users who repurchase
    mr = aux1[aux1['repurchase_temp'] == 'repurchase'][['user_buyer_id', 'purchase_id']].groupby('user_buyer_id').count().reset_index()
    avg_repurchase = pd.concat([avg_repurchase, pd.DataFrame({'segment': [d], 'mean_repurchase_count': mr['purchase_id'].mean()})])

    # get vol of users per purchase count
    rr = aux1[['no_repurchase_temp','repurchase_temp',  'user_buyer_id']].groupby(['repurchase_temp','no_repurchase_temp']).nunique().reset_index()
    rr['%'] = rr['user_buyer_id'] / rr['user_buyer_id'].sum() * 100
    rr = rr.sort_values('%', ascending=False)
    # get share of users who repurchase (the segment's reorder rate)
    rr2 = rr[['repurchase_temp', 'user_buyer_id']].groupby('repurchase_temp').sum().reset_index()
    rr2['%'] = rr2['user_buyer_id'] / rr2['user_buyer_id'].sum() * 100
    # copy: a column is added to the filtered frame on the next line
    rr3 = rr2[rr2['repurchase_temp'] == 'repurchase'][['%']].copy()
    rr3['segment'] = d
    rr3 = rr3[['segment', '%']]
    reorder = pd.concat([reorder, rr3])

# sort df
reorder = reorder.sort_values('%', ascending=False)
share_gmv = share_gmv.sort_values('gmv_share_repurchase', ascending=False)
avg_gmv = avg_gmv.sort_values('avg_gmv_repurchase', ascending=False)
avg_gmv_not = avg_gmv_not.sort_values('avg_gmv_repurchase_not', ascending=False)
avg_repurchase = avg_repurchase.sort_values('mean_repurchase_count', ascending=False)

# set main dataset: one row per segment, all metrics joined on 'segment'
reorder = (
    reorder
    .merge(share_gmv, how = 'left', on = 'segment')
    .merge(avg_gmv, how = 'left', on = 'segment')
    .merge(avg_gmv_not, how = 'left', on = 'segment')
    .merge(avg_repurchase, how = 'left', on = 'segment')
    .merge(overall_gmv, how = 'left', on = 'segment')
    .sort_values('%', ascending=False)
)
# repeat buyers vs single-purchase buyers: flag and relative difference (%) of the medians
reorder['median_lower'] = reorder.apply(lambda x: 1 if x['avg_gmv_repurchase'] < x['avg_gmv_repurchase_not'] else 0, axis = 1)
reorder['diff_median'] = (reorder['avg_gmv_repurchase'] - reorder['avg_gmv_repurchase_not']) / reorder['avg_gmv_repurchase_not'] * 100
reorder['diff_median_abs'] = abs(reorder['diff_median'])
# fixed display order: LARGE first, SEED last
reorder['order'] = reorder['segment'].map({'LARGE': 4, 'MEDIUM': 3, 'SMALL': 2, 'SEED': 1})
reorder = reorder.sort_values('order', ascending=False)

# set palette: LARGE highlighted in orange, the other segments in grey
palette_categories_analysis = {'LARGE': '#EF4E23', 'MEDIUM':'#9EA4AC', 'SMALL':'#9EA4AC', 'SEED':'#9EA4AC'}

# font sizes for the summary charts
fonttitle = 45
annotate_size = 45

In [None]:
# Five side-by-side horizontal bar charts (one metric per panel), all drawn in the segment
# order given by `category`. A former "Global Avg. Ticket Price (BRL)" panel on
# 'overall_avg_gmv' is disabled and therefore not part of the panel table below.
axis_grey = '#707780'


def _annotate_bars(ax, labels):
    """Write one text label just right of each bar, pairing bars and labels in draw order."""
    for bar, label in zip(ax.patches, labels):
        ax.annotate(label, xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                    xytext=(5, 0), textcoords='offset points',
                    ha='left', va='center', fontsize=annotate_size, color=axis_grey)


def _style_panel(ax, title, xlim, show_segments=False):
    """Apply the shared look: left-aligned grey title, fixed x range, no spines, no x axis.

    show_segments: keep the y tick labels (segment names); only the left-most panel does.
    """
    ax.set_title(title, color=axis_grey, fontsize=fonttitle, loc='left', pad=40)
    ax.set_ylabel("")
    ax.set_xlim(*xlim)
    ax.yaxis.label.set_color(axis_grey)
    ax.spines['left'].set_color(axis_grey)
    if show_segments:
        ax.tick_params(axis='y', colors=axis_grey, labelsize=50, pad=30)
    else:
        ax.axes.get_yaxis().set_visible(False)
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.axes.get_xaxis().set_visible(False)


# (subplot position, column of `reorder`, title, bar-label format, x limits)
panels = [
    (151, '%', "Reorder Rate (%)\n" + "\n" + " ", " %.2f%%", (0, 50)),
    (152, 'gmv_share_repurchase', "Generated GMV (%)\n" + "from buyers\n" + "who repurchase ", " %.2f%%", (0, 50)),
    (153, 'mean_repurchase_count', "Avg. Repurchase\n" + "Count\n" + "", " %.4f", (2, 3)),
    (154, 'avg_gmv_repurchase_not', "Avg. Ticket Price (BRL)\n" + "from buyers who\n" + "do not repurchase", " %.1f", (0, 1500)),
    (155, 'avg_gmv_repurchase', "Avg. Ticket Price (BRL)\n" + "from buyers who repurchase\n" + "", " %.1f", (0, 1500)),
]

# Bars are drawn in `category` order while `reorder` is sorted by its own 'order' column, so the
# relative difference is looked up by segment name instead of relying on both orders matching.
diff_by_segment = reorder.groupby('segment')['diff_median'].mean().reindex(list(category))

plt.figure(figsize = (43,40))
for position, column, title, bar_format, xlim in panels:
    plt.subplot(position)
    sns.set_style('white')
    splot = sns.barplot(y = 'segment', x = column, data = reorder,
                        palette = palette_categories_analysis, order = category)

    labels = [bar_format % bar.get_width() for bar in splot.patches]
    if column == 'avg_gmv_repurchase':
        # NOTE(review): diff_median is a relative difference in percent, yet it is labelled
        # "p.p" (percentage points) - confirm the intended unit before changing the label.
        labels = [label + ' ' + "(%.1f p.p)" % diff for label, diff in zip(labels, diff_by_segment)]

    _annotate_bars(splot, labels)
    _style_panel(splot, title, xlim, show_segments = (position == 151))

plt.subplots_adjust(wspace=0.05)
plt.show()

## Time

### What is the mean time to 2nd purchase? (by producer office, producer segment, producers below 10k BRL LTV)

#### Overall

- In this analysis, we are considering all repurchases. Repurchases with delta < 1d were removed.

In [None]:
# Pair every purchase with the same buyer's next purchase (chronological order) and keep only
# the rows that have a following purchase, i.e. one row per repurchase.
# NOTE(review): the section text says repurchases with delta < 1d were removed; no such filter
# is applied in this cell - confirm it happens upstream when df3 is built.
aux1 = df3.copy()
purchases_by_buyer = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = purchases_by_buyer['purchase_id'].shift(-1)
aux1['2nd_purchase'] = purchases_by_buyer['purchase_order_datetime'].shift(-1)
# explicit copy: new columns are added below and must not be written to a slice
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# time between a purchase and the buyer's next one, in days and in seconds
purchase_gap = aux1['2nd_purchase'] - aux1['purchase_order_datetime']
aux1['delta_days'] = purchase_gap.dt.total_seconds() / 60 / 60 /24
aux1['delta_sec'] = purchase_gap.dt.total_seconds()

# median gap per buyer, and the median of those per-buyer medians as the split threshold
aux2 = aux1.groupby('user_buyer_id')['delta_days'].median().reset_index(name = 'median_delta_days_user')
median_threshold = np.percentile(aux2[['median_delta_days_user']], 50)

# attach the per-buyer median to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag buyers: 'above' when their median gap is >= the threshold, otherwise 'below'
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')

# per-buyer medians vs. raw per-repurchase gaps
pd.concat([get_descriptive_statistics(aux2[['median_delta_days_user']]), get_descriptive_statistics(aux3[['delta_days']])])

In [None]:
# Descriptive statistics of the per-buyer median gap (in days) between consecutive purchases.
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Lower tail of the per-buyer median time to repurchase.
for pct in (5, 10, 15):
    pct_value = np.percentile(aux2[['median_delta_days_user']], pct)
    print(f"{pct}th percentile: {pct_value:.2f} days")

In [None]:
# Density of the per-buyer median time to repurchase, with the overall median marked.
sns.set_style('white')
splot = sns.kdeplot(data = aux2, x = 'median_delta_days_user',
                    color = '#EF4E23', fill = True, alpha = .1, linewidth = 2)

splot.set_title("Median time to repurchase (days) per user",
                color = '#707780', fontsize = 50, loc = 'center', pad = 70)
splot.set_xlabel('Median (days)', fontsize = 50, labelpad = 20)

# grey labels and ticks; the axis-specific calls override the generic label size
splot.tick_params(labelsize = 30)
for axis_obj in (splot.yaxis, splot.xaxis):
    axis_obj.label.set_color('#707780')

# density values are not shown: hide the y axis and every spine
splot.axes.get_yaxis().set_visible(False)
for side in ('right', 'top', 'left', 'bottom'):
    splot.spines[side].set_visible(False)
splot.tick_params(axis = 'y', colors = '#707780', labelsize = 50, pad = 10)
splot.tick_params(axis = 'x', colors = '#707780', labelsize = 50, pad = 30)

# vertical line at the median plus a text box at a fixed data position (x=70, y=0.01)
overall_median_days = aux2['median_delta_days_user'].median()
label_median = f"median: {overall_median_days:.2f}d"
splot.axvline(x = overall_median_days, label = label_median, color = '#707780')
splot.text(70, 0.01, label_median, color = 'white', fontsize = 30,
           bbox = dict(boxstyle = 'square', facecolor = 'darkcyan', alpha = 1, pad = 0.8))

plt.show()

In [None]:
# Histogram (with KDE) restricted to buyers at or below the 25th percentile of the
# per-buyer median time to repurchase.
sns.set_style('white')
p25_days = np.percentile(aux2[['median_delta_days_user']], 25)
fastest_buyers = aux2[aux2['median_delta_days_user'] <= p25_days]
sns.histplot(data = fastest_buyers, x = 'median_delta_days_user', kde = True, color = 'darkcyan')
plt.title('Time to repurchase (on avg) per user, below 25th perc.')
plt.xlabel('median delta (days)')
plt.ylabel('customers')
plt.show()

#### Overall - GMV from users who repurchase above median time vs below

In [None]:
# One 'above'/'below' flag per buyer, joined onto every purchase in df3
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how = 'inner', on = 'user_buyer_id')[['user_buyer_id', 'class', 'gmv_value_brl']]

# total GMV per buyer whose median gap is below the threshold
aux5 = (aux4.loc[aux4['class'] == 'below']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'below'))
# total GMV per buyer whose median gap is at or above the threshold
aux6 = (aux4.loc[aux4['class'] == 'above']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'above'))

pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Total GMV per buyer and class, plotted as histograms over consecutive percentile buckets:
# (-inf, p5], (p5, p10], (p10, p25], (p25, p50], (p50, p75], (p75, p95].
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

gmv_percentiles = [5, 10, 25, 50, 75, 95]
gmv_cutoffs = np.percentile(aux7['gmv_value_brl'], gmv_percentiles)

plt.figure(figsize = (30,10))
prev_n = None
for d, (pct, n) in enumerate(zip(gmv_percentiles, gmv_cutoffs)):
    plt.subplot(3,2,d+1)
    if d == 0:
        aux8 = aux7[(aux7['gmv_value_brl'] <= n)]
        bucket = f"0-{n:.2f}"
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
        # only the last bucket's upper bound is printed with a thousands separator
        upper = f"{n:,.2f}" if d == len(gmv_percentiles) - 1 else f"{n:.2f}"
        bucket = f"{prev_n:.2f}-{upper}"

    sns.set_style('white')
    splot = sns.histplot(data = aux8, x = 'gmv_value_brl', hue = 'class',
                         palette = {'below': 'darkcyan', 'above': 'darkorange'}, element = 'step')

    # the label comes from the percentile actually used (the last panel was titled "90th"
    # although its upper bound is the 95th percentile)
    plt.title(f"{pct}th perc. ({bucket} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")

    # 3x2 grid: x label on the bottom row only, y label on the left column only
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')
    prev_n = n

plt.subplots_adjust(wspace=0.1)
plt.subplots_adjust(hspace=0.55)
plt.show()

#### Producer office

In [None]:
# Producer offices ordered by number of purchases, busiest first
country_list = list(df3[['user_office_name', 'purchase_id']].groupby('user_office_name').count().reset_index().sort_values('purchase_id', ascending=False)['user_office_name'])

# 3 panels per row; at least 2 rows (the original fixed layout), more when there are > 6 offices
grid_cols = 3
grid_rows = max(2, math.ceil(len(country_list) / grid_cols))

plt.figure(figsize = (25,15))
office_stats = []  # one descriptive-statistics frame per office, concatenated after the loop
for n, d in enumerate(country_list):
    plt.subplot(grid_rows, grid_cols, n+1)

    # purchases of this office paired with the same buyer's next purchase; keep repurchases only.
    # .copy() so the new columns are written to independent frames, not to slices of df3.
    aux1 = df3[df3['user_office_name'] == d].copy()
    purchases_by_buyer = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux1['purchase_id2'] = purchases_by_buyer['purchase_id'].shift(-1)
    aux1['2nd_purchase'] = purchases_by_buyer['purchase_order_datetime'].shift(-1)
    aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

    # time to the next purchase, in days and in seconds
    purchase_gap = aux1['2nd_purchase'] - aux1['purchase_order_datetime']
    aux1['delta_days'] = purchase_gap.dt.total_seconds() / 60 / 60 /24
    aux1['delta_sec'] = purchase_gap.dt.total_seconds()

    # median gap per buyer
    aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
    aux2.columns = ['user_buyer_id', 'median_delta_days_user']
    median_threshold = np.percentile(aux2[['median_delta_days_user']], 50)

    # same data with the value column named after the office, so the stats table is labelled by office
    aux3 = aux2.copy()
    aux3.columns = ['user_buyer_id', d]
    office_stats.append(get_descriptive_statistics(aux3[[d]]))

    sns.set_style('white')
    splot = sns.kdeplot(data = aux2, x = 'median_delta_days_user', color = '#EF4E23',
                        fill=True, alpha=.1, linewidth=2)
    plt.title(f"{d} | N: {(len(aux2)):,}")

    office_median_days = aux2['median_delta_days_user'].median()
    label_median = f"median ({office_median_days:.1f}d)"
    plt.axvline(x=office_median_days, label = label_median, color = 'black')
    plt.legend()

    # y label on the left column, x label on the bottom row
    plt.xlabel('median delta (days)' if n >= (grid_rows - 1) * grid_cols else '')
    plt.ylabel('buyers' if n % grid_cols == 0 else '')

# DataFrame.append was removed in pandas 2.0; build the table with a single concat instead
descr_stats = pd.concat(office_stats) if office_stats else pd.DataFrame()

plt.subplots_adjust(hspace=0.2)
plt.subplots_adjust(wspace=0.3)
plt.show()


In [None]:
# Descriptive statistics of the per-buyer median time to repurchase (days),
# stacked per producer office in the order of country_list
descr_stats

#### Producer office - GMV from users who repurchase above median time vs below

##### BRAZIL

In [None]:
# Repurchases of buyers for one producer office: pair each purchase with the buyer's next one
name = 'BRAZIL'
# .copy() so the new columns are written to independent frames, not to slices of df3
aux1 = df3[df3['user_office_name'] == name].copy()
purchases_by_buyer = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = purchases_by_buyer['purchase_id'].shift(-1)
aux1['2nd_purchase'] = purchases_by_buyer['purchase_order_datetime'].shift(-1)
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# time to the next purchase, in days and in seconds
purchase_gap = aux1['2nd_purchase'] - aux1['purchase_order_datetime']
aux1['delta_days'] = purchase_gap.dt.total_seconds() / 60 / 60 /24
aux1['delta_sec'] = purchase_gap.dt.total_seconds()

# median gap per buyer; the median of those medians is the above/below threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2[['median_delta_days_user']], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach the per-buyer median to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag buyers: 'above' when their median gap is >= the threshold, otherwise 'below'
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# One 'above'/'below' flag per buyer, joined onto df3, then restricted to the current office
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how = 'inner', on = 'user_buyer_id')
aux4 = aux4.loc[aux4['user_office_name'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# total GMV per buyer whose median gap is below the threshold
aux5 = (aux4.loc[aux4['class'] == 'below']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'below'))
# total GMV per buyer whose median gap is at or above the threshold
aux6 = (aux4.loc[aux4['class'] == 'above']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'above'))

pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Total GMV per buyer and class for the current office, plotted as histograms over consecutive
# percentile buckets: (-inf, p5], (p5, p10], (p10, p25], (p25, p50], (p50, p75], (p75, p95].
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

gmv_percentiles = [5, 10, 25, 50, 75, 95]
gmv_cutoffs = np.percentile(aux7['gmv_value_brl'], gmv_percentiles)

plt.figure(figsize = (30,10))
prev_n = None
for d, (pct, n) in enumerate(zip(gmv_percentiles, gmv_cutoffs)):
    plt.subplot(3,2,d+1)
    if d == 0:
        aux8 = aux7[(aux7['gmv_value_brl'] <= n)]
        bucket = f"0-{n:.2f}"
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
        # only the last bucket's upper bound is printed with a thousands separator
        upper = f"{n:,.2f}" if d == len(gmv_percentiles) - 1 else f"{n:.2f}"
        bucket = f"{prev_n:.2f}-{upper}"

    sns.set_style('white')
    splot = sns.histplot(data = aux8, x = 'gmv_value_brl', hue = 'class',
                         palette = {'below': 'darkcyan', 'above': 'darkorange'}, element = 'step')

    # the label comes from the percentile actually used (the last panel was titled "90th"
    # although its upper bound is the 95th percentile)
    plt.title(f"{pct}th perc. ({bucket} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")

    # keep a single legend (second panel, moved outside the axes) and drop the others;
    # a bucket without data has no legend, so guard against None
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5),
                            title='Time to repurchase vs. median')
        else:
            legend.remove()

    # 3x2 grid: x label on the bottom row only, y label on the left column only
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')
    prev_n = n

plt.subplots_adjust(wspace=0.1)
plt.subplots_adjust(hspace=0.55)
plt.show()

##### SPAIN

In [None]:
# Repurchases of buyers for one producer office: pair each purchase with the buyer's next one
name = 'SPAIN'
# .copy() so the new columns are written to independent frames, not to slices of df3
aux1 = df3[df3['user_office_name'] == name].copy()
purchases_by_buyer = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = purchases_by_buyer['purchase_id'].shift(-1)
aux1['2nd_purchase'] = purchases_by_buyer['purchase_order_datetime'].shift(-1)
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# time to the next purchase, in days and in seconds
purchase_gap = aux1['2nd_purchase'] - aux1['purchase_order_datetime']
aux1['delta_days'] = purchase_gap.dt.total_seconds() / 60 / 60 /24
aux1['delta_sec'] = purchase_gap.dt.total_seconds()

# median gap per buyer; the median of those medians is the above/below threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2[['median_delta_days_user']], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach the per-buyer median to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag buyers: 'above' when their median gap is >= the threshold, otherwise 'below'
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# One 'above'/'below' flag per buyer, joined onto df3, then restricted to the current office
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how = 'inner', on = 'user_buyer_id')
aux4 = aux4.loc[aux4['user_office_name'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# total GMV per buyer whose median gap is below the threshold
aux5 = (aux4.loc[aux4['class'] == 'below']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'below'))
# total GMV per buyer whose median gap is at or above the threshold
aux6 = (aux4.loc[aux4['class'] == 'above']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'above'))

pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Total GMV per buyer and class for the current office, plotted as histograms over consecutive
# percentile buckets: (-inf, p5], (p5, p10], (p10, p25], (p25, p50], (p50, p75], (p75, p95].
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

gmv_percentiles = [5, 10, 25, 50, 75, 95]
gmv_cutoffs = np.percentile(aux7['gmv_value_brl'], gmv_percentiles)

plt.figure(figsize = (30,10))
prev_n = None
for d, (pct, n) in enumerate(zip(gmv_percentiles, gmv_cutoffs)):
    plt.subplot(3,2,d+1)
    if d == 0:
        aux8 = aux7[(aux7['gmv_value_brl'] <= n)]
        bucket = f"0-{n:.2f}"
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
        # only the last bucket's upper bound is printed with a thousands separator
        upper = f"{n:,.2f}" if d == len(gmv_percentiles) - 1 else f"{n:.2f}"
        bucket = f"{prev_n:.2f}-{upper}"

    sns.set_style('white')
    splot = sns.histplot(data = aux8, x = 'gmv_value_brl', hue = 'class',
                         palette = {'below': 'darkcyan', 'above': 'darkorange'}, element = 'step')

    # the label comes from the percentile actually used (the last panel was titled "90th"
    # although its upper bound is the 95th percentile)
    plt.title(f"{pct}th perc. ({bucket} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")

    # keep a single legend (second panel, moved outside the axes) and drop the others;
    # a bucket without data has no legend, so guard against None
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5),
                            title='Time to repurchase vs. median')
        else:
            legend.remove()

    # 3x2 grid: x label on the bottom row only, y label on the left column only
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')
    prev_n = n

plt.subplots_adjust(wspace=0.1)
plt.subplots_adjust(hspace=0.55)
plt.show()

##### AMSTERDAM

In [None]:
# Repurchases of buyers for one producer office: pair each purchase with the buyer's next one
name = 'AMSTERDAM'
# .copy() so the new columns are written to independent frames, not to slices of df3
aux1 = df3[df3['user_office_name'] == name].copy()
purchases_by_buyer = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = purchases_by_buyer['purchase_id'].shift(-1)
aux1['2nd_purchase'] = purchases_by_buyer['purchase_order_datetime'].shift(-1)
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# time to the next purchase, in days and in seconds
purchase_gap = aux1['2nd_purchase'] - aux1['purchase_order_datetime']
aux1['delta_days'] = purchase_gap.dt.total_seconds() / 60 / 60 /24
aux1['delta_sec'] = purchase_gap.dt.total_seconds()

# median gap per buyer; the median of those medians is the above/below threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2[['median_delta_days_user']], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach the per-buyer median to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag buyers: 'above' when their median gap is >= the threshold, otherwise 'below'
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# One 'above'/'below' flag per buyer, joined onto df3, then restricted to the current office
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how = 'inner', on = 'user_buyer_id')
aux4 = aux4.loc[aux4['user_office_name'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# total GMV per buyer whose median gap is below the threshold
aux5 = (aux4.loc[aux4['class'] == 'below']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'below'))
# total GMV per buyer whose median gap is at or above the threshold
aux6 = (aux4.loc[aux4['class'] == 'above']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'above'))

pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Total GMV per buyer and class for the current office, plotted as histograms over consecutive
# percentile buckets: (-inf, p5], (p5, p10], (p10, p25], (p25, p50], (p50, p75], (p75, p95].
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

gmv_percentiles = [5, 10, 25, 50, 75, 95]
gmv_cutoffs = np.percentile(aux7['gmv_value_brl'], gmv_percentiles)

plt.figure(figsize = (30,10))
prev_n = None
for d, (pct, n) in enumerate(zip(gmv_percentiles, gmv_cutoffs)):
    plt.subplot(3,2,d+1)
    if d == 0:
        aux8 = aux7[(aux7['gmv_value_brl'] <= n)]
        bucket = f"0-{n:.2f}"
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
        # only the last bucket's upper bound is printed with a thousands separator
        upper = f"{n:,.2f}" if d == len(gmv_percentiles) - 1 else f"{n:.2f}"
        bucket = f"{prev_n:.2f}-{upper}"

    sns.set_style('white')
    splot = sns.histplot(data = aux8, x = 'gmv_value_brl', hue = 'class',
                         palette = {'below': 'darkcyan', 'above': 'darkorange'}, element = 'step')

    # the label comes from the percentile actually used (the last panel was titled "90th"
    # although its upper bound is the 95th percentile)
    plt.title(f"{pct}th perc. ({bucket} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")

    # keep a single legend (second panel, moved outside the axes) and drop the others;
    # a bucket without data has no legend, so guard against None
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5),
                            title='Time to repurchase vs. median')
        else:
            legend.remove()

    # 3x2 grid: x label on the bottom row only, y label on the left column only
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')
    prev_n = n

plt.subplots_adjust(wspace=0.1)
plt.subplots_adjust(hspace=0.55)
plt.show()

##### USA

In [None]:
# Repurchases of buyers for one producer office: pair each purchase with the buyer's next one
name = 'USA'
# .copy() so the new columns are written to independent frames, not to slices of df3
aux1 = df3[df3['user_office_name'] == name].copy()
purchases_by_buyer = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = purchases_by_buyer['purchase_id'].shift(-1)
aux1['2nd_purchase'] = purchases_by_buyer['purchase_order_datetime'].shift(-1)
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# time to the next purchase, in days and in seconds
purchase_gap = aux1['2nd_purchase'] - aux1['purchase_order_datetime']
aux1['delta_days'] = purchase_gap.dt.total_seconds() / 60 / 60 /24
aux1['delta_sec'] = purchase_gap.dt.total_seconds()

# median gap per buyer; the median of those medians is the above/below threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2[['median_delta_days_user']], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach the per-buyer median to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag buyers: 'above' when their median gap is >= the threshold, otherwise 'below'
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# One 'above'/'below' flag per buyer, joined onto df3, then restricted to the current office
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how = 'inner', on = 'user_buyer_id')
aux4 = aux4.loc[aux4['user_office_name'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# total GMV per buyer whose median gap is below the threshold
aux5 = (aux4.loc[aux4['class'] == 'below']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'below'))
# total GMV per buyer whose median gap is at or above the threshold
aux6 = (aux4.loc[aux4['class'] == 'above']
            .groupby('user_buyer_id')['gmv_value_brl'].sum()
            .reset_index(name = 'above'))

pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

##### MEXICO

In [None]:
# Repurchase intervals for one buyer office: pair every purchase with the same
# buyer's next purchase, measure the time between them, and flag each buyer as
# 'above' / 'below' the median of the per-buyer median intervals.
name = 'MEXICO'
# explicit copy: columns are added below, and assigning to a filtered slice of
# df3 is the chained-assignment pattern pandas warns about
aux1 = df3[df3['user_office_name'] == name].copy()

# next purchase of the same buyer in chronological order; the shifted values are
# aligned back onto aux1 through the row index
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# a buyer's last purchase has no successor, so it is not a repurchase row
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the following one
elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
aux1['delta_sec'] = elapsed_sec

# median interval per buyer, then the 50th percentile of those medians as threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach each buyer's median interval to the repurchase rows and flag the buyer
aux3 = aux1.merge(aux2, how='inner', on='user_buyer_id')
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per buyer for the selected office, described separately for buyers
# flagged 'below' and 'above' the median repurchase interval.
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how='inner', on='user_buyer_id')
aux4 = aux4.loc[aux4['user_office_name'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# buyers below the threshold: summed GMV per buyer
aux5 = (aux4.loc[aux4['class'] == 'below', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'below'}))
# buyers above the threshold: summed GMV per buyer
aux6 = (aux4.loc[aux4['class'] == 'above', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'above'}))

below_stats = get_descriptive_statistics(aux5[['below']])
above_stats = get_descriptive_statistics(aux6[['above']])
pd.concat([below_stats, above_stats])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

##### COLOMBIA

In [None]:
# Repurchase intervals for one buyer office: pair every purchase with the same
# buyer's next purchase, measure the time between them, and flag each buyer as
# 'above' / 'below' the median of the per-buyer median intervals.
name = 'COLOMBIA'
# explicit copy: columns are added below, and assigning to a filtered slice of
# df3 is the chained-assignment pattern pandas warns about
aux1 = df3[df3['user_office_name'] == name].copy()

# next purchase of the same buyer in chronological order; the shifted values are
# aligned back onto aux1 through the row index
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# a buyer's last purchase has no successor, so it is not a repurchase row
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the following one
elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
aux1['delta_sec'] = elapsed_sec

# median interval per buyer, then the 50th percentile of those medians as threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach each buyer's median interval to the repurchase rows and flag the buyer
aux3 = aux1.merge(aux2, how='inner', on='user_buyer_id')
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per buyer for the selected office, described separately for buyers
# flagged 'below' and 'above' the median repurchase interval.
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how='inner', on='user_buyer_id')
aux4 = aux4.loc[aux4['user_office_name'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# buyers below the threshold: summed GMV per buyer
aux5 = (aux4.loc[aux4['class'] == 'below', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'below'}))
# buyers above the threshold: summed GMV per buyer
aux6 = (aux4.loc[aux4['class'] == 'above', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'above'}))

below_stats = get_descriptive_statistics(aux5[['below']])
above_stats = get_descriptive_statistics(aux6[['above']])
pd.concat([below_stats, above_stats])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

#### Producer segment

In [None]:
# Distribution of the per-buyer median repurchase interval, one KDE panel per
# producer segment; the descriptive statistics of each segment are gathered
# into descr_stats.
sns.set_style('white')  # before any axes exist so every panel picks it up
plt.figure(figsize=(30, 10))
segment_stats = []

for n, d in enumerate(['SEED', 'SMALL', 'MEDIUM', 'LARGE']):
    plt.subplot(1, 4, n + 1)
    # explicit copy: columns are added below, and assigning to a filtered slice
    # of df3 is the chained-assignment pattern pandas warns about
    aux1 = df3[df3['segment'] == d].copy()
    # next purchase of the same buyer in chronological order (index-aligned)
    by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
    aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
    # keep only purchases that were followed by another one
    aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

    # delta between a purchase and the following one
    elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
    aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
    aux1['delta_sec'] = elapsed_sec

    # median delta per buyer
    aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
    aux2.columns = ['user_buyer_id', 'median_delta_days_user']
    # the statistics column is named after the segment
    aux3 = aux2.rename(columns={'median_delta_days_user': d})
    segment_stats.append(get_descriptive_statistics(aux3[[d]]))

    splot = sns.kdeplot(data=aux2, x='median_delta_days_user', color='#EF4E23',
                        fill=True, alpha=.1, linewidth=2)
    plt.title(f"{d} | N: {(len(aux2)):,}")

    # vertical marker at the segment's median of per-buyer medians
    segment_median = aux2['median_delta_days_user'].median()
    plt.axvline(x=segment_median, label=f"median ({segment_median:.1f}d)", color='black')
    plt.legend()
    plt.xlabel('median delta (days)')
    # y label on the first panel only
    plt.ylabel('buyers' if n == 0 else '')

# DataFrame.append no longer exists in pandas 2.x; combine the frames in one go
descr_stats = pd.concat(segment_stats)

plt.subplots_adjust(wspace=0.4)
plt.show()

In [None]:
# Descriptive statistics of the per-buyer median time to repurchase (days),
# as accumulated per producer segment by the plotting loop that fills descr_stats.
descr_stats

#### Producer segment - GMV from users who repurchase above median time vs below

##### SEED

In [None]:
# Repurchase intervals for one producer segment: pair every purchase with the
# same buyer's next purchase, measure the time between them, and flag each buyer
# as 'above' / 'below' the median of the per-buyer median intervals.
name = 'SEED'
# explicit copy: columns are added below, and assigning to a filtered slice of
# df3 is the chained-assignment pattern pandas warns about
aux1 = df3[df3['segment'] == name].copy()

# next purchase of the same buyer in chronological order; the shifted values are
# aligned back onto aux1 through the row index
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# a buyer's last purchase has no successor, so it is not a repurchase row
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the following one
elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
aux1['delta_sec'] = elapsed_sec

# median interval per buyer, then the 50th percentile of those medians as threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach each buyer's median interval to the repurchase rows and flag the buyer
aux3 = aux1.merge(aux2, how='inner', on='user_buyer_id')
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per buyer for the selected segment, described separately for buyers
# flagged 'below' and 'above' the median repurchase interval.
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how='inner', on='user_buyer_id')
aux4 = aux4.loc[aux4['segment'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# buyers below the threshold: summed GMV per buyer
aux5 = (aux4.loc[aux4['class'] == 'below', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'below'}))
# buyers above the threshold: summed GMV per buyer
aux6 = (aux4.loc[aux4['class'] == 'above', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'above'}))

below_stats = get_descriptive_statistics(aux5[['below']])
above_stats = get_descriptive_statistics(aux6[['above']])
pd.concat([below_stats, above_stats])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

##### SMALL

In [None]:
# Repurchase intervals for one producer segment: pair every purchase with the
# same buyer's next purchase, measure the time between them, and flag each buyer
# as 'above' / 'below' the median of the per-buyer median intervals.
name = 'SMALL'
# explicit copy: columns are added below, and assigning to a filtered slice of
# df3 is the chained-assignment pattern pandas warns about
aux1 = df3[df3['segment'] == name].copy()

# next purchase of the same buyer in chronological order; the shifted values are
# aligned back onto aux1 through the row index
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# a buyer's last purchase has no successor, so it is not a repurchase row
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the following one
elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
aux1['delta_sec'] = elapsed_sec

# median interval per buyer, then the 50th percentile of those medians as threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach each buyer's median interval to the repurchase rows and flag the buyer
aux3 = aux1.merge(aux2, how='inner', on='user_buyer_id')
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per buyer for the selected segment, described separately for buyers
# flagged 'below' and 'above' the median repurchase interval.
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how='inner', on='user_buyer_id')
aux4 = aux4.loc[aux4['segment'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# buyers below the threshold: summed GMV per buyer
aux5 = (aux4.loc[aux4['class'] == 'below', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'below'}))
# buyers above the threshold: summed GMV per buyer
aux6 = (aux4.loc[aux4['class'] == 'above', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'above'}))

below_stats = get_descriptive_statistics(aux5[['below']])
above_stats = get_descriptive_statistics(aux6[['above']])
pd.concat([below_stats, above_stats])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

##### MEDIUM

In [None]:
# Repurchase intervals for one producer segment: pair every purchase with the
# same buyer's next purchase, measure the time between them, and flag each buyer
# as 'above' / 'below' the median of the per-buyer median intervals.
name = 'MEDIUM'
# explicit copy: columns are added below, and assigning to a filtered slice of
# df3 is the chained-assignment pattern pandas warns about
aux1 = df3[df3['segment'] == name].copy()

# next purchase of the same buyer in chronological order; the shifted values are
# aligned back onto aux1 through the row index
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# a buyer's last purchase has no successor, so it is not a repurchase row
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the following one
elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
aux1['delta_sec'] = elapsed_sec

# median interval per buyer, then the 50th percentile of those medians as threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach each buyer's median interval to the repurchase rows and flag the buyer
aux3 = aux1.merge(aux2, how='inner', on='user_buyer_id')
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per buyer for the selected segment, described separately for buyers
# flagged 'below' and 'above' the median repurchase interval.
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how='inner', on='user_buyer_id')
aux4 = aux4.loc[aux4['segment'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# buyers below the threshold: summed GMV per buyer
aux5 = (aux4.loc[aux4['class'] == 'below', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'below'}))
# buyers above the threshold: summed GMV per buyer
aux6 = (aux4.loc[aux4['class'] == 'above', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'above'}))

below_stats = get_descriptive_statistics(aux5[['below']])
above_stats = get_descriptive_statistics(aux6[['above']])
pd.concat([below_stats, above_stats])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

##### LARGE

In [None]:
# Repurchase intervals for one producer segment: pair every purchase with the
# same buyer's next purchase, measure the time between them, and flag each buyer
# as 'above' / 'below' the median of the per-buyer median intervals.
name = 'LARGE'
# explicit copy: columns are added below, and assigning to a filtered slice of
# df3 is the chained-assignment pattern pandas warns about
aux1 = df3[df3['segment'] == name].copy()

# next purchase of the same buyer in chronological order; the shifted values are
# aligned back onto aux1 through the row index
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# a buyer's last purchase has no successor, so it is not a repurchase row
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the following one
elapsed_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = elapsed_sec / 60 / 60 / 24
aux1['delta_sec'] = elapsed_sec

# median interval per buyer, then the 50th percentile of those medians as threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}")
print(f"50th percentile: {median_threshold:.2f} days")

# attach each buyer's median interval to the repurchase rows and flag the buyer
aux3 = aux1.merge(aux2, how='inner', on='user_buyer_id')
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per buyer for the selected segment, described separately for buyers
# flagged 'below' and 'above' the median repurchase interval.
buyer_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(buyer_class, how='inner', on='user_buyer_id')
aux4 = aux4.loc[aux4['segment'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# buyers below the threshold: summed GMV per buyer
aux5 = (aux4.loc[aux4['class'] == 'below', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'below'}))
# buyers above the threshold: summed GMV per buyer
aux6 = (aux4.loc[aux4['class'] == 'above', ['user_buyer_id', 'gmv_value_brl']]
            .groupby('user_buyer_id').sum().reset_index()
            .rename(columns={'gmv_value_brl': 'above'}))

below_stats = get_descriptive_statistics(aux5[['below']])
above_stats = get_descriptive_statistics(aux6[['above']])
pd.concat([below_stats, above_stats])

In [None]:
# Histograms of total GMV per buyer, one panel per GMV percentile band, with
# buyers coloured by their 'below' / 'above' class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper percentile of each band; a band runs from the previous cut (exclusive)
# up to its own cut (inclusive), and the first band has no lower bound
band_percentiles = [5, 10, 25, 50, 75, 95]
band_cuts = np.percentile(aux7['gmv_value_brl'], band_percentiles)

# set the style before any axes exist so that every panel picks it up
sns.set_style('white')
plt.figure(figsize=(30, 10))
prev_n = None
for d, (pct, n) in enumerate(zip(band_percentiles, band_cuts)):
    plt.subplot(3, 2, d + 1)
    if prev_n is None:
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
    else:
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
    splot = sns.histplot(data=aux8, x='gmv_value_brl', hue='class',
                         palette={'below': 'darkcyan', 'above': 'darkorange'}, element='step')

    # the title names the percentile that was actually used as the upper cut
    lower = '0' if prev_n is None else f"{prev_n:,.2f}"
    plt.title(f"{pct}th perc. ({lower}-{n:,.2f} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # axis labels only on the bottom row / left column of the 3x2 grid
    plt.xlabel('Customer GMV LTV' if d >= 4 else '')
    plt.ylabel('users' if d % 2 == 0 else '')

    # keep a single legend, to the right of the top-right panel; a band without
    # any rows draws no legend, hence the None check
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
    prev_n = n

plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

#### Producers < 10k BRL LTV

In [None]:
# Distribution of each user's median time between consecutive purchases,
# one panel per value of `is_below_10_entire` ('below' / 'above').
sns.set_style('white')
plt.figure(figsize = (30,10))
stats_per_group = []
for n, d in enumerate(['below', 'above']):
    plt.subplot(1,2,n+1)
    # explicit copy: columns are added below and must not be written into a slice of df3
    aux1 = df3[df3['is_below_10_entire'] == d].copy()
    # sort once so both shifted columns are taken from the same chronological ordering
    by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
    # next purchase of the same user (missing on each user's last purchase)
    aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
    aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
    # get only repurchases: keep purchases that were followed by another one
    aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

    # delta between a purchase and the user's next one
    delta_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
    aux1['delta_days'] = delta_sec / 60 / 60 / 24
    aux1['delta_sec'] = delta_sec
    # get median delta per user
    aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
    aux2.columns = ['user_buyer_id', 'median_delta_days_user']
    # same values under the group's name, so the stats table is labelled by group
    aux3 = aux2.copy()
    aux3.columns = ['user_buyer_id', d]
    stats_per_group.append(get_descriptive_statistics(aux3[[d]]))

    sns.histplot(data = aux2, x = 'median_delta_days_user',
                 kde = True, color = 'darkcyan')
    plt.title(f"{d} | N: {(len(aux2)):,}")

    # mark the median of the per-user medians
    group_median = aux2['median_delta_days_user'].median()
    plt.axvline(x=group_median, label = f"median ({group_median:.1f}d)", color = 'black')
    plt.legend()
    plt.xlabel('delta (days)')
    # y label only on the left panel
    plt.ylabel('customers' if n == 0 else '')

# DataFrame.append no longer exists in pandas >= 2.0; collect and concatenate once instead
descr_stats = pd.concat(stats_per_group)

plt.subplots_adjust(hspace=0.3)
plt.show()

In [None]:
# median time to repurchase: descriptive statistics of each user's median days between
# consecutive purchases, collected for the 'below' and 'above' groups in the cell above
descr_stats

#### Producers < 10k BRL LTV - GMV from users who repurchase above median time vs below

##### below

In [None]:
# Repurchase cadence for the selected group: time between consecutive purchases per user,
# then each user is classed by whether their median gap is at/above or below the group median.
name = 'below'
# explicit copy: columns are added below and must not be written into a slice of df3
aux1 = df3[df3['is_below_10_entire'] == name].copy()
# sort once so both shifted columns are taken from the same chronological ordering
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
# next purchase of the same user (missing on each user's last purchase)
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# get only repurchases: keep purchases that were followed by another one
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the user's next one
delta_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = delta_sec / 60 / 60 / 24
aux1['delta_sec'] = delta_sec

# median delta per user; the 50th percentile of those medians is the split threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}") 
print(f"50th percentile: {median_threshold:.2f} days") 
# merge df: attach each user's median delta to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag users (vectorised): 'above' when the user's median delta is >= the threshold
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per user for the current group (`name`), split by the users' class.
# Attach each user's class to every df3 row; the inner join drops users absent from aux3.
user_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(user_class, how = 'inner', on = 'user_buyer_id')
aux4 = aux4.loc[aux4['is_below_10_entire'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# one frame per class: user id plus summed GMV, with the GMV column named after the class
gmv_by_class = {}
for label in ('below', 'above'):
    class_rows = aux4.loc[aux4['class'] == label, ['user_buyer_id', 'gmv_value_brl']]
    totals = class_rows.groupby('user_buyer_id').sum().reset_index()
    totals.columns = ['user_buyer_id', label]
    gmv_by_class[label] = totals
aux5 = gmv_by_class['below']
aux6 = gmv_by_class['above']

# descriptive statistics of both classes, stacked for comparison
pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Histograms of total GMV per (user, class), one subplot per consecutive percentile
# bucket of that total, coloured by class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper bound of each bucket, as a percentile of the per-(user, class) GMV totals;
# subplot titles are derived from these values so label and data cannot drift apart
bucket_percentiles = [5, 10, 25, 50, 75, 95]
bucket_upper_bounds = np.percentile(aux7['gmv_value_brl'], bucket_percentiles)
n_rows, n_cols = 3, 2
last_bucket = len(bucket_percentiles) - 1

# set the style before any axes exist so that every subplot picks it up
sns.set_style('white')
plt.figure(figsize = (30,10))
prev_n = None
for d, (pct, n) in enumerate(zip(bucket_percentiles, bucket_upper_bounds)):
    plt.subplot(n_rows, n_cols, d+1)
    if prev_n is None:
        # first bucket: everything up to the lowest percentile
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
        gmv_range = f"0-{n:.2f}"
    else:
        # half-open bucket (prev_n, n]
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
        # only the top bucket prints its upper bound with thousands separators
        upper = f"{n:,.2f}" if d == last_bucket else f"{n:.2f}"
        gmv_range = f"{prev_n:.2f}-{upper}"
    prev_n = n

    splot = sns.histplot(data = aux8, x = 'gmv_value_brl', hue = 'class',
                         palette = {'below': 'darkcyan', 'above': 'darkorange'}, element = 'step')
    plt.title(f"{pct}th perc. ({gmv_range} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # x label only on the bottom row, y label only on the left column
    plt.xlabel('Customer GMV LTV' if d // n_cols == n_rows - 1 else '')
    plt.ylabel('users' if d % n_cols == 0 else '')

    # keep a single legend (top-right subplot, moved outside the axes); a bucket with
    # no rows may produce no legend, so guard against None before touching it
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

##### above

In [None]:
# Repurchase cadence for the selected group: time between consecutive purchases per user,
# then each user is classed by whether their median gap is at/above or below the group median.
name = 'above'
# explicit copy: columns are added below and must not be written into a slice of df3
aux1 = df3[df3['is_below_10_entire'] == name].copy()
# sort once so both shifted columns are taken from the same chronological ordering
by_user = aux1.sort_values('purchase_order_datetime').groupby('user_buyer_id')
# next purchase of the same user (missing on each user's last purchase)
aux1['purchase_id2'] = by_user['purchase_id'].shift(-1)
aux1['2nd_purchase'] = by_user['purchase_order_datetime'].shift(-1)
# get only repurchases: keep purchases that were followed by another one
aux1 = aux1[aux1['purchase_id2'].notnull()].copy()

# delta between a purchase and the user's next one
delta_sec = (aux1['2nd_purchase'] - aux1['purchase_order_datetime']).dt.total_seconds()
aux1['delta_days'] = delta_sec / 60 / 60 / 24
aux1['delta_sec'] = delta_sec

# median delta per user; the 50th percentile of those medians is the split threshold
aux2 = aux1[['user_buyer_id', 'delta_days']].groupby('user_buyer_id').median().reset_index()
aux2.columns = ['user_buyer_id', 'median_delta_days_user']
median_threshold = np.percentile(aux2['median_delta_days_user'], 50)
print(f"Number of Repurchases: {len(aux1):,}") 
print(f"50th percentile: {median_threshold:.2f} days") 
# merge df: attach each user's median delta to every repurchase row
aux3 = aux1.merge(aux2, how = 'inner', on = 'user_buyer_id')
# flag users (vectorised): 'above' when the user's median delta is >= the threshold
aux3['class'] = np.where(aux3['median_delta_days_user'] >= median_threshold, 'above', 'below')
get_descriptive_statistics(aux2[['median_delta_days_user']])

In [None]:
# Total GMV per user for the current group (`name`), split by the users' class.
# Attach each user's class to every df3 row; the inner join drops users absent from aux3.
user_class = aux3[['user_buyer_id', 'class']].drop_duplicates()
aux4 = df3.merge(user_class, how = 'inner', on = 'user_buyer_id')
aux4 = aux4.loc[aux4['is_below_10_entire'] == name, ['user_buyer_id', 'class', 'gmv_value_brl']]

# one frame per class: user id plus summed GMV, with the GMV column named after the class
gmv_by_class = {}
for label in ('below', 'above'):
    class_rows = aux4.loc[aux4['class'] == label, ['user_buyer_id', 'gmv_value_brl']]
    totals = class_rows.groupby('user_buyer_id').sum().reset_index()
    totals.columns = ['user_buyer_id', label]
    gmv_by_class[label] = totals
aux5 = gmv_by_class['below']
aux6 = gmv_by_class['above']

# descriptive statistics of both classes, stacked for comparison
pd.concat([get_descriptive_statistics(aux5[['below']]), get_descriptive_statistics(aux6[['above']])])

In [None]:
# Histograms of total GMV per (user, class), one subplot per consecutive percentile
# bucket of that total, coloured by class.
aux7 = aux4[['user_buyer_id', 'class', 'gmv_value_brl']].groupby(['user_buyer_id', 'class']).sum().reset_index()

# upper bound of each bucket, as a percentile of the per-(user, class) GMV totals;
# subplot titles are derived from these values so label and data cannot drift apart
bucket_percentiles = [5, 10, 25, 50, 75, 95]
bucket_upper_bounds = np.percentile(aux7['gmv_value_brl'], bucket_percentiles)
n_rows, n_cols = 3, 2
last_bucket = len(bucket_percentiles) - 1

# set the style before any axes exist so that every subplot picks it up
sns.set_style('white')
plt.figure(figsize = (30,10))
prev_n = None
for d, (pct, n) in enumerate(zip(bucket_percentiles, bucket_upper_bounds)):
    plt.subplot(n_rows, n_cols, d+1)
    if prev_n is None:
        # first bucket: everything up to the lowest percentile
        aux8 = aux7[aux7['gmv_value_brl'] <= n]
        gmv_range = f"0-{n:.2f}"
    else:
        # half-open bucket (prev_n, n]
        aux8 = aux7[(aux7['gmv_value_brl'] > prev_n) & (aux7['gmv_value_brl'] <= n)]
        # only the top bucket prints its upper bound with thousands separators
        upper = f"{n:,.2f}" if d == last_bucket else f"{n:.2f}"
        gmv_range = f"{prev_n:.2f}-{upper}"
    prev_n = n

    splot = sns.histplot(data = aux8, x = 'gmv_value_brl', hue = 'class',
                         palette = {'below': 'darkcyan', 'above': 'darkorange'}, element = 'step')
    plt.title(f"{pct}th perc. ({gmv_range} BRL) | Users: {aux8['user_buyer_id'].nunique():,}")
    # x label only on the bottom row, y label only on the left column
    plt.xlabel('Customer GMV LTV' if d // n_cols == n_rows - 1 else '')
    plt.ylabel('users' if d % n_cols == 0 else '')

    # keep a single legend (top-right subplot, moved outside the axes); a bucket with
    # no rows may produce no legend, so guard against None before touching it
    legend = splot.get_legend()
    if legend is not None:
        if d == 1:
            sns.move_legend(splot, "center left", bbox_to_anchor=(1, 0.5), title='class')
        else:
            legend.remove()
plt.subplots_adjust(wspace=0.1, hspace=0.55)
plt.show()

### Do customers repurchase more in a specific period?

### In which period do customers increase their LTV the most?

### For how long (on avg.) does a single customer generate revenue for a specific producer?

## Sales Type (Launch vs Evergreen) (ON-HOLD)

## Payment Type (Subscriptions vs single-payments)

### How many customers repurchase a 2nd time given that 1st purchase was a subscription?


### How many customers repurchase a 2nd time given that 1st purchase was a single-payment?



### What is the most successful payment method in terms of re-order rate?



### Is GMV from monthly subscriptions higher than annual? How about ticket price?



### How many customers do not renew their subscription? 



### When a product is expensive, do customers tend to pay in 2 or more installments? (remove split code)



### Do expensive products increase credit card approval rate?

## Coupon

### Does coupon at 1st purchase increase the probability of recurrence?


## Recovery Tools (ON-HOLD)

### How impactful are recovery tools on LTV per customer?


### Customers who were recovered: do they purchase a 2nd time?


## Producer

### Do customers repurchase a 2nd time from the same producer if ticket price is lower (in comparison to 1st product)?


### How many users who purchased for the 1st time had signed up beforehand? If the group size is relevant, do these users purchase more products from the same niche, the same producer, or different ones?


## Product

### Do customers with high LTV purchase products with higher avg. ticket price? Or do they have high LTV due to recurrence? 



### Do customers who bought a high rating product repurchase more? (Club)




## Customer

### Does age influence LTV? (ON-HOLD)


### Does a specific device influence LTV?


## Checkout - Conversion Rate

### How many users ask for a reimbursement right after purchasing? (check 1st purchase)


## Traffic & Product Type (ON-HOLD)