### Load Dataset

In [1]:
import pandas as pd
import numpy as np

## Takes a few seconds to load
file_path = '../data/raw/Orders_Master_Data(in).xlsx'
raw_data = pd.read_excel(file_path) 

### Dataset Cleaning

In [2]:
orders_raw = raw_data.copy()
orders_raw.columns = orders_raw.columns.str.replace(' ', '_')
orders_raw.columns = orders_raw.columns.str.lower()
orders_raw = orders_raw.drop_duplicates()

orders_raw['date'] = pd.to_datetime(orders_raw['date'], format='%d.%m.%Y')
orders_raw.set_index('date', inplace=True)
orders_raw.rename(columns={"median_ticket_(‚ç¨)": "median_ticket",
                   "prom_contacts_month":"promotor_visits",
                   "tel_contacts_month":"promotor_calls"},
                   inplace=True)

orders_raw['city'] =  orders_raw['city'].apply(lambda x: 'Cadiz' if x=='C√°diz' else x)
orders_raw['city'] =  orders_raw['city'].apply(lambda x: 'Castellon' if x=='Castell√≥n' else x)
orders_raw['city'] =  orders_raw['city'].apply(lambda x: 'Cordoba' if x=='C√≥rdoba' else x)

# Time Features
orders_raw['day_of_week'] = orders_raw.index.dayofweek
orders_raw['month'] = orders_raw.index.month
orders_raw['week'] = orders_raw.index.isocalendar().week

# Cost Calculation
logistics_cost = 10 #EUR per order
visit_cost = 15 #EUR per visit

orders_raw['order_normalized'] = np.where(orders_raw['number_of_orders'] > 0, 1, 0) 
orders_raw['cost'] = orders_raw['order_normalized'] * logistics_cost + orders_raw['promotor_visits'] * visit_cost
orders_raw['profit'] = orders_raw['income'] - orders_raw['cost']

### Final Orders df

In [3]:
# Reorganize columns in a logical order
column_order = [
    # Client & Geographic Information
    'client_id',
    'city', 
    'channel',
    'promotor_id',
    
    # Core Business Metrics
    'number_of_orders',
    'volume',
    'income',
    'median_ticket',
    
    # Contact & Visit Metrics
    'promotor_visits',
    'promotor_calls',
    
    # Calculated Financial Metrics
    'order_normalized',
    'cost',
    'profit',

    # Time Features
    'month', 
    'week',
    'day_of_week',
]

# Reorder the DataFrame columns
orders_raw = orders_raw[column_order]

### Filtered Orders df

In [4]:
#No Volume and no Income
# filter=(df['volume']==0) & (df['income']==0) & (df['number_of_orders']==0)
# filter=(df['volume']==0)
filter=(orders_raw['volume']==0) & (orders_raw['income']==0)

orders_filtered = orders_raw[~filter]

# Print the difference in shapes between df and filtered_df
print("Original df shape:", orders_raw.shape)
print("Filtered df shape:", orders_filtered.shape)
print("Difference in rows:", orders_raw.shape[0] - orders_filtered.shape[0])
print("Rows removed (volume=0 AND income=0):", orders_raw.shape[0] - orders_filtered.shape[0])
print("Percentage of rows removed:", f"{((orders_raw.shape[0] - orders_filtered.shape[0]) / orders_raw.shape[0] * 100):.2f}%")

Original df shape: (1014965, 16)
Filtered df shape: (1001824, 16)
Difference in rows: 13141
Rows removed (volume=0 AND income=0): 13141
Percentage of rows removed: 1.29%


### Monthly Clients and Efficiency df

In [5]:
aggregation_rules = {
    'city':               lambda x: x.mode()[0] if not x.mode().empty else None, 
    'channel':            lambda x: x.mode()[0] if not x.mode().empty else None, 
    'promotor_id':        lambda x: x.mode()[0] if not x.mode().empty else None,  
    'order_normalized':   'sum',
    'volume':             'sum',    
    'income':             'sum',
    'cost':               'sum',
    'profit':             'sum',
    'median_ticket':      ['median','min','max','std'],
    'promotor_visits':    'median',
    'promotor_calls':     'median'
}

clients_monthly = orders_filtered.groupby(['client_id', 'month']).agg(aggregation_rules)

# Flatten column MultiIndex into clean names
new_cols = []
for orig_col, agg_func in clients_monthly.columns:
    if agg_func in ('first', '<lambda>'):
        # keep the original name for single-function aggs
        new_cols.append(orig_col)
    else:
        # join field + function for multi-aggs
        new_cols.append(f"{orig_col}_{agg_func}")
clients_monthly.columns = new_cols

clients_monthly.rename(columns={
    'order_normalized_sum':    'total_orders',
    'volume_sum':              'total_volume',
    'income_sum':              'total_income',
    'cost_sum':                'total_cost',
    'profit_sum':              'total_profit',
    'median_ticket_median':    'median_ticket',
    'median_ticket_min':       'median_ticket_min',
    'median_ticket_max':       'median_ticket_max',
    'median_ticket_std':       'median_ticket_std',
    'promotor_visits_median':  'median_promotor_visits',
    'promotor_calls_median':   'median_promotor_calls',
}, inplace=True)


from sklearn.preprocessing import MinMaxScaler

efficiency_monthly = clients_monthly[['total_orders','median_promotor_visits','median_promotor_calls']].copy()
efficiency_monthly['efficiency'] = efficiency_monthly['total_orders'] / efficiency_monthly['median_promotor_visits']

efficiency_monthly['efficiency'] = efficiency_monthly['efficiency'].replace([np.inf, -np.inf], np.inf)

# Step 1: Separate finite and infinite values
finite_mask = np.isfinite(efficiency_monthly['efficiency'])
infinite_mask = ~finite_mask

# Step 2: Scale only finite values
scaler = MinMaxScaler()
efficiency_monthly.loc[finite_mask, 'efficiency_scaled'] = scaler.fit_transform(efficiency_monthly.loc[finite_mask, ['efficiency']])

# Step 3: Assign a fixed high score to infs (e.g., 1.5, or max(finite) + margin)
inf_value = efficiency_monthly['efficiency_scaled'].max() + 0.5
efficiency_monthly.loc[infinite_mask, 'efficiency_scaled'] = inf_value

max_value = efficiency_monthly.loc[finite_mask, 'efficiency'].max()
efficiency_monthly.loc[infinite_mask, 'efficiency'] = max_value

efficiency_monthly.rename(columns={'total_orders': 'frequency'}, inplace=True)
clients_monthly = clients_monthly.merge(efficiency_monthly[['frequency','efficiency','efficiency_scaled']],on='client_id',how='left')

In [6]:
efficiency_monthly

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,median_promotor_visits,median_promotor_calls,efficiency,efficiency_scaled
client_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100006690,1,2,2.0,0.0,1.0,0.050
100006690,2,2,2.0,0.0,1.0,0.050
100006690,3,2,2.0,0.0,1.0,0.050
100006690,4,3,2.0,0.0,1.5,0.075
100006690,5,3,2.0,0.0,1.5,0.075
...,...,...,...,...,...,...
999976985,7,3,1.0,0.0,3.0,0.150
999976985,8,4,1.0,0.0,4.0,0.200
999976985,9,1,1.0,0.0,1.0,0.050
999976985,10,3,1.0,0.0,3.0,0.150


In [7]:
clients_monthly

Unnamed: 0_level_0,city,channel,promotor_id,total_orders,total_volume,total_income,total_cost,total_profit,median_ticket,median_ticket_min,median_ticket_max,median_ticket_std,median_promotor_visits,median_promotor_calls,frequency,efficiency,efficiency_scaled
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100006690,Madrid,AR,275609911,2,202.5,203.99,80,123.99,101.995,80.1,123.89,30.964206,2.0,0.0,2,1.0,0.050
100006690,Madrid,AR,275609911,2,202.5,203.99,80,123.99,101.995,80.1,123.89,30.964206,2.0,0.0,2,1.0,0.050
100006690,Madrid,AR,275609911,2,202.5,203.99,80,123.99,101.995,80.1,123.89,30.964206,2.0,0.0,2,1.0,0.050
100006690,Madrid,AR,275609911,2,202.5,203.99,80,123.99,101.995,80.1,123.89,30.964206,2.0,0.0,3,1.5,0.075
100006690,Madrid,AR,275609911,2,202.5,203.99,80,123.99,101.995,80.1,123.89,30.964206,2.0,0.0,3,1.5,0.075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999976985,Barcelona,HR,996963848,2,63.0,203.67,50,153.67,101.835,50.5,153.17,72.598653,1.0,0.0,3,3.0,0.150
999976985,Barcelona,HR,996963848,2,63.0,203.67,50,153.67,101.835,50.5,153.17,72.598653,1.0,0.0,4,4.0,0.200
999976985,Barcelona,HR,996963848,2,63.0,203.67,50,153.67,101.835,50.5,153.17,72.598653,1.0,0.0,1,1.0,0.050
999976985,Barcelona,HR,996963848,2,63.0,203.67,50,153.67,101.835,50.5,153.17,72.598653,1.0,0.0,3,3.0,0.150


### Yearly Clients and Efficiency df

In [8]:
efficiency = efficiency_monthly.groupby('client_id').agg({'frequency': 'median',
                                                          'median_promotor_visits': 'sum',
                                                          'median_promotor_calls': 'sum',
                                                          'efficiency': 'median',
                                                          'efficiency_scaled': 'median'})

efficiency.rename(columns={'median_promotor_visits': 'total_promotor_visits',
                            'median_promotor_calls': 'total_promotor_calls'}, inplace=True)

aggregation_rules = {
    'city':               lambda x: x.mode()[0] if not x.mode().empty else None, 
    'channel':            lambda x: x.mode()[0] if not x.mode().empty else None, 
    'promotor_id':        lambda x: x.mode()[0] if not x.mode().empty else None,  
    'order_normalized':   'sum',
    'volume':             'sum',    
    'income':             'sum',
    'cost':               'sum',
    'profit':             'sum',
    'median_ticket':      ['median','min','max','std'],
    'promotor_visits':    'median',
    'promotor_calls':     'median',
}
clients = orders_filtered.groupby('client_id').agg(aggregation_rules)

# Flatten column MultiIndex into clean names
new_cols = []
for orig_col, agg_func in clients.columns:
    if agg_func in ('first', '<lambda>'):
        # keep the original name for single-function aggs
        new_cols.append(orig_col)
    else:
        # join field + function for multi-aggs
        new_cols.append(f"{orig_col}_{agg_func}")
clients.columns = new_cols

clients = clients.merge(efficiency,on='client_id',how='left')

clients.rename(columns={
    'order_normalized_sum':    'total_orders',
    'volume_sum':              'total_volume',
    'income_sum':              'total_income',
    'cost_sum':                'total_cost',
    'profit_sum':              'total_profit',
    'median_ticket_median':    'median_ticket',
    'promotor_visits_median':  'median_promotor_visits',
    'promotor_calls_median':   'median_promotor_calls',
}, inplace=True)

cols = [
    'city',
    'channel',
    'promotor_id',
    'frequency',
    'efficiency',
    'efficiency_scaled',
    'total_orders',
    'total_volume',
    'total_income',
    'total_cost',
    'total_profit',
    'median_ticket',
    'median_ticket_min',
    'median_ticket_max',
    'median_ticket_std',
    'median_promotor_visits',
    'total_promotor_visits',
    'median_promotor_calls',
    'total_promotor_calls',
]
clients = clients[cols]

low_ticket_threshold = 80

#Defining a function to pick the quadrant label
def assign_quadrant(row):
    high_ticket   = row['median_ticket'] >  low_ticket_threshold
    efficient    = row['efficiency'] > 1
    if   high_ticket and efficient:    return 'HighTicket_Efficient'
    elif not high_ticket and efficient: return 'LowTicket_Efficient'
    elif high_ticket and not efficient: return 'HighTicket_Inefficient'
    else:                              return 'LowTicket_Inefficient'

clients['class'] = clients.apply(assign_quadrant, axis=1)

#Compute per-month averages and gap
clients['zero_visit_flag'] = clients['median_promotor_visits'] == 0
clients['avg_orders_per_month'] = clients['total_orders'] / 12
clients['avg_visits_per_month'] = clients['total_promotor_visits'] / 12
clients['visit_order_gap'] = clients['avg_visits_per_month'] - clients['avg_orders_per_month']

#Direct inefficiency cost
clients['inefficiency_cost'] = clients['visit_order_gap'] * 15

#Profit per visit & opportunity cost
clients['profit_per_visit'] = clients.apply(
    lambda r: r['total_profit'] / r['total_promotor_visits'] if r['total_promotor_visits'] > 0 else 0,
    axis=1
)
clients['opportunity_cost'] = clients['visit_order_gap'] * clients['profit_per_visit']

clients


Unnamed: 0_level_0,city,channel,promotor_id,frequency,efficiency,efficiency_scaled,total_orders,total_volume,total_income,total_cost,...,median_promotor_calls,total_promotor_calls,class,zero_visit_flag,avg_orders_per_month,avg_visits_per_month,visit_order_gap,inefficiency_cost,profit_per_visit,opportunity_cost
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006690,Madrid,AR,275609911,2.0,1.00,0.0500,22,1658.706,1494.53,880,...,0.0,0.0,LowTicket_Inefficient,False,1.833333,2.000000,0.166667,2.50,25.605417,4.267569
100008050,Barcelona,AR,368568690,1.0,20.00,1.5000,14,3982.000,1905.59,140,...,2.0,20.0,HighTicket_Efficient,True,1.166667,0.000000,-1.166667,-17.50,0.000000,-0.000000
100042162,Barcelona,HR,455263770,1.0,0.25,0.0125,14,1812.850,2243.30,980,...,0.0,0.0,HighTicket_Inefficient,False,1.166667,3.666667,2.500000,37.50,28.711364,71.778409
100046227,Barcelona,AR,454554895,4.0,2.00,0.1000,16,4590.180,2273.12,640,...,2.0,8.0,HighTicket_Efficient,False,1.333333,0.666667,-0.666667,-10.00,204.140000,-136.093333
100125158,Cadiz,HR,483340469,3.0,3.00,0.1500,25,1266.500,2204.24,625,...,0.0,0.0,HighTicket_Efficient,False,2.083333,0.833333,-1.250000,-18.75,157.924000,-197.405000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999934164,Barcelona,HR,480416490,2.0,2.00,0.1000,23,691.000,785.30,575,...,3.0,27.0,LowTicket_Efficient,False,1.916667,0.750000,-1.166667,-17.50,23.366667,-27.261111
999940211,Barcelona,AR,908993212,1.0,20.00,1.5000,3,557.820,260.55,30,...,0.0,0.0,LowTicket_Efficient,True,0.250000,0.000000,-0.250000,-3.75,0.000000,-0.000000
999940578,Madrid,AR,275609911,1.0,0.50,0.0250,13,1101.524,1044.61,520,...,0.0,0.0,LowTicket_Inefficient,False,1.083333,1.833333,0.750000,11.25,23.845909,17.884432
999941988,Madrid,AR,677360818,3.0,1.50,0.0750,36,5415.150,3828.31,1410,...,2.0,24.0,HighTicket_Efficient,False,3.000000,2.000000,-1.000000,-15.00,100.762917,-100.762917


### Export Final dfs

In [9]:
import os

# Create processed data directory if it doesn't exist
processed_dir = '../data/processed'
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)
    print(f"Created directory: {processed_dir}")

orders_raw.to_csv('../data/processed/orders_raw.csv', index=True)
orders_filtered.to_csv('../data/processed/orders.csv', index=True)
clients.to_csv('../data/processed/clients.csv', index=True)
clients_monthly.to_csv('../data/processed/clients_monthly.csv', index=True)