### Load Dataset

In [10]:
import pandas as pd
import numpy as np

# Reading the workbook takes a few seconds, so it is loaded once here and
# kept untouched in `raw_data`.
file_path = '../data/raw/Orders_Master_Data(in).xlsx'
raw_data = pd.read_excel(io=file_path)

### Data Cleaning & Feature Engineering

In [11]:
import numpy as np

# Work on a copy so the loaded workbook stays available in `raw_data`.
# Headers are normalised to snake_case: spaces -> underscores, then lower-cased.
df = raw_data.copy()
df.columns = df.columns.str.replace(' ', '_').str.lower()
df = df.drop_duplicates()

# Parse the day.month.year strings and use the timestamps as the row index.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df = df.set_index('date')

# Shorter names for the ticket and promotor-contact columns.
# The first key is written exactly as the header is expected to arrive
# (presumably a mis-encoded euro sign -- verify against the workbook).
column_renames = {
    "median_ticket_(‚ç¨)": "median_ticket",
    "prom_contacts_month": "promotor_visits",
    "tel_contacts_month": "promotor_calls",
}
df = df.rename(columns=column_renames)

# Map three city spellings to plain ASCII names; every other value is left
# as it is. The keys are the spellings this notebook matches against
# (presumably mis-encoded accents -- verify against the workbook).
city_fixes = {
    'C√°diz': 'Cadiz',
    'Castell√≥n': 'Castellon',
    'C√≥rdoba': 'Cordoba',
}
df['city'] = df['city'].apply(lambda name: city_fixes.get(name, name))

# Calendar features derived from the DatetimeIndex.
# NOTE(review): `week` is the ISO week number, so the last days of December can
# land in week 1 (the rendered frame shows 2024-12-31 with week == 1). Grouping
# by `week` alone therefore mixes those rows with the first week of January.
dates = df.index
df['day_of_week'] = dates.weekday
df['month'] = dates.month
df['week'] = dates.isocalendar().week

# Unit costs of the per-row cost model.
logistics_cost = 10  # EUR per order
visit_cost = 15      # EUR per visit

visits = df['promotor_visits']
has_visits = visits > 0

# 1 when the row has at least one order, otherwise 0. The logistics cost is
# applied to this flag, i.e. once per row and not once per counted order.
# NOTE(review): confirm that a flat charge per row is the intended model.
df['order_normalized'] = np.where(df['number_of_orders'] > 0, 1, 0)
logistics_part = df['order_normalized'] * logistics_cost
visits_part = visits * visit_cost
df['order_cost'] = logistics_part + visits_part
df['order_profit'] = df['income'] - df['order_cost']

# Per-visit ratios. Rows without visits cannot be divided and get the
# fallback value 1.
# NOTE(review): for `efficiency_profit` the fallback means "1 EUR per visit";
# confirm that is the intended placeholder.
df['efficiency_orders'] = np.where(has_visits, df['number_of_orders'] / visits, 1)
df['efficiency_profit'] = np.where(has_visits, df['order_profit'] / visits, 1)

# Low-ticket threshold of EUR 80, as per project instructions.
low_ticket_threshold = 80

# Boolean performance flags: a row is "at risk" when it is both low-ticket
# and inefficient (fewer than one order per visit).
is_low_ticket = df['median_ticket'].le(low_ticket_threshold)
is_inefficient = df['efficiency_orders'].lt(1)
df['low_ticket'] = is_low_ticket
df['inefficient'] = is_inefficient
df['at_risk'] = is_low_ticket & is_inefficient

### Frequency class calculation (per client)
# Orders per client and calendar month, listed month by month with the
# largest monthly totals first.
monthly_orders = (
    df.groupby(['client_id', 'month'])['number_of_orders']
    .sum()
    .reset_index(name='monthly_orders')
    .sort_values(by=['month', 'monthly_orders'], ascending=[True, False])
    .reset_index(drop=True)
)

# A client's frequency is the median of its monthly order totals, sorted so
# the most frequent buyers come first. Only (client, month) pairs that have
# at least one row in `df` take part in the median.
frequency = (
    monthly_orders.groupby('client_id')['monthly_orders']
    .median()
    .reset_index()
    .rename(columns={'monthly_orders': 'client_frequency'})
    .sort_values(by='client_frequency', ascending=False)
)

In [12]:
# Preview the five clients with the highest order frequency.
frequency.head(n=5)

Unnamed: 0,client_id,client_frequency
22371,577029300,62.5
12460,365042657,22.5
30183,744372710,20.5
40523,966347937,20.5
6414,238243563,18.5


### Final df

In [13]:
# Final column layout, grouped by theme. Any column of `df` that is not
# listed in one of the groups is dropped by the selection below.
column_groups = {
    'client_and_geography': ['client_id', 'city', 'channel', 'promotor_id'],
    'core_business_metrics': ['number_of_orders', 'volume', 'income', 'median_ticket'],
    'contact_and_visit_metrics': ['promotor_visits', 'promotor_calls'],
    'calculated_financial_metrics': [
        'order_normalized',
        'order_cost',
        'order_profit',
        'efficiency_orders',
        'efficiency_profit',
    ],
    'performance_flags': ['low_ticket', 'inefficient'],
    'time_features': ['month', 'week', 'day_of_week'],
}
column_order = [name for group in column_groups.values() for name in group]

# NOTE(review): `at_risk` is computed during feature engineering but is not
# listed above, so it is discarded here -- confirm that this is intended.
df = df.loc[:, column_order]
df


Unnamed: 0_level_0,client_id,city,channel,promotor_id,number_of_orders,volume,income,median_ticket,promotor_visits,promotor_calls,order_normalized,order_cost,order_profit,efficiency_orders,efficiency_profit,low_ticket,inefficient,month,week,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-01-01,398150871,Alicante,AR,729030652,1,5.940,0.00,0.00,0,0,1,10,-10.00,1.00,1.0000,True,False,1,1,0
2024-01-01,410234355,Alicante,HR,551409294,1,48.000,21.02,21.02,4,0,1,70,-48.98,0.25,-12.2450,True,True,1,1,0
2024-01-02,123463493,Alicante,AR,551409294,1,125.250,92.57,92.57,1,0,1,25,67.57,1.00,67.5700,False,False,1,1,1
2024-01-02,124527399,Alicante,AR,729030652,1,83.000,60.94,60.94,4,0,1,70,-9.06,0.25,-2.2650,True,True,1,1,1
2024-01-02,130100821,Alicante,AR,729030652,1,768.000,244.33,244.33,1,3,1,25,219.33,1.00,219.3300,False,False,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31,974505828,Valencia,HR,249555220,1,120.000,119.20,119.20,4,0,1,70,49.20,0.25,12.3000,False,True,12,1,1
2024-12-31,976757748,Valencia,HR,327176535,1,79.963,255.49,255.49,4,0,1,70,185.49,0.25,46.3725,False,True,12,1,1
2024-12-31,977650762,Valencia,HR,937854151,1,85.890,280.38,280.38,0,1,1,10,270.38,1.00,1.0000,False,False,12,1,1
2024-12-31,982745366,Valencia,HR,52875287,1,178.500,280.24,280.24,0,4,1,10,270.24,1.00,1.0000,False,False,12,1,1


In [14]:
# Rows with neither volume nor income are excluded from the cleaned frame.
# The mask is deliberately not named `filter`: assigning to that name would
# shadow Python's built-in `filter` for every later cell in the kernel session.
no_activity_mask = (df['volume'] == 0) & (df['income'] == 0)

filtered_df = df[~no_activity_mask]

# Summarise how many rows the mask removed. The count is computed once and
# reused; the ratio is guarded so an empty frame reports 0.00% instead of
# raising ZeroDivisionError.
rows_total = df.shape[0]
rows_removed = rows_total - filtered_df.shape[0]
pct_removed = rows_removed / rows_total * 100 if rows_total else 0.0

print("Original df shape:", df.shape)
print("Filtered df shape:", filtered_df.shape)
print("Difference in rows:", rows_removed)
print("Rows removed (volume=0 AND income=0):", rows_removed)
print("Percentage of rows removed:", f"{pct_removed:.2f}%")

Original df shape: (1014965, 20)
Filtered df shape: (1001824, 20)
Difference in rows: 13141
Rows removed (volume=0 AND income=0): 13141
Percentage of rows removed: 1.29%


### Client Summary

In [17]:
# 1) Aggregation rules, written as named aggregations:
#    output column = (source column, aggregation).
def _most_common(values):
    """Return the first value reported by ``Series.mode()``, or None when there is no mode."""
    modes = values.mode()
    return None if modes.empty else modes[0]


aggregation_rules = {
    # totals over all of a client's rows
    'total_income':             ('income', 'sum'),
    'total_volume':             ('volume', 'sum'),
    'total_orders':             ('number_of_orders', 'sum'),
    'total_profit':             ('order_profit', 'sum'),
    # distribution of the ticket value
    'median_ticket_year':       ('median_ticket', 'median'),
    'median_ticket_min':        ('median_ticket', 'min'),
    'median_ticket_max':        ('median_ticket', 'max'),
    'median_ticket_std':        ('median_ticket', 'std'),
    # promotor touches
    'median_promotor_visits':   ('promotor_visits', 'median'),
    'median_promotor_calls':    ('promotor_calls', 'median'),
    # efficiency medians
    'median_efficiency_orders': ('efficiency_orders', 'median'),
    'median_efficiency_profit': ('efficiency_profit', 'median'),
    # descriptive attributes keep their original names
    'promotor_id':              ('promotor_id', _most_common),
    'city':                     ('city', 'first'),
    'channel':                  ('channel', 'first'),
}

# NOTE(review): the summary is built from `df`, which still contains the
# zero-volume / zero-income rows that `filtered_df` excludes -- confirm that
# this is intended.
client_summary = df.groupby('client_id').agg(**aggregation_rules)

# 2) Attach each client's order frequency and restore client_id as the index
#    (the merge takes its row labels from `frequency`).
client_summary = client_summary.merge(
    frequency, left_index=True, right_on='client_id', how='left'
)
client_summary = client_summary.set_index('client_id')

# 3) Final column selection and order
cols = [
    'channel',
    'city',
    'promotor_id',
    'client_frequency',
    'total_orders',
    'total_volume',
    'total_income',
    'median_ticket_year',
    'median_ticket_min',
    'median_ticket_max',
    'median_ticket_std',
    'median_promotor_visits',
    'median_promotor_calls',
    'total_profit',
    'median_efficiency_orders',
    'median_efficiency_profit',
]
client_summary = client_summary[cols]

In [20]:
# Quadrant label for one client-summary row.
def assign_quadrant(row, ticket_threshold=80, efficiency_threshold=1):
    """Classify a client row into one of four ticket/efficiency quadrants.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'median_ticket_year' and 'median_efficiency_orders'.
    ticket_threshold : number, default 80
        A median ticket strictly above this value counts as high ticket.
    efficiency_threshold : number, default 1
        A median efficiency at or above this value counts as efficient.

    Returns
    -------
    str
        'HighTicket_Efficient', 'LowTicket_Efficient',
        'HighTicket_Inefficient' or 'LowTicket_Inefficient'.

    Notes
    -----
    The efficiency boundary is inclusive so that it agrees with the row-level
    `inefficient` flag, which is defined as `efficiency_orders < 1`. A client
    sitting exactly at the threshold is therefore efficient; the previous
    strict `> 1` comparison labelled such clients as inefficient.
    Missing values compare as False and end up in the Low/Inefficient side.
    """
    high_value = row['median_ticket_year'] > ticket_threshold
    efficient = row['median_efficiency_orders'] >= efficiency_threshold

    ticket_label = 'HighTicket' if high_value else 'LowTicket'
    efficiency_label = 'Efficient' if efficient else 'Inefficient'
    return f'{ticket_label}_{efficiency_label}'

# Label every client with its ticket/efficiency quadrant.
client_summary['quadrant'] = client_summary.apply(assign_quadrant, axis=1)

# Per-month averages and the gap between visits and orders.
# `total_orders` is a sum over all rows, so it is spread over the 12 months.
months_in_year = 12
client_summary['avg_orders_per_month'] = client_summary['total_orders'] / months_in_year
# `median_promotor_visits` is the median of the column originally named
# `prom_contacts_month`, which by its name is already a per-month figure.
# It was previously divided by 12 as well, which made the gap compare
# mismatched units. NOTE(review): confirm the unit against the data dictionary.
client_summary['avg_visits_per_month'] = client_summary['median_promotor_visits']
client_summary['visit_order_gap'] = (
    client_summary['avg_visits_per_month'] - client_summary['avg_orders_per_month']
)

# Direct inefficiency cost: gap priced at the per-visit cost defined with the
# cost model (`visit_cost`, 15 EUR), instead of repeating the literal here.
client_summary['inefficiency_cost'] = client_summary['visit_order_gap'] * visit_cost

# Profit per visit & opportunity cost. Clients whose median visit count is not
# positive (zero or missing) get 0 instead of a division result.
# NOTE(review): this divides a total profit by a median visit count; confirm
# that the two quantities are meant to be on the same time scale.
median_visits = client_summary['median_promotor_visits']
client_summary['profit_per_visit'] = (
    client_summary['total_profit'].div(median_visits).where(median_visits > 0, 0)
)
client_summary['opportunity_cost'] = (
    client_summary['visit_order_gap'] * client_summary['profit_per_visit']
)

### Save final dfs

In [21]:
import os

# Make sure the output folder exists, announcing it only when it is created.
processed_dir = '../data/processed'
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)
    print(f"Created directory: {processed_dir}")

# Write the three result frames, each with its index, into the output folder.
outputs = {
    'df_all_rows.csv': df,
    'df_clean.csv': filtered_df,
    'client_summary.csv': client_summary,
}
for file_name, frame in outputs.items():
    frame.to_csv(f'{processed_dir}/{file_name}', index=True)

Created directory: ../data/processed
