### Load Dataset

In [2]:
import pandas as pd
import numpy as np

## Takes a few seconds to load
file_path = '../data/Orders_Master_Data(in).xlsx'
raw_data = pd.read_excel(file_path) 

### Data Cleaning & Feature Engineering

In [3]:
import numpy as np

df = raw_data.copy()
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.lower()
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df.set_index('date', inplace=True)
df.rename(columns={"median_ticket_(‚ç¨)": "median_ticket",
                   "prom_contacts_month":"promotor_visits",
                   "tel_contacts_month":"promotor_calls"},
                   inplace=True)

df['city'] =  df['city'].apply(lambda x: 'Cadiz' if x=='C√°diz' else x)
df['city'] =  df['city'].apply(lambda x: 'Castellon' if x=='Castell√≥n' else x)
df['city'] =  df['city'].apply(lambda x: 'Cordoba' if x=='C√≥rdoba' else x)

# Time Features
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month
df['week'] = df.index.isocalendar().week

# Cost Calculation
logistics_cost = 10 #EUR per order
visit_cost = 15 #EUR per visit

df['order_normalized'] = np.where(df['number_of_orders'] > 0, 1, 0) 
df['order_cost'] = df['order_normalized'] * logistics_cost + df['promotor_visits'] * visit_cost
df['order_profit'] = df['income'] - df['order_cost']
df['efficiency_orders'] = np.where(df['promotor_visits'] > 0, 
                                  df['number_of_orders'] / df['promotor_visits'], 
                                  1) 
df['efficiency_profit'] = np.where(df['promotor_visits'] > 0, 
                                  df['order_profit'] / df['promotor_visits'], 
                                  1) 
# Set the low ticket threshold to 80€ as per project instructions.
low_ticket_threshold = 80

# Create boolean flags for performance categories.
df['low_ticket'] = df['median_ticket'] <= low_ticket_threshold
df['inefficient'] = df['efficiency_orders'] < 1
df['at_risk'] = df['low_ticket'] & df['inefficient']

### Frecuency Class Calculation (per client)
monthly_orders = df.groupby(['client_id', 'month'])['number_of_orders'].sum().reset_index(name='monthly_orders')
monthly_orders.sort_values(by=['month','monthly_orders'],ascending=[True,False],inplace=True)
monthly_orders.reset_index(drop=True,inplace=True)

frequency = monthly_orders.groupby('client_id')['monthly_orders'].median().reset_index()
frequency.rename(columns={'monthly_orders': 'client_frequency'}, inplace=True)
frequency.sort_values(by='client_frequency', ascending=False, inplace=True)

In [4]:
frequency.head()

Unnamed: 0,client_id,client_frequency
22371,577029300,62.5
12460,365042657,22.5
30183,744372710,20.5
40523,966347937,20.5
6414,238243563,18.5


### Final df

In [5]:
# Reorganize columns in a logical order
column_order = [
    # Client & Geographic Information
    'client_id',
    'city', 
    'channel',
    'promotor_id',
    
    # Core Business Metrics
    'number_of_orders',
    'volume',
    'income',
    'median_ticket',
    
    # Contact & Visit Metrics
    'promotor_visits',
    'promotor_calls',
    
    # Calculated Financial Metrics
    'order_normalized',
    'order_cost',
    'order_profit',
    'efficiency_orders',
    'efficiency_profit',
    
    # Performance Flags
    'low_ticket',
    'inefficient', 

    # Time Features
    'month', 
    'week',
    'day_of_week',
]

# Reorder the DataFrame columns
df = df[column_order]
df


Unnamed: 0_level_0,client_id,city,channel,promotor_id,number_of_orders,volume,income,median_ticket,promotor_visits,promotor_calls,order_normalized,order_cost,order_profit,efficiency_orders,efficiency_profit,low_ticket,inefficient,month,week,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-01-01,398150871,Alicante,AR,729030652,1,5.940,0.00,0.000,0,0,1,10,-10.00,1.00,1.0000,True,False,1,1,0
2024-01-01,410234355,Alicante,HR,551409294,1,48.000,21.02,21.020,4,0,1,70,-48.98,0.25,-12.2450,True,True,1,1,0
2024-01-02,123463493,Alicante,AR,551409294,1,125.250,92.57,92.570,1,0,1,25,67.57,1.00,67.5700,False,False,1,1,1
2024-01-02,124527399,Alicante,AR,729030652,1,83.000,60.94,60.940,4,0,1,70,-9.06,0.25,-2.2650,True,True,1,1,1
2024-01-02,130100821,Alicante,AR,729030652,1,768.000,244.33,244.330,1,3,1,25,219.33,1.00,219.3300,False,False,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31,835992380,Tarragona,HR,513990441,1,23.180,352.33,352.330,1,0,1,25,327.33,1.00,327.3300,False,False,12,1,1
2024-12-31,908319695,Tarragona,HR,513990441,1,128.455,147.37,147.370,1,0,1,25,122.37,1.00,122.3700,False,False,12,1,1
2024-12-31,927121528,Tarragona,HR,513990441,1,2.590,72.18,72.180,4,0,1,70,2.18,0.25,0.5450,True,True,12,1,1
2024-12-31,974188824,Tarragona,HR,513990441,2,32.000,187.17,93.585,4,0,1,70,117.17,0.50,29.2925,False,True,12,1,1


In [6]:
#No Volume and no Income
# filter=(df['volume']==0) & (df['income']==0) & (df['number_of_orders']==0)
# filter=(df['volume']==0)
filter=(df['volume']==0) & (df['income']==0)

filtered_df = df[~filter]

# Print the difference in shapes between df and filtered_df
print("Original df shape:", df.shape)
print("Filtered df shape:", filtered_df.shape)
print("Difference in rows:", df.shape[0] - filtered_df.shape[0])
print("Rows removed (volume=0 AND income=0):", df.shape[0] - filtered_df.shape[0])
print("Percentage of rows removed:", f"{((df.shape[0] - filtered_df.shape[0]) / df.shape[0] * 100):.2f}%")

Original df shape: (1035735, 20)
Filtered df shape: (1022404, 20)
Difference in rows: 13331
Rows removed (volume=0 AND income=0): 13331
Percentage of rows removed: 1.29%


### Client Summary

In [7]:
aggregation_rules = {
    'income': 'sum',
    'volume': 'sum',
    'number_of_orders': 'sum',
    'median_ticket': 'median',
    'promotor_visits': 'sum',
    'promotor_calls': 'sum',
    'promotor_id': lambda x: x.mode()[0] if not x.mode().empty else None,
    'city': 'first',
    'channel': 'first'
}

client_summary = df.groupby('client_id').agg(aggregation_rules)
client_summary.rename(columns={
    'income': 'total_income',
    'volume': 'total_volume',
    'number_of_orders': 'total_orders',
    'median_ticket': 'median_ticket_year',
    'promotor_visits': 'total_promotor_visits',
    'promotor_calls': 'total_promotor_calls'
}, inplace=True)
client_summary = client_summary[['channel','city','promotor_id','total_orders','total_volume','total_income','median_ticket_year','total_promotor_visits','total_promotor_calls']]
client_summary.sort_values(by='total_orders',ascending=False)

Unnamed: 0_level_0,channel,city,promotor_id,total_orders,total_volume,total_income,median_ticket_year,total_promotor_visits,total_promotor_calls
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
577029300,HR,Valencia,376164172,761,82318.456,87010.07,131.25,0,560
365042657,HR,Barcelona,317564580,263,50193.076,56981.31,215.47,4600,0
966347937,HR,Malaga,385906115,251,372234.720,107396.38,437.74,640,0
744372710,AR,Malaga,385906115,233,548260.980,165350.28,673.92,0,0
240393159,HR,Madrid,526597557,220,13654.000,23821.35,106.39,0,0
...,...,...,...,...,...,...,...,...,...
340899654,AR,Burgos,466760699,0,0.000,0.00,0.00,0,0
110265706,HR,Barcelona,651739976,0,0.000,0.00,0.00,0,0
692982510,HR,Barcelona,866818549,0,0.000,0.00,0.00,0,0
620525109,HR,Barcelona,900389803,0,0.000,0.00,0.00,0,0


In [8]:
# Merge client_summary with frequency on client_id
client_summary = client_summary.merge(frequency, left_index=True, right_on='client_id', how='left')
client_summary.set_index('client_id', inplace=True)
# Display the result
client_summary.head()


Unnamed: 0_level_0,channel,city,promotor_id,total_orders,total_volume,total_income,median_ticket_year,total_promotor_visits,total_promotor_calls,client_frequency
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100006690,AR,Madrid,275609911,22,1658.706,1494.53,60.99,44,0,2.0
100008050,AR,Barcelona,368568690,14,3982.0,1905.59,132.37,0,28,1.0
100042162,HR,Barcelona,455263770,18,1812.85,2243.3,128.125,56,0,2.0
100046227,AR,Barcelona,454554895,16,4590.18,2273.12,132.64,32,32,4.0
100125158,HR,Cadiz,483340469,26,1266.5,2204.24,67.255,25,0,3.0


### Save final df

In [9]:
# df.to_csv('data/df_all_rows.csv', index=True)
# filtered_df.to_csv('data/df_clean.csv', index=True)
# client_summary.to_csv('data/client_summary.csv', index=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1035735 entries, 2024-01-01 to 2024-12-31
Data columns (total 20 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   client_id          1035735 non-null  int64  
 1   city               1035735 non-null  object 
 2   channel            1035735 non-null  object 
 3   promotor_id        1035735 non-null  int64  
 4   number_of_orders   1035735 non-null  int64  
 5   volume             1035735 non-null  float64
 6   income             1035735 non-null  float64
 7   median_ticket      1035735 non-null  float64
 8   promotor_visits    1035735 non-null  int64  
 9   promotor_calls     1035735 non-null  int64  
 10  order_normalized   1035735 non-null  int64  
 11  order_cost         1035735 non-null  int64  
 12  order_profit       1035735 non-null  float64
 13  efficiency_orders  1035735 non-null  float64
 14  efficiency_profit  1035735 non-null  float64
 15  low_ticket       