The aim of this notebook is to take the raw data and transform it into a clean dataset for modelling.

In [1]:
import pandas as pd

In [2]:
# Parse the workbook once and take all three sheets from that single read,
# rather than re-opening and re-parsing the same file for every sheet.
DATA_PATH = 'data/data_task.xlsx'
SHEET_NAMES = ['order_numbers', 'transaction_data', 'reported_data']

# With a list passed as sheet_name, read_excel returns a dict keyed by sheet name.
sheets = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAMES)
order_numbers = sheets['order_numbers']
transaction_data = sheets['transaction_data']
reported_data = sheets['reported_data']

# Clean the order numbers DataFrame
- sort the DataFrame by date
- check that the order numbers increase from one date to the next (flag each row with a boolean column)

In [3]:
order_numbers.shape

(856, 2)

In [4]:
order_numbers.head()

Unnamed: 0,date,order_number
0,2018-01-07,33841906
1,2018-01-22,34008921
2,2018-01-25,34397468
3,2018-02-06,34434432
4,2018-02-08,34579365


In [5]:
order_numbers.dtypes

date            datetime64[ns]
order_number             int64
dtype: object

In [6]:
def validate_order(df, date_col, order_col):
    """Sort ``df`` by date and flag rows whose order number rose versus the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified (sorting returns a new frame).
    date_col : str
        Name of the column to sort by.
    order_col : str
        Name of the numeric column expected to increase over time.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy with a fresh 0..n-1 index and an extra boolean
        column ``order_increasing``. The first row is always ``True`` because
        it has no predecessor. An empty input yields an empty frame that still
        carries the ``order_increasing`` column.

    Notes
    -----
    Each row is compared with the row immediately before it, not with the
    highest value seen so far. NOTE(review): a single spuriously high value is
    therefore flagged ``True`` and pushes the ``False`` flag onto the valid row
    that follows it - confirm this is the intended rule.
    """
    # A stable sort keeps rows that share a date in their original relative
    # order, so the row-over-row comparison below is reproducible.
    ordered = df.sort_values(by=date_col, kind="stable").reset_index(drop=True)

    # diff() is NaN on the first row (and wherever a value is missing);
    # NaN > 0 evaluates to False.
    increasing = ordered[order_col].diff().gt(0)

    # The first row has nothing to compare against, so treat it as valid.
    # Guarded so an empty frame is not enlarged with a fabricated row.
    if not increasing.empty:
        increasing.iloc[0] = True

    ordered["order_increasing"] = increasing
    return ordered

In [7]:
# Sort the raw order numbers by date and flag each row that rose versus the previous one
orders1 = validate_order(df=order_numbers, date_col='date', order_col='order_number')
orders1

Unnamed: 0,date,order_number,order_increasing
0,2018-01-07,33841906,True
1,2018-01-22,34008921,True
2,2018-01-25,34397468,True
3,2018-02-06,34434432,True
4,2018-02-08,34579365,True
...,...,...,...
851,2022-12-26,89053562,True
852,2022-12-27,89078365,True
853,2022-12-28,89109007,True
854,2022-12-29,89139373,True


In [8]:
orders1.value_counts('order_increasing')

order_increasing
True     638
False    218
Name: count, dtype: int64

In [9]:
# Keep only the rows flagged as increasing; the boolean column is used directly as the mask
orders2 = orders1.loc[orders1['order_increasing']]
orders2.value_counts('order_increasing')

order_increasing
True    638
Name: count, dtype: int64

In [10]:
orders2.head()

Unnamed: 0,date,order_number,order_increasing
0,2018-01-07,33841906,True
1,2018-01-22,34008921,True
2,2018-01-25,34397468,True
3,2018-02-06,34434432,True
4,2018-02-08,34579365,True


In [11]:
# The flag is constant after filtering, so remove it and keep date + order_number
orders3 = orders2.drop(columns='order_increasing')
orders3.head()

Unnamed: 0,date,order_number
0,2018-01-07,33841906
1,2018-01-22,34008921
2,2018-01-25,34397468
3,2018-02-06,34434432
4,2018-02-08,34579365


In [12]:
# Aggregate the validated order numbers to one row per calendar quarter.
# resample() needs a DatetimeIndex, so 'date' becomes the index first.
orders3 = orders3.set_index("date")

# "QE" (quarter end) is the replacement for the deprecated "Q" alias, which
# emits a FutureWarning; "QE" requires pandas >= 2.2.
# Each quarter's value is the mean of the order numbers observed in it.
orders_quarterly = (
    orders3.resample("QE")
    .mean()
    .reset_index()
    # Label each row with its quarter (e.g. 2018Q1) to serve as the join key,
    # then drop the quarter-end timestamp it was derived from.
    .assign(quarter=lambda q: q["date"].dt.to_period("Q").astype(str))
    .drop(columns=["date"])
)

orders_quarterly.head()

  orders_quarterly = orders3.resample("Q").mean().reset_index()


Unnamed: 0,order_number,quarter
0,35029340.0,2018Q1
1,36884330.0,2018Q2
2,38704010.0,2018Q3
3,40960090.0,2018Q4
4,43275190.0,2019Q1


In [13]:
orders_quarterly.shape

(20, 2)

# Transaction data

In [14]:
transaction_data.shape

(1826, 4)

In [15]:
transaction_data.dtypes

date                         datetime64[ns]
total_spend_index                   float64
gross_orders_index                  float64
weekly_active_users_index           float64
dtype: object

In [16]:
transaction_data.head()

Unnamed: 0,date,total_spend_index,gross_orders_index,weekly_active_users_index
0,2018-01-01,0.052537,0.158983,1.563652
1,2018-01-02,0.839176,1.430843,1.628314
2,2018-01-03,0.182855,0.476948,1.63374
3,2018-01-04,2.364675,1.90779,1.657706
4,2018-01-05,0.687484,1.27186,1.679863


In [17]:
def date_sorting(df, date_col):
    """Return a copy of ``df`` ordered by ``date_col`` with a fresh 0..n-1 index.

    The input frame is left untouched; the previous index is discarded.
    """
    chronological = df.sort_values(by=date_col)
    return chronological.reset_index(drop=True)

In [18]:
# Put the daily transaction indices in chronological order
transaction1 = date_sorting(df=transaction_data, date_col='date')
transaction1.head()

Unnamed: 0,date,total_spend_index,gross_orders_index,weekly_active_users_index
0,2018-01-01,0.052537,0.158983,1.563652
1,2018-01-02,0.839176,1.430843,1.628314
2,2018-01-03,0.182855,0.476948,1.63374
3,2018-01-04,2.364675,1.90779,1.657706
4,2018-01-05,0.687484,1.27186,1.679863


In [19]:

# resample() needs a DatetimeIndex, so 'date' becomes the index first.
transaction1 = transaction1.set_index("date")

# Average every index column within each calendar quarter.
# "QE" (quarter end) is the replacement for the deprecated "Q" alias, which
# emits a FutureWarning; "QE" requires pandas >= 2.2.
transaction_quarterly = (
    transaction1.resample("QE")
    .mean()
    .reset_index()
    # Label each row with its quarter (e.g. 2018Q1) to serve as the join key,
    # then drop the quarter-end timestamp it was derived from.
    .assign(quarter=lambda q: q["date"].dt.to_period("Q").astype(str))
    .drop(columns=["date"])
)

transaction_quarterly.head()

  transaction_quarterly = transaction1.resample("Q").mean().reset_index()


Unnamed: 0,total_spend_index,gross_orders_index,weekly_active_users_index,quarter
0,1.020328,1.231231,2.21238,2018Q1
1,3.907097,3.719492,5.080265,2018Q2
2,9.063225,9.020529,12.989319,2018Q3
3,15.179657,16.395936,19.686191,2018Q4
4,13.63627,14.917859,25.002324,2019Q1


In [20]:
transaction_quarterly.shape

(20, 4)

# Join the order number data onto the transaction data (left join, to keep every transaction quarter)

In [21]:
# Left join on the quarter label: every transaction quarter is kept, and
# quarters with no matching order data get NaN in order_number.
joined_df = transaction_quarterly.merge(orders_quarterly, on='quarter', how='left')
joined_df.head(20)

Unnamed: 0,total_spend_index,gross_orders_index,weekly_active_users_index,quarter,order_number
0,1.020328,1.231231,2.21238,2018Q1,35029340.0
1,3.907097,3.719492,5.080265,2018Q2,36884330.0
2,9.063225,9.020529,12.989319,2018Q3,38704010.0
3,15.179657,16.395936,19.686191,2018Q4,40960090.0
4,13.63627,14.917859,25.002324,2019Q1,43275190.0
5,23.909396,23.957442,32.189396,2019Q2,45503790.0
6,27.290086,28.153729,41.530273,2019Q3,47747900.0
7,45.263284,46.241446,54.220241,2019Q4,50498200.0
8,42.695804,51.010675,73.601916,2020Q1,53042860.0
9,82.694949,95.808802,99.068715,2020Q2,56199640.0


# Feature Engineering

In [22]:
# Spend per user: the spend index divided by the weekly-active-users index
joined_df["spend_per_user"] = joined_df["total_spend_index"].div(
    joined_df["weekly_active_users_index"]
)

In [26]:
# Lag features: each row receives the value from the row directly above it,
# so the first row has no predecessor and is left as NaN.
joined_df["prev_spend_per_user"] = joined_df["spend_per_user"].shift(periods=1)
joined_df["prev_order_volume"] = joined_df["order_number"].shift(periods=1)

In [27]:
joined_df.head()

Unnamed: 0,total_spend_index,gross_orders_index,weekly_active_users_index,quarter,order_number,spend_per_user,prev_spend_per_user,prev_order_volume
0,1.020328,1.231231,2.21238,2018Q1,35029340.0,0.46119,,
1,3.907097,3.719492,5.080265,2018Q2,36884330.0,0.769073,0.46119,35029340.0
2,9.063225,9.020529,12.989319,2018Q3,38704010.0,0.697744,0.769073,36884330.0
3,15.179657,16.395936,19.686191,2018Q4,40960090.0,0.771081,0.697744,38704010.0
4,13.63627,14.917859,25.002324,2019Q1,43275190.0,0.5454,0.771081,40960090.0
