## IMPORT LIBRARIES

In [2]:
import pandas as pd
from datetime import datetime

## DATA PREPARATION

### Read CSV

In [3]:
dfs=pd.read_csv("dataset/user_seller.csv")
dfb=pd.read_csv("dataset/user_buyer.csv")
dfo=pd.read_csv("dataset/order.csv")
dfod=pd.read_csv("dataset/order_details.csv")

  dfo=pd.read_csv("dataset/order.csv")


### dfo. Initial Order Table

In [4]:
dfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159133 entries, 0 to 159132
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   po_number            159133 non-null  object 
 1   transaction_date     159133 non-null  object 
 2   order_address_id     159133 non-null  int64  
 3   order_status         159133 non-null  object 
 4   payment_group        159133 non-null  object 
 5   payment_method       159133 non-null  object 
 6   shipping_agency      159133 non-null  object 
 7   shipping_cost        159133 non-null  int64  
 8   total_project_value  159133 non-null  int64  
 9   voucher_val          68399 non-null   float64
 10  voucher_code         5160 non-null    object 
 11  revenue              159133 non-null  int64  
 12  seller_id            159127 non-null  object 
 13  seller_category      159127 non-null  object 
 14  buyer_id             159133 non-null  object 
dtypes: float64(1), in

In [5]:
# Check unique values
field_1 = ['po_number', 'order_address_id', 'seller_id', 'buyer_id']
dfo[field_1].nunique()

po_number           159114
order_address_id    159112
seller_id             6661
buyer_id             12810
dtype: int64

### dfo1. Add column 'is_voucher_used'

In [6]:
dfo1 = dfo.copy()
dfo1['is_voucher_used'] = dfo1['voucher_code'].notnull()
dfo1[dfo1['is_voucher_used'] == False][['voucher_code', 'is_voucher_used']]

Unnamed: 0,voucher_code,is_voucher_used
0,,False
1,,False
2,,False
3,,False
4,,False
...,...,...
159128,,False
159129,,False
159130,,False
159131,,False


In [7]:
dfo1[dfo1['is_voucher_used'] == True][['voucher_code', 'is_voucher_used']]

Unnamed: 0,voucher_code,is_voucher_used
106932,ICIPJUNI,True
106950,ICIPJUNI,True
106951,ICIPJUNI,True
106952,RICIPJUNI-yXr51p73,True
106953,RICIPJUNI-GeWdvv64,True
...,...,...
159041,SALEBRASI25,True
159045,SALEBRASI25,True
159074,BRIBP227,True
159113,BRIBP227,True


In [8]:
# Count the number of null values in 'voucher_code'
null_voucher_count = dfo1['voucher_code'].isnull().sum()

# Count the number of False values in 'is_voucher_used'
false_is_voucher_count = (dfo1['is_voucher_used'] == False).sum()

# Check if the counts are equal
if null_voucher_count == false_is_voucher_count:
    print("The number of null values in 'voucher_code' is equal to the number of False values in 'is_voucher_used'.")
else:
    print("The counts do not match.")

The number of null values in 'voucher_code' is equal to the number of False values in 'is_voucher_used'.


In [9]:
dfo1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159133 entries, 0 to 159132
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   po_number            159133 non-null  object 
 1   transaction_date     159133 non-null  object 
 2   order_address_id     159133 non-null  int64  
 3   order_status         159133 non-null  object 
 4   payment_group        159133 non-null  object 
 5   payment_method       159133 non-null  object 
 6   shipping_agency      159133 non-null  object 
 7   shipping_cost        159133 non-null  int64  
 8   total_project_value  159133 non-null  int64  
 9   voucher_val          68399 non-null   float64
 10  voucher_code         5160 non-null    object 
 11  revenue              159133 non-null  int64  
 12  seller_id            159127 non-null  object 
 13  seller_category      159127 non-null  object 
 14  buyer_id             159133 non-null  object 
 15  is_voucher_used  

### dfo2. Left Inner Join Order with Buyer

In [27]:
dfo2 = dfo1.copy()
dfo2 = pd.merge(dfo1, dfb, left_on='buyer_id', right_on='uid', how='left')

dfo2.rename(columns={'province': 'buyer_prov', 'city': 'buyer_city'}, inplace=True)
dfo2 = dfo2.drop(columns=['flag'])

dfo2.info()                 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159233 entries, 0 to 159232
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   po_number            159233 non-null  object 
 1   transaction_date     159233 non-null  object 
 2   order_address_id     159233 non-null  int64  
 3   order_status         159233 non-null  object 
 4   payment_group        159233 non-null  object 
 5   payment_method       159233 non-null  object 
 6   shipping_agency      159233 non-null  object 
 7   shipping_cost        159233 non-null  int64  
 8   total_project_value  159233 non-null  int64  
 9   voucher_val          68426 non-null   float64
 10  voucher_code         5160 non-null    object 
 11  revenue              159233 non-null  int64  
 12  seller_id            159227 non-null  object 
 13  seller_category      159227 non-null  object 
 14  buyer_id             159233 non-null  object 
 15  is_voucher_used  

In [28]:
dfo2['seller_category'].value_counts()

seller_category
Menengah    62198
Kecil       51528
Mikro       44341
Besar        1094
Swasta         66
Name: count, dtype: int64

### dfos. Group by seller_id (Variable Selection)

In [26]:
# Convert 'transaction_date' column to datetime
dfo2['transaction_date'] = pd.to_datetime(dfo2['transaction_date'])

today = pd.to_datetime('today')
today = today.replace(year=2024, month=1, day=1)

In [20]:
# Define aggregation functions for each column
aggregations = {
    'transaction_date': 'max',  # latest transaction date
    'order_address_id': 'count',  # order frequency
    'total_project_value': 'sum',  # GMV
    'revenue': 'sum',  # revenue
    'is_voucher_used': 'sum',  # number of voucher used
    'voucher_val': 'sum',  # sum of voucher value
    'buyer_id': 'nunique',  # count unique buyer_id
    'buyer_prov': 'nunique',  # count unique buyer_province
    'buyer_city': 'nunique',  # count unique buyer_city
    'seller_category': 'min'
}

# Group by 'seller_id' and apply the aggregation functions
dfos = dfo2.groupby('seller_id').agg(aggregations)

# Calculate recency_day
dfos['recency_day'] = (today - dfos['transaction_date']).dt.days

# Calculate aov
dfos['aov'] = dfos['total_project_value'] / dfos['order_address_id']

# Rearrange columns as per your specified order
column_order = ['transaction_date', 'recency_day', 'order_address_id', 'total_project_value', 
                'aov', 'revenue', 'is_voucher_used', 'voucher_val', 'buyer_id', 
                'buyer_prov', 'buyer_city', 'seller_category']

dfos = dfos[column_order]

# Rename the columns
dfos.columns = ['last_transaction', 'recency_day', 'order_freq', 'gmv', 
                'aov', 'revenue', 'voucher_used', 'voucher_val', 'buyer_count', 
                'num_of_buyer_province', 'num_of_buyer_city', 'seller_category']

# Reset index to make 'seller_id' a regular column
dfos.reset_index(inplace=True)

dfos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6661 entries, 0 to 6660
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   seller_id              6661 non-null   object        
 1   last_transaction       6661 non-null   datetime64[ns]
 2   recency_day            6661 non-null   int64         
 3   order_freq             6661 non-null   int64         
 4   gmv                    6661 non-null   int64         
 5   aov                    6661 non-null   float64       
 6   revenue                6661 non-null   int64         
 7   voucher_used           6661 non-null   int64         
 8   voucher_val            6661 non-null   float64       
 9   buyer_count            6661 non-null   int64         
 10  num_of_buyer_province  6661 non-null   int64         
 11  num_of_buyer_city      6661 non-null   int64         
 12  seller_category        6661 non-null   object        
dtypes: 

In [21]:
dfos.head(25)

Unnamed: 0,seller_id,last_transaction,recency_day,order_freq,gmv,aov,revenue,voucher_used,voucher_val,buyer_count,num_of_buyer_province,num_of_buyer_city,seller_category
0,631a50645b9755003d24d6b9,2023-12-08,24,3,205201171,68400390.0,1539008,0,0.0,1,1,1,Kecil
1,631a50655b9755003d24d78a,2023-12-13,19,5,1651459110,330291800.0,8257295,0,0.0,1,1,1,Kecil
2,631a50655b9755003d24d7fe,2023-10-26,67,1,105450000,105450000.0,790875,0,0.0,1,1,1,Kecil
3,631a50675b9755003d24da41,2023-12-22,10,3,275646350,91882120.0,2067346,0,0.0,1,1,1,Menengah
4,631a50685b9755003d24dd24,2023-12-27,5,1,36874200,36874200.0,276556,0,0.0,1,1,1,Menengah
5,631a50695b9755003d24ddb4,2023-11-15,47,2,125058150,62529080.0,937935,0,0.0,2,2,2,Kecil
6,631a506a5b9755003d24dfdb,2023-04-19,257,1,68376000,68376000.0,512820,0,0.0,1,1,1,Kecil
7,631a506a5b9755003d24e040,2023-11-03,59,1,2220000,2220000.0,16650,0,0.0,1,1,1,Menengah
8,631a506a5b9755003d24e06b,2023-12-05,27,5,394444200,78888840.0,2958330,0,0.0,2,1,1,Kecil
9,631a506a5b9755003d24e06d,2023-12-28,4,34,2500908662,73556140.0,18756807,0,0.0,14,10,12,Kecil


In [29]:
dfos['seller_category'].value_counts()

seller_category
Mikro       2893
Kecil       2116
Menengah    1562
Besar         58
Swasta        32
Name: count, dtype: int64

### dfos_c. Save to pickles

In [30]:
# Create a copy of dfos with 'seller_id', 'last_transaction', & 'seller_category' columns removed
dfos_c = dfos.drop(columns=['seller_id', 'last_transaction', 'seller_category'])

# Save dfos_c as a pickle file
dfos_c.to_pickle('pickles/dfos_c.pkl')

# Optionally, you can also save dfos with the 'seller_id' column intact
dfos.to_pickle('pickles/dfos.pkl')