In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Read the raw transaction records into a DataFrame.
# Point the path below at your own copy of the CSV if it lives elsewhere.
data = pd.read_csv(filepath_or_buffer="../src/data/data.csv")



In [7]:
# Parse the raw TransactionStartTime values into pandas datetimes so that
# timestamp arithmetic and the .dt accessor can be used on this column.
data['TransactionStartTime'] = data['TransactionStartTime'].pipe(pd.to_datetime)


In [8]:
# Feature 1: RFMS Features (Recency, Frequency, Monetary, Stability)
# One row per CustomerId:
#   Recency   - whole days between the newest transaction in the dataset and
#               the customer's own newest transaction
#   Frequency - number of transactions (non-null TransactionId count)
#   Monetary  - sum of Amount
#   Stability - sample standard deviation of Amount; this is NaN for a customer
#               with a single transaction (those NaNs are imputed later on)

# The reference timestamp is the same for every customer, so compute it once
# instead of re-evaluating it inside a per-group lambda.
snapshot_time = data['TransactionStartTime'].max()

# Named aggregation yields flat, already-labelled columns and lets pandas use
# its built-in reducers rather than calling back into Python for each group.
rfms_data = (
    data.groupby('CustomerId')
    .agg(
        Recency=('TransactionStartTime', 'max'),  # last-seen timestamp for now
        Frequency=('TransactionId', 'count'),
        Monetary=('Amount', 'sum'),
        Stability=('Amount', 'std'),
    )
    .reset_index()
)

# Turn each customer's last-seen timestamp into a day count (vectorised).
rfms_data['Recency'] = (snapshot_time - rfms_data['Recency']).dt.days



In [9]:
# Flatten the multi-level column names produced by the aggregation into
# plain feature names (positional: key column first, then the four metrics).
rfms_data.columns = ['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Stability']

# Merge RFMS features back onto every transaction row of the original dataset.
# Any RFMS columns left on `data` by a previous run of this cell are dropped
# first; otherwise a re-run would create suffixed duplicates
# (Recency_x / Recency_y, ...) and break the cells that expect the plain names.
rfms_feature_columns = [col for col in rfms_data.columns if col != 'CustomerId']
data = (
    data.drop(columns=rfms_feature_columns, errors='ignore')
    # validate= makes pandas raise if rfms_data ever holds more than one row
    # per CustomerId, which would silently duplicate transaction rows.
    .merge(rfms_data, on='CustomerId', how='left', validate='many_to_one')
)


In [10]:
# Feature 2: Time-Based Features
# Each new column is one calendar component of TransactionStartTime, read
# through the .dt accessor; columns are added in the order listed here.
for feature_name, dt_component in (
    ('TransactionHour', 'hour'),
    ('TransactionDay', 'day'),
    ('TransactionMonth', 'month'),
    ('TransactionYear', 'year'),
):
    data[feature_name] = getattr(data['TransactionStartTime'].dt, dt_component)

In [11]:
# Feature 3: Aggregate Features
# Per-customer statistics broadcast back to every transaction row via
# transform(), so each result has the same length and index as `data`.
customer_groups = data.groupby('CustomerId')
customer_amounts = customer_groups['Amount']

# Sum of Amount over all of the customer's transactions.
data['TotalTransactionAmount'] = customer_amounts.transform('sum')

# Mean Amount over all of the customer's transactions.
data['AverageTransactionAmount'] = customer_amounts.transform('mean')

# Number of (non-null) TransactionId values recorded for the customer.
data['TransactionCount'] = customer_groups['TransactionId'].transform('count')

In [12]:
# Feature 4: Handle Missing Values
# Report the number of nulls in every column before anything is filled in.
missing_per_column = data.isna().sum()
print("Missing Values Before Imputation:")
print(missing_per_column)

# Numerical columns to impute; each NaN is replaced by the mean of its column.
numerical_features = [
    'Amount',
    'Value',
    'Recency',
    'Frequency',
    'Monetary',
    'Stability',
]
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(data[numerical_features])
data[numerical_features] = imputed_values

Missing Values Before Imputation:
TransactionId                 0
BatchId                       0
AccountId                     0
SubscriptionId                0
CustomerId                    0
CurrencyCode                  0
CountryCode                   0
ProviderId                    0
ProductId                     0
ProductCategory               0
ChannelId                     0
Amount                        0
Value                         0
TransactionStartTime          0
PricingStrategy               0
FraudResult                   0
Recency                       0
Frequency                     0
Monetary                      0
Stability                   712
TransactionHour               0
TransactionDay                0
TransactionMonth              0
TransactionYear               0
TotalTransactionAmount        0
AverageTransactionAmount      0
TransactionCount              0
dtype: int64


In [13]:
# Feature 5: Encode Categorical Variables
# Columns that get replaced by one-hot indicator columns.
categorical_features = ['ProductCategory', 'ChannelId', 'ProviderId']

# drop_first=True omits the indicator for the first level of each feature,
# so a feature with k levels produces k-1 indicator columns.
data = pd.get_dummies(
    data=data,
    columns=categorical_features,
    drop_first=True,
)

In [15]:
# Feature 6: Normalize/Standardize Numerical Features
# Fit a StandardScaler on the numerical feature columns and overwrite those
# columns with the standardized values.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = standardized_values

# Preview the first rows of the final, fully engineered dataset.
print("\nFinal Dataset with Engineered Features:")
data.head()


Final Dataset with Engineered Features:


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProductId,Amount,Value,...,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProductId_10,-0.046371,-0.072291,...,False,False,False,True,False,False,False,False,False,True
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProductId_6,-0.054643,-0.080251,...,False,False,True,False,False,False,False,True,False,False
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProductId_1,-0.050426,-0.076352,...,False,False,False,True,False,False,False,False,False,True
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProductId_21,0.107717,0.096648,...,False,True,False,True,False,False,False,False,False,False
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProductId_6,-0.059704,-0.075183,...,False,False,True,False,False,False,False,True,False,False


In [16]:

# Persist the engineered dataset as a new CSV file; index=False keeps the
# DataFrame's row index out of the written file.
data.to_csv(
    path_or_buf="../src/data/engineered_transactions.csv",
    index=False,
)