In [60]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


In [61]:

# 1. Load data and inspect columns
df = pd.read_csv('../data/raw/data.csv')
print(df.columns)  # <-- Run this cell first to see your real columns
df.head()          # <-- Optional: see a preview


Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [62]:

CUSTOMER_ID_COL = 'CustomerId'  # update as needed
AMOUNT_COL = 'Amount'           # update as needed
DATETIME_COL = 'TransactionStartTime'  # update as needed

# Feature Engineering Functions
def extract_datetime_features(df):
    df = df.copy()
    df['transaction_datetime'] = pd.to_datetime(df[DATETIME_COL])
    df['transaction_hour'] = df['transaction_datetime'].dt.hour
    df['transaction_day'] = df['transaction_datetime'].dt.day
    df['transaction_month'] = df['transaction_datetime'].dt.month
    df['transaction_year'] = df['transaction_datetime'].dt.year
    return df


In [63]:
# df_datetime_features = extract_datetime_features(df)
# print(df_datetime_features.head())

In [64]:

def create_aggregate_features(df):
    agg = df.groupby(CUSTOMER_ID_COL).agg(
        total_transaction_amount=(AMOUNT_COL, 'sum'),
        avg_transaction_amount=(AMOUNT_COL, 'mean'),
        transaction_count=(AMOUNT_COL, 'count'),
        std_transaction_amount=(AMOUNT_COL, 'std')
    ).reset_index()
    return agg


In [65]:
# create_aggregate_features(df)

# create_aggregate_features = create_aggregate_features(df)
# print(create_aggregate_features.head())

In [66]:

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = extract_datetime_features(X)
        agg = create_aggregate_features(X)
        X = X.merge(agg, on=CUSTOMER_ID_COL, how='left')
        return X


In [67]:
# Instantiate the FeatureEngineer and transform the dataframe
# feature_engineer = FeatureEngineer()
# df_features = feature_engineer.transform(df)

# print(df_features.head())

In [68]:

# 3. Define your categorical and numerical columns based on your data
# Exclude ID and datetime columns from categoricals
exclude_cols = [CUSTOMER_ID_COL, DATETIME_COL]
categorical_cols = [
    col for col in df.select_dtypes(include=['object', 'category']).columns
    if col not in exclude_cols
]

numerical_cols = [
    AMOUNT_COL, 'total_transaction_amount', 'avg_transaction_amount',
    'transaction_count', 'std_transaction_amount', 'transaction_hour',
    'transaction_day', 'transaction_month', 'transaction_year'
]

In [69]:
# print("Categorical columns:", categorical_cols)

In [None]:

def build_pipeline(categorical_cols, numerical_cols, scaler_type='standard'):
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler() if scaler_type == 'standard' else MinMaxScaler())
    ])
    preprocessor = ColumnTransformer([
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ])
    pipeline = Pipeline([
        ('feature_engineer', FeatureEngineer()),
        ('preprocessor', preprocessor)
    ])
    return pipeline

# Build and run pipeline
pipeline = build_pipeline(categorical_cols, numerical_cols, scaler_type='standard')
X_processed = pipeline.fit_transform(df)
print("Processed feature shape:", X_processed.shape)


Processed feature shape: (95662, 197783)
