In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import sys
import os

# Resolve the sibling ``src`` directory to an absolute path and register it on
# the import search path (only once) so project modules can be imported below.
module_path = os.path.abspath(os.path.join('..', 'src'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_processing import get_data_processing_pipeline

# Load the raw transaction data. The next cell calls fit_transform on `raw_df`,
# so a missing file must stop the run here instead of surfacing later as an
# unrelated NameError.
try:
    raw_df = pd.read_csv('../data/raw/data.csv')
except FileNotFoundError:
    # Keep the human-readable hint, then re-raise so execution does not continue
    # without `raw_df`.
    print("Error: data.csv not found. Make sure it's in the '../data/raw/' directory.")
    raise
else:
    # Only reached when the read succeeded; kept outside the `try` so that
    # errors from the reporting code are not mistaken for a load failure.
    print("Raw data loaded successfully!")
    print(f"Raw data shape: {raw_df.shape}")
    print("Raw data head:")
    print(raw_df.head())

Raw data loaded successfully!
Raw data shape: (95662, 16)
Raw data head:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6

In [2]:
# Build the processing pipeline imported from `data_processing` and fit it on
# the raw frame in one pass; the transformed result is kept for later cells.
feature_pipeline = get_data_processing_pipeline()
processed_data_array = feature_pipeline.fit_transform(raw_df)

# Report the dimensions of the transformed data.
print(f"\nShape of processed data (NumPy array): {processed_data_array.shape}")

# Preview only the leading rows rather than dumping the whole array.
leading_rows = processed_data_array[:5]
print("\nFirst 5 rows of processed data (NumPy array):")
print(leading_rows)


Shape of processed data (NumPy array): (95662, 59)

First 5 rows of processed data (NumPy array):
[[-0.04637113647159194 -0.07229109433980145 -0.34925239001574804 1.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 'TransactionId_76871' 'BatchId_36123' 'AccountId_3957'
  'SubscriptionId_887' 'CustomerId_4406' 'UGX' 256
  Timestamp('2018-11-15 02:18:49+0000', tz='UTC') 0 2 3 11 2018 0]
 [-0.0546432297768206 -0.08025071507026395 -0.34925239001574804 0.0 0.0
  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  1.0 0.0 0.0 0.0 'TransactionId_73770' 'BatchId_15642' 'AccountId_4841'
  'SubscriptionId_3829' 'CustomerId_4406' 'UGX' 256
  Timestamp('2018-11-15 02:19:08+0000', tz='UTC') 0 2 3 11 2018 1]
 [-0.05042608417023344 -0.0763521253247313 -0.34925239001574804 1.0 0.0
  0.0

In [5]:
def get_feature_names(column_transformer):
    """Collect the output column names of a fitted column transformer.

    Walks ``column_transformer.transformers_`` (triples of
    ``(name, estimator, columns)``) in order and builds the list of names
    for the columns each entry contributes to the transformed output.

    Rules applied per entry:

    * An entry whose estimator is the string ``'drop'`` (including a dropped
      remainder) emits no columns, so it contributes no names.
    * The ``'remainder'`` entry contributes its column names unchanged.
    * Any other estimator exposing ``get_feature_names_out`` is asked for its
      output names, given the entry's input column names.
    * Everything else (e.g. ``'passthrough'``) contributes its input column
      names unchanged.

    Column selectors given as positional integers are translated to names
    through ``column_transformer.feature_names_in_`` when that attribute is
    available; otherwise they are returned as-is. A selector given as a single
    bare string is treated as one column, not as a sequence of characters.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing a ``transformers_`` attribute.

    Returns
    -------
    list
        Output feature names, in the order the transformers are listed.
    """
    import numbers  # local import: keeps the helper self-contained

    input_names = getattr(column_transformer, 'feature_names_in_', None)

    def _as_names(columns):
        # A lone string selects a single column; iterating it would yield characters.
        if isinstance(columns, str):
            return [columns]
        names = []
        for col in columns:
            # bool is an Integral subtype but marks a mask entry, not a position.
            is_position = isinstance(col, numbers.Integral) and not isinstance(col, bool)
            if is_position and input_names is not None:
                names.append(input_names[col])
            else:
                names.append(col)
        return names

    output_features = []
    for name, estimator, features in column_transformer.transformers_:
        # Dropped columns never reach the output, so they must not be named.
        if isinstance(estimator, str) and estimator == 'drop':
            continue
        columns = _as_names(features)
        if name != 'remainder' and hasattr(estimator, 'get_feature_names_out'):
            output_features.extend(estimator.get_feature_names_out(columns))
        else:
            output_features.extend(columns)
    return output_features

# The pipeline's 'preprocessor' step; every name lookup below goes through it.
preprocessor_step = feature_pipeline.named_steps['preprocessor']

# Get feature names from numerical and categorical transformers
num_features_out = preprocessor_step.named_transformers_['num'].get_feature_names_out()
cat_features_out = preprocessor_step.named_transformers_['cat'].get_feature_names_out()
original_passthrough_cols = [
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'CurrencyCode', 'CountryCode', 'TransactionStartTime', 'FraudResult'
]
new_derived_features = [
    'transaction_hour', 'transaction_day_of_week', 'transaction_month',
    'transaction_year', 'is_refund'
]
model_relevant_passthrough_cols = [
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'TransactionStartTime', 'FraudResult',
    'transaction_hour', 'transaction_day_of_week', 'transaction_month',
    'transaction_year', 'is_refund'
]
# Columns expected to hold numbers once the array is wrapped in a DataFrame.
numeric_cols_in_final_df = list(num_features_out) + list(cat_features_out) + new_derived_features + ['FraudResult']

# Column labels for the transformed array. This name was previously never
# assigned anywhere in the notebook, so the cell only worked with leftover
# kernel state and failed with NameError after Restart & Run All.
final_feature_names = list(get_feature_names(preprocessor_step))
if len(final_feature_names) != processed_data_array.shape[1]:
    # Fail with an explicit message if the recovered labels do not line up
    # with the array the pipeline produced.
    raise ValueError(
        f"Recovered {len(final_feature_names)} feature names for "
        f"{processed_data_array.shape[1]} processed columns."
    )

processed_df = pd.DataFrame(processed_data_array, columns=final_feature_names)
# The array mixes floats with strings and timestamps, so convert the expected
# numeric columns explicitly; errors='coerce' turns unparseable values into NaN.
for col in numeric_cols_in_final_df:
    if col in processed_df.columns:
        processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')

print("\nProcessed Data (DataFrame with column names):")
print(processed_df.head())
print(f"\nProcessed data shape (DataFrame): {processed_df.shape}")
print("\nProcessed data info:")
processed_df.info()


Processed Data (DataFrame with column names):
     Amount     Value  PricingStrategy  ProductCategory_airtime  \
0 -0.046371 -0.072291        -0.349252                      1.0   
1 -0.054643 -0.080251        -0.349252                      0.0   
2 -0.050426 -0.076352        -0.349252                      1.0   
3  0.107717  0.096648        -0.349252                      0.0   
4 -0.059704 -0.075183        -0.349252                      0.0   

   ProductCategory_data_bundles  ProductCategory_financial_services  \
0                           0.0                                 0.0   
1                           0.0                                 1.0   
2                           0.0                                 0.0   
3                           0.0                                 0.0   
4                           0.0                                 1.0   

   ProductCategory_movies  ProductCategory_other  ProductCategory_ticket  \
0                     0.0                    0.