In [6]:
# Import Libraries and Load Raw Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import sys
import os

# Expose the sibling ``src`` directory to the import system so that the
# project-local ``data_processing`` module can be imported. Re-running the
# cell does not add the same entry twice.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_processing import get_data_processing_pipeline, generate_proxy_target

# Load the raw transaction data from the path relative to this notebook.
# Every later cell needs raw_df, so a missing file is reported with the
# expected location and then re-raised: continuing would either fail later
# with an unrelated NameError or silently reuse a stale raw_df left in the
# kernel from a previous run.
try:
    raw_df = pd.read_csv('../data/raw/data.csv')
except FileNotFoundError:
    print("Error: data.csv not found. Make sure it's in the '../data/raw/' directory.")
    raise
else:
    print("Raw data loaded successfully!")
    print(f"Raw data shape: {raw_df.shape}")
    print("Raw data head:")
    print(raw_df.head())

Raw data loaded successfully!
Raw data shape: (95662, 16)
Raw data head:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6

In [7]:
# Build the project's feature-engineering pipeline and fit it on the raw data
# in one pass; the fitted pipeline is kept for inspection further down.
feature_pipeline = get_data_processing_pipeline()
processed_data_array = feature_pipeline.fit_transform(raw_df)

# Quick look at the transformed output: its dimensions, then its leading rows.
print(f"\nShape of processed data (NumPy array): {processed_data_array.shape}")
print("\nFirst 5 rows of processed data (NumPy array):", processed_data_array[:5], sep="\n")


Shape of processed data (NumPy array): (95662, 59)

First 5 rows of processed data (NumPy array):
[[-0.04637113647159194 -0.07229109433980145 -0.34925239001574804 1.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 'TransactionId_76871' 'BatchId_36123' 'AccountId_3957'
  'SubscriptionId_887' 'CustomerId_4406' 'UGX' 256
  Timestamp('2018-11-15 02:18:49+0000', tz='UTC') 0 2 3 11 2018 0]
 [-0.0546432297768206 -0.08025071507026395 -0.34925239001574804 0.0 0.0
  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  1.0 0.0 0.0 0.0 'TransactionId_73770' 'BatchId_15642' 'AccountId_4841'
  'SubscriptionId_3829' 'CustomerId_4406' 'UGX' 256
  Timestamp('2018-11-15 02:19:08+0000', tz='UTC') 0 2 3 11 2018 1]
 [-0.05042608417023344 -0.0763521253247313 -0.34925239001574804 1.0 0.0
  0.0

In [10]:
def get_feature_names_from_preprocessor(column_transformer):
    """Collect output column names from a fitted ColumnTransformer-like object.

    Walks ``column_transformer.transformers_`` (tuples of
    ``(name, estimator, columns)``) in order and builds a flat list of
    output names:

    * entries whose estimator is the string ``'drop'`` (including a dropped
      remainder) produce no output columns and are skipped;
    * non-remainder estimators exposing ``get_feature_names_out`` are asked
      for their names, given the entry's input columns;
    * everything else (e.g. ``'passthrough'`` or the remainder entry)
      contributes its input columns unchanged. Integer column positions are
      translated to names through ``column_transformer.feature_names_in_``
      when that attribute exists; otherwise they are returned as-is.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing a ``transformers_`` attribute.

    Returns
    -------
    list
        Output feature names, in transformer order.
    """
    import numbers  # local import keeps the helper self-contained in this cell

    input_names = getattr(column_transformer, 'feature_names_in_', None)

    def _as_names(columns):
        # A lone string is a single column, not an iterable of characters.
        if isinstance(columns, str):
            return [columns]
        if input_names is None:
            return list(columns)
        # Positional selections are mapped back to the fitted input names.
        return [
            input_names[col]
            if isinstance(col, numbers.Integral) and not isinstance(col, bool)
            else col
            for col in columns
        ]

    output_features = []
    for name, estimator, features in column_transformer.transformers_:
        # Dropped columns never reach the output, so they must not be named.
        if isinstance(estimator, str) and estimator == 'drop':
            continue
        if name != 'remainder' and hasattr(estimator, 'get_feature_names_out'):
            output_features.extend(estimator.get_feature_names_out(features))
        else:
            output_features.extend(_as_names(features))
    return output_features

# Pull the fitted column-wise preprocessor out of the pipeline and ask its
# 'num' and 'cat' sub-transformers for the names of the columns they emit.
preprocessor_step = feature_pipeline.named_steps['preprocessor']
fitted_transformers = preprocessor_step.named_transformers_
num_features_out = fitted_transformers['num'].get_feature_names_out()
cat_features_out = fitted_transformers['cat'].get_feature_names_out()

# Replay the two steps that run before the preprocessor on a copy of the raw
# data, to learn which columns the preprocessor actually receives.
df_before_preprocessor = raw_df.copy()
for step_name in ('datetime_extractor', 'amount_handler'):
    df_before_preprocessor = feature_pipeline.named_steps[step_name].transform(df_before_preprocessor)

# Input columns consumed by the 'num' / 'cat' transformers; everything else is
# treated as passed through untouched.
# NOTE(review): this list is hard-coded here and must mirror the column lists
# inside get_data_processing_pipeline -- confirm against that module.
processed_by_preprocessor = [
    'Amount', 'Value', 'PricingStrategy',
    'ProductCategory', 'ChannelId', 'ProviderId', 'ProductId',
]

passthrough_cols_final = list(
    filter(lambda col: col not in processed_by_preprocessor, df_before_preprocessor.columns)
)

# Assumes the output column order is: numeric, then categorical, then the
# remaining columns -- TODO confirm against the pipeline definition.
final_feature_names = [*num_features_out, *cat_features_out, *passthrough_cols_final]

print("\n--- Final Feature Names (for DataFrame reconstruction) ---")
print(final_feature_names)
print(f"Number of final features: {len(final_feature_names)}")

# Columns that should be numeric once the mixed-type array is wrapped in a
# DataFrame: the numeric and encoded outputs plus a set of named columns.
# Names that do not exist in the frame are skipped below.
numeric_cols_to_convert = list(num_features_out) + \
                          list(cat_features_out) + \
                          ['transaction_hour', 'transaction_day_of_week', 'transaction_month',
                           'transaction_year', 'is_refund', 'FraudResult']

# Fail fast when the reconstructed names do not line up with the array.
# Swallowing the error here would leave processed_df undefined (or, worse,
# stale from an earlier run) for the merge cell that depends on it.
n_output_cols = processed_data_array.shape[1]
if len(final_feature_names) != n_output_cols:
    raise ValueError(
        "Could not reconstruct DataFrame with all column names: "
        f"{len(final_feature_names)} feature names were derived but the processed "
        f"array has {n_output_cols} columns."
    )

processed_df = pd.DataFrame(processed_data_array, columns=final_feature_names)

# Values that cannot be parsed as numbers become NaN (errors='coerce').
for col in numeric_cols_to_convert:
    if col in processed_df.columns:
        processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')

print("\nProcessed Data (DataFrame with column names):")
print(processed_df.head())
print(f"\nProcessed data shape (DataFrame): {processed_df.shape}")
print("\nProcessed data info:")
processed_df.info()



--- Final Feature Names (for DataFrame reconstruction) ---
['Amount', 'Value', 'PricingStrategy', 'ProductCategory_airtime', 'ProductCategory_data_bundles', 'ProductCategory_financial_services', 'ProductCategory_movies', 'ProductCategory_other', 'ProductCategory_ticket', 'ProductCategory_transport', 'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_ChannelId_1', 'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3', 'ChannelId_ChannelId_5', 'ProviderId_ProviderId_1', 'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6', 'ProductId_ProductId_1', 'ProductId_ProductId_10', 'ProductId_ProductId_11', 'ProductId_ProductId_12', 'ProductId_ProductId_13', 'ProductId_ProductId_14', 'ProductId_ProductId_15', 'ProductId_ProductId_16', 'ProductId_ProductId_19', 'ProductId_ProductId_2', 'ProductId_ProductId_20', 'ProductId_ProductId_21', 'ProductId_ProductId_22', 'ProductId_ProductId_23', 'ProductId_ProductId

In [11]:
# Generate and inspect the proxy target variable.
# The guard stops the cell outright: printing a hint and carrying on would
# only run into the very NameError it is meant to explain.
if 'raw_df' not in locals():
    raise NameError("raw_df not found. Please run Cell 1 first to load raw data.")

# A copy is passed so the project-local helper cannot modify raw_df.
proxy_target_df = generate_proxy_target(raw_df.copy())

print("\n--- Proxy Target Variable (is_high_risk) ---")
print(proxy_target_df.head())

# Class balance of the proxy label: absolute counts, then percentages.
print("\n--- Distribution of is_high_risk Proxy Target ---")
print(proxy_target_df['is_high_risk'].value_counts())
print(proxy_target_df['is_high_risk'].value_counts(normalize=True) * 100)



RFM Cluster Means (Scaled):
         Recency_Scaled  Frequency_Scaled  Monetary_Scaled
Cluster                                                   
0              0.443276          0.004167         0.002551
1              0.080559          0.008996         0.002899
2              0.808647          0.001454         0.001040

Identified high-risk cluster (based on Risk_Score heuristic): 2

--- Proxy Target Variable (is_high_risk) ---
                 is_high_risk
CustomerId                   
CustomerId_1                1
CustomerId_10               1
CustomerId_1001             1
CustomerId_1002             0
CustomerId_1003             0

--- Distribution of is_high_risk Proxy Target ---
is_high_risk
0    2971
1     771
Name: count, dtype: int64
is_high_risk
0    79.396045
1    20.603955
Name: proportion, dtype: float64


In [12]:
# Merge the proxy target into the processed data.
# Both inputs come from earlier cells; stop with a clear message if either is
# missing instead of printing a hint and failing a few lines later.
if 'processed_df' not in locals():
    raise NameError("processed_df not found. Please run Cell 3 first.")

if 'proxy_target_df' not in locals():
    raise NameError("proxy_target_df not found. Please run Cell 4 first.")

# Turn the index into a regular column so 'CustomerId' can be used as the
# join key below.
proxy_target_df_reset = proxy_target_df.reset_index()

# Left join keeps every processed row. validate='many_to_one' makes pandas
# raise if a CustomerId occurs more than once in the target table, which
# would otherwise silently duplicate rows in the result.
final_processed_df = pd.merge(
    processed_df,
    proxy_target_df_reset[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    how='left',
    validate='many_to_one',
)

print("\n--- Final Processed DataFrame with 'is_high_risk' Target ---")
print(final_processed_df.head())
print(f"\nFinal Processed data shape: {final_processed_df.shape}")
print("\nFinal Processed data info:")
final_processed_df.info()

# Label distribution at row level (after the join), counts then percentages.
print("\n--- Distribution of 'is_high_risk' in Final Processed Data ---")
print(final_processed_df['is_high_risk'].value_counts())
print(final_processed_df['is_high_risk'].value_counts(normalize=True) * 100)

# Persist the merged dataset as CSV. The target directory is created first so
# a fresh checkout without '../data/processed' does not fail the save.
# Only file-system errors are reported and tolerated here; any other exception
# indicates a bug and is allowed to propagate.
processed_output_path = '../data/processed/final_processed_data.csv'
try:
    os.makedirs(os.path.dirname(processed_output_path), exist_ok=True)
    final_processed_df.to_csv(processed_output_path, index=False)
    print(f"\nFinal processed data saved to: {processed_output_path}")
except OSError as e:
    print(f"\nError saving processed data: {e}")



--- Final Processed DataFrame with 'is_high_risk' Target ---
     Amount     Value  PricingStrategy  ProductCategory_airtime  \
0 -0.046371 -0.072291        -0.349252                      1.0   
1 -0.054643 -0.080251        -0.349252                      0.0   
2 -0.050426 -0.076352        -0.349252                      1.0   
3  0.107717  0.096648        -0.349252                      0.0   
4 -0.059704 -0.075183        -0.349252                      0.0   

   ProductCategory_data_bundles  ProductCategory_financial_services  \
0                           0.0                                 0.0   
1                           0.0                                 1.0   
2                           0.0                                 0.0   
3                           0.0                                 0.0   
4                           0.0                                 1.0   

   ProductCategory_movies  ProductCategory_other  ProductCategory_ticket  \
0                     0.0       