In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import pandas as pd
import numpy as np

# Add the src directory to the Python path (relative to the notebook's working
# directory) so the project-local feature_engineering module, presumably
# located there, can be imported on the next line.
sys.path.append('../src')
from feature_engineering import FeatureEngineering

# Feature Engineering

In [3]:
# Read the raw CSV file from disk into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')

In [4]:
# Work on an independent (deep) copy so that raw_data itself stays untouched
processed_data = raw_data.copy(deep=True)

In [5]:
# Initialize the FeatureEngineering helper with the working copy of the data;
# every feature-engineering step in the cells below is invoked on this object.
data = FeatureEngineering(processed_data)

### Task 1: Create Aggregate Features

- **Total Transaction Amount**: Sum of all transaction amounts for each customer.
- **Average Transaction Amount**: Average transaction amount per customer.
- **Transaction Count**: Number of transactions per customer.
- **Standard Deviation of Transaction Amounts**: Variability of transaction amounts per customer.


In [6]:
# 1. Create aggregate features from the 'CustomerId' and 'Amount' columns.
#    The resulting frame (previewed in the next cell) holds, per CustomerId,
#    the total, average, count and standard deviation of transaction amounts.
agg_features = data.create_aggregate_features('CustomerId', 'Amount')


In [7]:
# Display the first 5 rows of the aggregate features.
# Note: in the output, std_transaction_amount is empty for customers whose
# transaction_count is 1 (first two rows).
agg_features.head()

Unnamed: 0,CustomerId,total_transaction_amount,average_transaction_amount,transaction_count,std_transaction_amount
0,CustomerId_1,-10000.0,-10000.0,1,
1,CustomerId_10,-10000.0,-10000.0,1,
2,CustomerId_1001,20000.0,4000.0,5,6558.963333
3,CustomerId_1002,4225.0,384.090909,11,560.498966
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146


### Task 2: Extract Features

- **Transaction Hour**: The hour of the day when the transaction occurred.
- **Transaction Day**: The day of the month when the transaction occurred.
- **Transaction Month**: The month when the transaction occurred.
- **Transaction Year**: The year when the transaction occurred.


In [8]:

# 2. Extract date/time features from the 'TransactionStartTime' column
#    (assuming it's a datetime column). The processed_data preview further down
#    shows the resulting Transaction_Year/Month/Day/Hour columns.
df_with_extracted_features = data.extract_features('TransactionStartTime')


Date features extracted successfully.


In [11]:
# Preview processed_data. The output includes the Transaction_* columns, so the
# frame handed to FeatureEngineering appears to have been modified in place.
processed_data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,Transaction_Year,Transaction_Month,Transaction_Day,Transaction_Hour
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,2018,11,15,2
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,2018,11,15,2
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,2018,11,15,2
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,2018,11,15,3
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,2018,11,15,3


### Task 3: Encode Categorical Variables

Convert categorical variables into numerical format by using:

- **One-Hot Encoding**: Converts categorical values into binary vectors.
- **Label Encoding**: Assigns a unique integer to each category.

**One-Hot Encoded Columns:**
- **CurrencyCode**
- **ProductCategory**

**Label Encoded Columns:**
- **ProviderId**
- **ProductId**
- **ChannelId**
- **PricingStrategy**


In [12]:

# 3. Encode categorical variables: one-hot encode 'CurrencyCode' and 'ProductCategory'
df_onehot_encoded = data.encode_categorical_variables(['CurrencyCode', 'ProductCategory'], encoding_method='onehot')


In [13]:
# Label-encode the remaining categorical columns (each category becomes an integer)
df_label_encoded = data.encode_categorical_variables(categorical_columns=['ProviderId','ProductId','ChannelId','PricingStrategy'], encoding_method='label')


In [14]:
# Preview the encoded frame: the output shows integer ProviderId/ProductId/ChannelId
# values alongside the ProductCategory_* one-hot columns from the previous step.
df_label_encoded.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CountryCode,ProviderId,ProductId,ChannelId,Amount,...,Transaction_Day,Transaction_Hour,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,256,5,1,2,1000.0,...,15,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,256,3,19,1,-20.0,...,15,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,256,5,0,2,500.0,...,15,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,256,0,11,2,20000.0,...,15,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,256,3,19,1,-644.0,...,15,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Task 4: Handle Missing Values

Use imputation or removal to handle missing values:

- **Imputation**: Filling missing values with mean, median, mode, or using more advanced methods like KNN imputation.
- **Removal**: Removing rows or columns with missing values if they are few.


In [15]:

# 4. Handle missing values in the 'Amount' column using the 'mean' strategy
#    (the recorded run reported no missing values in this column).
df_with_missing_values_handled = data.handle_missing_values(['Amount'], strategy='mean')


No missing values found in the specified columns.


### Task 5: Normalize/Standardize Numerical Features

Normalization and standardization are scaling techniques used to bring all numerical features onto a similar scale.

- **Normalization**: Scales the data to a range of [0, 1].
- **Standardization**: Scales the data to have a mean of 0 and a standard deviation of 1.

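The **Amount** and **Value** columns are scaled into new columns (e.g. `Amount_standardized`, `Value_standardized`), keeping the original columns intact. Note that the cells below currently apply only `method='standardize'`.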


In [None]:

# 5. Standardize the 'Amount' and 'Value' columns. With return_full_df=False the
#    result contains only the scaled columns (see the two-column preview below).
df_Standardized = data.normalize_standardize(['Amount','Value'], method='standardize', return_full_df=False)

In [18]:
# Preview the standardized Amount/Value columns
df_Standardized.head()

Unnamed: 0,Amount_standardized,Value_standardized
0,-0.046371,-0.072291
1,-0.054643,-0.080251
2,-0.050426,-0.076352
3,0.107717,0.096648
4,-0.059704,-0.075183


In [21]:

# 5 (cont.). Scale 'Amount' and 'Value' again, this time keeping the result as a
# full frame (return_full_df is not passed; the preview below shows all columns).
# NOTE(review): the result is named df_normalized, but method='standardize' is
# passed, the same as in the previous call. Confirm whether a normalization
# method was intended here.
# NOTE(review): the recorded output shows Amount_standardized/Value_standardized
# three times, so each call appears to append columns to the shared frame;
# re-running this cell would add further duplicates.
df_normalized = data.normalize_standardize(['Amount','Value'], method='standardize')

In [22]:
# Preview the full frame including the appended *_standardized columns
df_normalized.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CountryCode,ProviderId,ProductId,ChannelId,Amount,...,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,Amount_standardized,Value_standardized,Amount_standardized.1,Value_standardized.1,Amount_standardized.2,Value_standardized.2
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,256,5,1,2,1000.0,...,0.0,0.0,0.0,0.0,-0.046371,-0.072291,-0.046371,-0.072291,-0.046371,-0.072291
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,256,3,19,1,-20.0,...,0.0,0.0,0.0,0.0,-0.054643,-0.080251,-0.054643,-0.080251,-0.054643,-0.080251
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,256,5,0,2,500.0,...,0.0,0.0,0.0,0.0,-0.050426,-0.076352,-0.050426,-0.076352,-0.050426,-0.076352
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,256,0,11,2,20000.0,...,0.0,0.0,0.0,1.0,0.107717,0.096648,0.107717,0.096648,0.107717,0.096648
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,256,3,19,1,-644.0,...,0.0,0.0,0.0,0.0,-0.059704,-0.075183,-0.059704,-0.075183,-0.059704,-0.075183


In [23]:
# List all column names; note the repeated 'Amount_standardized' and
# 'Value_standardized' entries at the end of the output.
df_normalized.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CountryCode', 'ProviderId', 'ProductId', 'ChannelId', 'Amount',
       'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'Transaction_Year', 'Transaction_Month', 'Transaction_Day',
       'Transaction_Hour', 'ProductCategory_data_bundles',
       'ProductCategory_financial_services', 'ProductCategory_movies',
       'ProductCategory_other', 'ProductCategory_ticket',
       'ProductCategory_transport', 'ProductCategory_tv',
       'ProductCategory_utility_bill', 'Amount_standardized',
       'Value_standardized', 'Amount_standardized', 'Value_standardized',
       'Amount_standardized', 'Value_standardized'],
      dtype='object')

## Save the Processed Data for Future Use

In [26]:
# df_normalized carries repeated 'Amount_standardized'/'Value_standardized'
# columns (see the df_normalized.columns output above). Writing them as-is puts
# duplicate headers in the CSV, which pandas renames on read-back (e.g.
# 'Amount_standardized.1'). Keep only the first occurrence of each column name.
processed_output = df_normalized.loc[:, ~df_normalized.columns.duplicated()]

# Persist the de-duplicated frame without the index column
processed_output.to_csv('../data/processed/processed_data.csv', index=False)
print('Data saved successfully')

Data saved successfully
