# Task 3 - Feature Engineering

Import the necessary libraries.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer


In [2]:
def load_data2(file_path):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Passed straight through to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``read_csv``'s default options.
    """
    frame = pd.read_csv(file_path)
    return frame

# Load the transactions CSV; the relative path is resolved against the
# notebook's current working directory.
file_path = '../data/data2.csv'
data = load_data2(file_path)

In [3]:
def data2_columnROW(data):
    """Print the number of rows, the number of columns and the dtype of every column.

    Everything is written to stdout; the function returns ``None``.
    """
    num_rows, num_cols = data.shape
    shape_summary = f"Number of rows: {num_rows}\nNumber of columns: {num_cols}\n"

    print(shape_summary)
    # One call with a newline separator emits the same text as three separate prints.
    print("Data types:", data.dtypes, "\n", sep="\n")

# Print the shape and the per-column dtypes of the loaded frame.
data2_columnROW(data)

Number of rows: 95662
Number of columns: 16

Data types:
TransactionId             int64
BatchId                   int64
AccountId                 int64
SubscriptionId            int64
CustomerId                int64
CurrencyCode             object
CountryCode               int64
ProviderId                int64
ProductId                 int64
ProductCategory          object
ChannelId                 int64
Amount                  float64
Value                     int64
TransactionStartTime     object
PricingStrategy           int64
FraudResult               int64
dtype: object




Create aggregate features for each customer (transactions are grouped by `AccountId`)

In [5]:
def create_aggregate_features(data, group_col='AccountId'):
    """Summarise transaction amounts per group (per account by default).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``Amount`` column and the ``group_col`` column.
        The frame is not modified.
    group_col : str, default 'AccountId'
        Column whose values define the groups (e.g. ``'CustomerId'``).

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``group_col``,
        ``TotalTransactionAmount``, ``AverageTransactionAmount``,
        ``TransactionCount`` and ``StdDevTransactionAmount``.
        The standard deviation is pandas' sample standard deviation, so it is
        NaN for groups that have fewer than two valid amounts.
    """
    # Coerce into a scratch frame instead of overwriting the caller's column;
    # unparseable amounts become NaN and are ignored by the aggregations below.
    amounts = data[[group_col]].assign(
        Amount=pd.to_numeric(data['Amount'], errors='coerce')
    )
    aggregate_features = amounts.groupby(group_col).agg(
        TotalTransactionAmount=('Amount', 'sum'),
        AverageTransactionAmount=('Amount', 'mean'),
        TransactionCount=('Amount', 'count'),
        StdDevTransactionAmount=('Amount', 'std')
    ).reset_index()
    return aggregate_features

# Build the per-account summary table and preview the first rows.
# StdDevTransactionAmount is NaN for accounts with a single transaction
# (see TransactionCount == 1 in the output below).
aggregate_features = create_aggregate_features(data)
print("Aggregate Features:")
print(aggregate_features.head())

Aggregate Features:
   AccountId  TotalTransactionAmount  AverageTransactionAmount  \
0          1                 70000.0              23333.333333   
1          2                 70000.0              35000.000000   
2          3                  5000.0               5000.000000   
3          4                 62000.0              62000.000000   
4          5                 40000.0              20000.000000   

   TransactionCount  StdDevTransactionAmount  
0                 3              5773.502692  
1                 2                 0.000000  
2                 1                      NaN  
3                 1                      NaN  
4                 2                 0.000000  


Extract Time Features:

 Extract Transaction Hour, Day, Month, Year

In [6]:

def extract_time_features(data):
    """Return a new frame with parsed timestamps and hour/day/month/year columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``TransactionStartTime`` column that
        ``pandas.to_datetime`` can parse. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``TransactionStartTime`` is replaced by its
        datetime form and ``TransactionHour``, ``TransactionDay``,
        ``TransactionMonth`` and ``TransactionYear`` are appended in that order.
    """
    # Parse once and derive every component from the same Series.
    timestamps = pd.to_datetime(data['TransactionStartTime'])
    # assign() builds a new frame, so re-running the cell never alters its input.
    return data.assign(
        TransactionStartTime=timestamps,
        TransactionHour=timestamps.dt.hour,
        TransactionDay=timestamps.dt.day,
        TransactionMonth=timestamps.dt.month,
        TransactionYear=timestamps.dt.year,
    )

# Add the hour/day/month/year columns, rebind `data` to the returned frame
# and preview the timestamp next to the derived columns.
data = extract_time_features(data)
print("Data with Time Features:")
print(data[['TransactionStartTime', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']].head())

Data with Time Features:
       TransactionStartTime  TransactionHour  TransactionDay  \
0 2018-11-15 02:18:49+00:00                2              15   
1 2018-11-15 02:19:08+00:00                2              15   
2 2018-11-15 02:44:21+00:00                2              15   
3 2018-11-15 03:32:55+00:00                3              15   
4 2018-11-15 03:34:21+00:00                3              15   

   TransactionMonth  TransactionYear  
0                11             2018  
1                11             2018  
2                11             2018  
3                11             2018  
4                11             2018  


Encode Categorical Variables

Identify categorical features

In [9]:
def Identify_categorical_features(data):
    """Print and return the names of the object-dtype columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list of str
        Column names whose dtype is ``object``, in frame order. Returning the
        list lets callers reuse it instead of repeating the dtype selection.
    """
    categorical_features = data.select_dtypes(include=['object']).columns.tolist()
    print(f"Categorical Features: {categorical_features}")
    return categorical_features
 
# Report which columns are still stored as object dtype before any encoding is applied.
Identify_categorical_features(data)

Categorical Features: ['CurrencyCode', 'ProductCategory']


 Encode Categorical Variables: One-Hot Encoding

In [10]:
# Encode Categorical Variables: One-Hot Encoding
def one_hot_encode(data, categorical_features):
    """Expand the listed columns into indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; ``pandas.get_dummies`` returns a new frame, so the
        input is left as it was.
    categorical_features : list of str
        Columns to replace with one indicator column per distinct value.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pandas.get_dummies``.
    """
    return pd.get_dummies(data, columns=categorical_features)

# Collect the object-dtype columns; this list is reused by the label-encoding cell below.
categorical_features = data.select_dtypes(include=['object']).columns.tolist()
# The one-hot result is kept under its own name; `data` itself is not rebound here.
data_one_hot_encoded = one_hot_encode(data, categorical_features)
print("One-Hot Encoded Data:")
print(data_one_hot_encoded.head())

One-Hot Encoded Data:
   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId  CountryCode  \
0          76871    36123       3957             887        4406          256   
1          73770    15642       4841            3829        4406          256   
2          26203    53941       4229             222        4683          256   
3            380   102363        648            2185         988          256   
4          28195    38780       4841            3829         988          256   

   ProviderId  ProductId  ChannelId   Amount  ...  CurrencyCode_UGX  \
0           6         10          3   1000.0  ...              True   
1           4          6          2    -20.0  ...              True   
2           6          1          3    500.0  ...              True   
3           1         21          3  20000.0  ...              True   
4           4          6          2   -644.0  ...              True   

  ProductCategory_airtime  ProductCategory_data_bundles  \
0    

 Encode Categorical Variables: Label Encoding

In [11]:
# Encode Categorical Variables: Label Encoding
def label_encode(data, categorical_features):
    """Return a copy of ``data`` with the listed columns replaced by integer codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; encoding happens on a copy.
    categorical_features : list of str
        Columns to encode. Values are cast to ``str`` before encoding, so a
        missing value is encoded as its own category rather than staying missing.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``categorical_features`` columns hold the codes
        produced by ``LabelEncoder.fit_transform``.
    """
    encoded = data.copy()
    for feature in categorical_features:
        # fit_transform refits on every call, so a fresh encoder per column
        # yields the same codes as reusing a single instance.
        encoded[feature] = LabelEncoder().fit_transform(encoded[feature].astype(str))
    return encoded

data_label_encoded = label_encode(data, categorical_features)
# The cells that follow (imputation, scaling) work on the label-encoded columns.
# Rebind `data` explicitly instead of relying on label_encode to overwrite it
# behind the caller's back.
data = data_label_encoded
print("Label Encoded Data:")
print(data_label_encoded.head())

Label Encoded Data:
   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId  \
0          76871    36123       3957             887        4406   
1          73770    15642       4841            3829        4406   
2          26203    53941       4229             222        4683   
3            380   102363        648            2185         988   
4          28195    38780       4841            3829         988   

   CurrencyCode  CountryCode  ProviderId  ProductId  ProductCategory  \
0             0          256           6         10                0   
1             0          256           4          6                2   
2             0          256           6          1                0   
3             0          256           1         21                8   
4             0          256           4          6                2   

   ChannelId   Amount  Value      TransactionStartTime  PricingStrategy  \
0          3   1000.0   1000 2018-11-15 02:18:49+00:00         

In [12]:
# Preview `data`: CurrencyCode and ProductCategory hold integer label codes at this point.
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,76871,36123,3957,887,4406,0,256,6,10,0,3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,2,15,11,2018
1,73770,15642,4841,3829,4406,0,256,4,6,2,2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,2,15,11,2018
2,26203,53941,4229,222,4683,0,256,6,1,0,3,500.0,500,2018-11-15 02:44:21+00:00,2,0,2,15,11,2018
3,380,102363,648,2185,988,0,256,1,21,8,3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,3,15,11,2018
4,28195,38780,4841,3829,988,0,256,4,6,2,2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,3,15,11,2018


Handle Missing Values


Missing numerical values are filled in by imputation.

In [14]:
def handle_missing_values(data, strategy='mean', columns=None):
    """Impute missing values in numeric columns with ``SimpleImputer`` (in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. The selected columns are overwritten in this object.
    strategy : str, default 'mean'
        Forwarded to ``SimpleImputer(strategy=...)``.
    columns : list of str, optional
        Columns to impute. Defaults to every numeric column of ``data``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    if columns is None:
        numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
    else:
        numerical_features = list(columns)

    # scikit-learn estimators raise on an input with zero feature columns, so a
    # frame without numeric columns is returned unchanged.
    if not numerical_features:
        return data

    imputer = SimpleImputer(strategy=strategy)
    data[numerical_features] = imputer.fit_transform(data[numerical_features])
    return data

# Fill gaps in the numeric columns using the 'mean' strategy and rebind `data` to the result.
data = handle_missing_values(data, strategy='mean')


In [15]:
def identify_missing_values(data):
    """Print the missing-value count of every column that has at least one gap.

    Columns without missing values are omitted from the listing. Output goes
    to stdout; the function returns ``None``.
    """
    null_counts = data.isna().sum()
    columns_with_gaps = null_counts.loc[null_counts > 0]
    # One call with a newline separator emits the same text as three separate prints.
    print("Missing Values:", columns_with_gaps, "\n", sep="\n")
# Check which columns, if any, still contain missing values after imputation.
identify_missing_values(data)

Missing Values:
Series([], dtype: int64)




# Normalize/Standardize Numerical Features
Normalization and standardization are scaling techniques used to bring all numerical features onto a similar scale.


Normalize Features:

In [16]:
def normalize_features(data, columns=None):
    """Rescale numeric columns with ``MinMaxScaler`` (in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scale. The selected columns are overwritten in this object,
        so pass a copy to keep the unscaled values.
    columns : list of str, optional
        Columns to scale. Defaults to every numeric column, which also covers
        identifier and label columns when they are stored as numbers; pass an
        explicit list to leave those untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the selected columns rescaled to the
        scaler's default range.
    """
    if columns is None:
        numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
    else:
        numerical_features = list(columns)

    # scikit-learn estimators raise on an input with zero feature columns, so a
    # frame with nothing to scale is returned unchanged.
    if not numerical_features:
        return data

    scaler = MinMaxScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])
    return data

# Scale a copy so that `data` keeps its unscaled values for the standardization step.
normalized_data = normalize_features(data.copy())
print("Normalized Data:")
print(normalized_data.head())

Normalized Data:
   TransactionId   BatchId  AccountId  SubscriptionId  CustomerId  \
0       0.546417  0.258949   0.817355        0.183078    0.589061   
1       0.524374  0.112122   1.000000        0.791684    0.589061   
2       0.186252  0.386684   0.873554        0.045511    0.626103   
3       0.002694  0.733818   0.133678        0.451593    0.131987   
4       0.200412  0.277996   1.000000        0.791684    0.131987   

   CurrencyCode  CountryCode  ProviderId  ProductId  ProductCategory  \
0           0.0          0.0         1.0   0.346154             0.00   
1           0.0          0.0         0.6   0.192308             0.25   
2           0.0          0.0         1.0   0.000000             0.00   
3           0.0          0.0         0.0   0.769231             1.00   
4           0.0          0.0         0.6   0.192308             0.25   

   ChannelId    Amount     Value      TransactionStartTime  PricingStrategy  \
0       0.50  0.092004  0.000101 2018-11-15 02:18:49+00:

Standardize Features:

In [17]:
def standardize_features(data, columns=None):
    """Standardize numeric columns with ``StandardScaler`` (in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scale. The selected columns are overwritten in this object,
        so pass a copy to keep the unscaled values.
    columns : list of str, optional
        Columns to standardize. Defaults to every numeric column, which also
        covers identifier and label columns when they are stored as numbers;
        pass an explicit list to leave those untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the selected columns replaced by the
        output of ``StandardScaler.fit_transform``.
    """
    if columns is None:
        numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
    else:
        numerical_features = list(columns)

    # scikit-learn estimators raise on an input with zero feature columns, so a
    # frame with nothing to scale is returned unchanged.
    if not numerical_features:
        return data

    scaler = StandardScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])
    return data

# Standardize a copy; `data` itself is left unscaled.
standardized_data = standardize_features(data.copy())
print("Standardized Data:")
print(standardized_data.head())

Standardized Data:
   TransactionId   BatchId  AccountId  SubscriptionId  CustomerId  \
0       0.160893 -0.847664   0.381677       -1.650828    0.774769   
1       0.084563 -1.352296   0.948332        0.622801    0.774769   
2      -1.086272 -0.408646   0.556032       -2.164752    0.942550   
3      -1.721890  0.784424  -1.739433       -0.647711   -1.295536   
4      -1.037240 -0.782198   0.948332        0.622801   -1.295536   

   CurrencyCode  CountryCode  ProviderId  ProductId  ProductCategory  \
0           0.0          0.0    1.025848   0.554336        -0.799047   
1           0.0          0.0   -0.502027  -0.304790         0.491064   
2           0.0          0.0    1.025848  -1.378697        -0.799047   
3           0.0          0.0   -2.793841   2.916933         4.361398   
4           0.0          0.0   -0.502027  -0.304790         0.491064   

   ChannelId    Amount     Value      TransactionStartTime  PricingStrategy  \
0   0.676000 -0.046371 -0.072291 2018-11-15 02:18:49+0