In [31]:
import sys
import os
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Make the sibling project folders importable from this notebook by
# appending their absolute paths to sys.path (scripts first, then data).
for folder in ('../scripts', '../data'):
    sys.path.append(os.path.abspath(folder))

In [32]:
# Load the cleaned transactions with a path relative to the notebook instead
# of a machine-specific absolute path, so the notebook runs on any checkout.
# Assumes the notebook is executed from a folder that sits next to `data/`
# (the same layout the '../data' sys.path entry relies on) — TODO confirm.
data = pd.read_csv(os.path.join('..', 'data', 'cleaned.csv'))

Create Aggregate Features

In [33]:
# Summarise every customer's transactions into one row:
#   - sum, mean and standard deviation of `Amount`
#   - number of `TransactionId` entries
# The (column, aggfunc) tuples are the shorthand form of pd.NamedAgg.
# Customers with a single transaction get NaN for the standard deviation.
customer_features = (
    data
    .groupby('CustomerId')
    .agg(
        total_transaction_amount=('Amount', 'sum'),
        avg_transaction_amount=('Amount', 'mean'),
        transaction_count=('TransactionId', 'count'),
        std_transaction_amount=('Amount', 'std'),
    )
    .reset_index()
)

# Preview the per-customer aggregates
customer_features.head()

Unnamed: 0,CustomerId,total_transaction_amount,avg_transaction_amount,transaction_count,std_transaction_amount
0,CustomerId_1,-10000.0,-10000.0,1,
1,CustomerId_10,-10000.0,-10000.0,1,
2,CustomerId_1001,20000.0,4000.0,5,6558.963333
3,CustomerId_1002,4225.0,384.090909,11,560.498966
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146


Extract Time-Based Features

In [34]:
# Parse the timestamp column once and store the parsed values back on `data`
parsed_times = pd.to_datetime(data['TransactionStartTime'])
data['TransactionStartTime'] = parsed_times

# New column name -> datetime component it is derived from (insertion order
# is the order in which the columns are added to `data`)
time_parts = {
    'transaction_hour': 'hour',
    'transaction_day': 'day',
    'transaction_month': 'month',
    'transaction_year': 'year',
}
for column_name, component in time_parts.items():
    data[column_name] = getattr(parsed_times.dt, component)

# Preview the timestamp next to the derived columns
data[['TransactionStartTime', *time_parts]].head()


Unnamed: 0,TransactionStartTime,transaction_hour,transaction_day,transaction_month,transaction_year
0,2018-11-15 02:18:49+00:00,2,15,11,2018
1,2018-11-15 02:19:08+00:00,2,15,11,2018
2,2018-11-15 02:44:21+00:00,2,15,11,2018
3,2018-11-15 03:32:55+00:00,3,15,11,2018
4,2018-11-15 03:34:21+00:00,3,15,11,2018


Encode Categorical Variables

In [35]:
# Project-local helper module; presumably resolved through the '../scripts'
# entry appended to sys.path in the first cell — TODO confirm its location.
from data_prep import data_pro
# Create the helper with no arguments; the next cell calls methods on it.
data_pro_new = data_pro()

In [36]:
# Ask the project helper for the categorical columns of `data`; judging by the
# recorded output it returns a list of column names.
# NOTE(review): that output includes identifier columns (TransactionId,
# BatchId, AccountId, SubscriptionId, CustomerId) — confirm they should be
# treated as categorical features before encoding them.
categorical_columns = data_pro_new.get_categorical_columns(data)
# Bare expression so the notebook displays the list.
categorical_columns

['TransactionId',
 'BatchId',
 'AccountId',
 'SubscriptionId',
 'CustomerId',
 'CurrencyCode',
 'ProviderId',
 'ProductId',
 'ProductCategory',
 'ChannelId']

Label Encoding

In [37]:
# Map each distinct ProductCategory value to an integer code and store the
# codes in a new column next to the original labels.
le = LabelEncoder()
category_codes = le.fit_transform(data['ProductCategory'])
data['ProductCategory_encoded'] = category_codes

# Show the original labels beside their integer codes
encoding_preview = data[['ProductCategory', 'ProductCategory_encoded']].head()
print(encoding_preview)

      ProductCategory  ProductCategory_encoded
0             airtime                        0
1  financial_services                        2
2             airtime                        0
3        utility_bill                        8
4  financial_services                        2


Normalize/Standardize Numerical Features

In [38]:

# Select numerical features for normalization.
# NOTE(review): FraudResult looks like a flag/label rather than a continuous
# feature — confirm it should be scaled at all.
numerical_columns = ['CountryCode', 'Amount', 'Value', 'PricingStrategy', 'FraudResult']

# Initialize the scaler (MinMaxScaler's default feature_range is (0, 1))
scaler = MinMaxScaler()

# Fit on the selected columns and keep the normalized values in their own
# frame instead of overwriting `data`: the original values stay available, so
# any later scaling step starts from unscaled inputs and this result is not
# lost when those columns are reassigned.
data_normalized = pd.DataFrame(
    scaler.fit_transform(data[numerical_columns]),
    columns=numerical_columns,
    index=data.index,
)

# Display the first few rows of normalized features
data_normalized.head()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult
0,0.0,0.092004,0.000101,0.5,0.0
1,0.0,0.09191,2e-06,0.5,0.0
2,0.0,0.091958,5e-05,0.5,0.0
3,0.0,0.09375,0.002206,0.5,0.0
4,0.0,0.091853,6.5e-05,0.5,0.0


Standardization: scales each numerical feature to have a mean of 0 and a standard deviation of 1.

In [39]:
# Standardize the selected numerical columns in place: learn each column's
# mean and standard deviation, then replace the values currently held in
# `data[numerical_columns]` with their z-scores.
scaler = StandardScaler()
scaler.fit(data[numerical_columns])
standardized_values = scaler.transform(data[numerical_columns])
data[numerical_columns] = standardized_values

# Preview the standardized columns
data[numerical_columns].head()


Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult
0,0.0,-0.046371,-0.072291,-0.349252,-0.044962
1,0.0,-0.054643,-0.080251,-0.349252,-0.044962
2,0.0,-0.050426,-0.076352,-0.349252,-0.044962
3,0.0,0.107717,0.096648,-0.349252,-0.044962
4,0.0,-0.059704,-0.075183,-0.349252,-0.044962
