In [6]:
import pandas as pd

In [8]:
df = pd.read_csv('../data/data.csv')
print(df.head())

         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId   Amount  Value  TransactionStart

In [9]:
# Total Transaction Amount per Customer
df['Total_Transaction_Amount'] = df.groupby('AccountId')['Amount'].transform('sum')

# Average Transaction Amount per Customer
df['Average_Transaction_Amount'] = df.groupby('AccountId')['Amount'].transform('mean')

# Transaction Count per Customer
df['Transaction_Count'] = df.groupby('AccountId')['TransactionId'].transform('count')

# Standard Deviation of Transaction Amounts per Customer
df['Std_Transaction_Amount'] = df.groupby('AccountId')['Amount'].transform('std')

# Replace NaN values generated due to customers with only one transaction
df['Std_Transaction_Amount'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Std_Transaction_Amount'].fillna(0, inplace=True)


In [10]:
# Convert 'TransactionStartTime' to datetime format
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Extract hour, day, month, and year from the transaction start time
df['Transaction_Hour'] = df['TransactionStartTime'].dt.hour
df['Transaction_Day'] = df['TransactionStartTime'].dt.day
df['Transaction_Month'] = df['TransactionStartTime'].dt.month
df['Transaction_Year'] = df['TransactionStartTime'].dt.year


In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Example of Label Encoding
label_encoder = LabelEncoder()

# Encoding 'CountryCode' using Label Encoding
df['CountryCode_Encoded'] = label_encoder.fit_transform(df['CountryCode'])

# Example of One-Hot Encoding
one_hot_encoded_df = pd.get_dummies(df['ChannelId'], prefix='ChannelId')

# Concatenate one-hot encoded columns back to the original dataframe
df = pd.concat([df, one_hot_encoded_df], axis=1)


In [13]:
# Handling missing values with imputation

# Impute missing values in numerical columns with median
df['Amount'].fillna(df['Amount'].median(), inplace=True)

# Impute missing values in categorical columns with mode
df['CurrencyCode'].fillna(df['CurrencyCode'].mode()[0], inplace=True)

# Removing rows with missing values in specific columns (if the number of rows with missing values is small)
df.dropna(subset=['TransactionId', 'AccountId'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Amount'].fillna(df['Amount'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CurrencyCode'].fillna(df['CurrencyCode'].mode()[0], inplace=True)


In [14]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization
scaler = MinMaxScaler()
df['Normalized_Amount'] = scaler.fit_transform(df[['Amount']])

# Standardization
std_scaler = StandardScaler()
df['Standardized_Amount'] = std_scaler.fit_transform(df[['Amount']])


In [36]:
import pandas as pd
import category_encoders as ce

# Assuming df is your original DataFrame
# Separating features and target variable
X = df.drop(columns=['FraudResult'])
y = df['FraudResult']

# Initializing Weight of Evidence transformer
woe_encoder = ce.WOEEncoder(cols=X.columns.tolist())

# Fitting and transforming the features with the target variable
X_woe = woe_encoder.fit_transform(X, y)

# Adding the transformed features to the original dataframe
df_woe = pd.concat([X_woe, y], axis=1)


AttributeError: 'DataFrame' object has no attribute 'unique'

In [33]:
import category_encoders as ce

# Assuming 'ProductCategory' and 'FraudResult' are columns
woe_encoder = ce.WOEEncoder(cols=['ProductCategory'])
woe_encoded_df = woe_encoder.fit_transform(df['ProductCategory'], df['FraudResult'])

# Add WoE transformed features to the original dataframe
df = pd.concat([df, woe_encoded_df], axis=1)