## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore') 

In [5]:
# Column dtypes passed to read_csv to keep the in-memory footprint small.
# NOTE(review): float32 carries roughly 7 significant digits, so very large
# amounts/balances may lose sub-unit precision — confirm that is acceptable.
dtypes = {
    'step': 'int16',
    'type': 'category',
    'amount': 'float32',
    'nameOrig': 'object',
    'oldbalanceOrg': 'float32',
    'newbalanceOrig': 'float32',
    'nameDest': 'object',
    'oldbalanceDest': 'float32',
    'newbalanceDest': 'float32',
    'isFraud': 'int8',
    'isFlaggedFraud': 'int8'
}

# Load data.
# The CSV location can be overridden through the FRAUD_CSV_PATH environment
# variable; the original local path stays as the default so existing runs
# behave as before.
import os

DATA_PATH = os.environ.get('FRAUD_CSV_PATH', r'C:\Users\anura\Desktop\Task 3\data\Fraud.csv')

try:
    df = pd.read_csv(DATA_PATH, dtype=dtypes)
except FileNotFoundError:
    print("Error: File not found. Check the path or filename.")
    # Every later cell needs `df`; re-raise so the run stops here with the
    # real cause instead of failing later with a NameError.
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print("Data loaded successfully!")

Data loaded successfully!


In [6]:
# Display basic info
print(f"Shape: {df.shape}\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# Summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values (per-column count of nulls)
print("\nMissing Values:")
print(df.isnull().sum())

# Check class distribution of target variable, as percentages of all rows
print("\nFraud Distribution:")
print(df['isFraud'].value_counts(normalize=True) * 100)

Shape: (6362620, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype   
---  ------          -----   
 0   step            int16   
 1   type            category
 2   amount          float32 
 3   nameOrig        object  
 4   oldbalanceOrg   float32 
 5   newbalanceOrig  float32 
 6   nameDest        object  
 7   oldbalanceDest  float32 
 8   newbalanceDest  float32 
 9   isFraud         int8    
 10  isFlaggedFraud  int8    
dtypes: category(1), float32(5), int16(1), int8(2), object(2)
memory usage: 248.8+ MB
None

Summary Statistics:
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338834e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888242e+06    2.924048e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.

In [None]:
# Count how many rows fall under each value of the 'type' column
print("Transaction Types:")
type_counts = df['type'].value_counts()
print(type_counts)


Transaction Types:
type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64


In [8]:
# Cross-check the two label columns: select rows where isFlaggedFraud is 1
# while isFraud is 0, and report how many there are.
is_flagged = df['isFlaggedFraud'].eq(1)
is_not_fraud = df['isFraud'].eq(0)
flagged_but_not_fraud = df.loc[is_flagged & is_not_fraud]
mismatch_count = len(flagged_but_not_fraud)
print(f"\nTransactions flagged as fraud but not actual fraud: {mismatch_count}")

# Expectation: every flagged row is also a fraud row, i.e. the count is 0.
# A non-zero count would call for a look at the flagging rules (e.g. thresholds).


Transactions flagged as fraud but not actual fraud: 0


In [9]:
# Remove the identifier columns and the redundant flag, none of which are
# used for modeling. errors='ignore' keeps the cell re-runnable after the
# columns are already gone.
unused_columns = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df.drop(unused_columns, axis='columns', inplace=True, errors='ignore')
print("\nColumns after dropping irrelevants:")
print(df.columns)


Columns after dropping irrelevants:
Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'isFraud'],
      dtype='object')


In [None]:
# Report the frame's total memory footprint (deep=True also measures object
# contents), converted from bytes to MB.
print("\nOptimized Memory Usage:")
total_bytes = df.memory_usage(deep=True).sum()
print(total_bytes / 1024**2, "MB")


Optimized Memory Usage:
145.6293716430664 MB


In [13]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# 1. One-hot encode the categorical 'type' column.
# The membership guard lets the cell be re-run after 'type' has already been
# replaced by its indicator columns.
if 'type' in df.columns:
    # drop='first' omits the indicator column of the first category
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    type_encoded = encoder.fit_transform(df[['type']])
    # Take the output names from the encoder itself so they always match the
    # columns it actually produced (yields "type_<category>").
    type_cols = list(encoder.get_feature_names_out(['type']))
    # Reuse df's index: with a fresh default index, the column-wise concat
    # below would misalign rows (and insert NaNs) whenever df's index is not
    # a plain 0..n-1 range.
    type_encoded_df = pd.DataFrame(type_encoded, columns=type_cols, index=df.index)
    df = pd.concat([df, type_encoded_df], axis=1).drop(columns=['type'])

In [14]:
# 2. Derived numeric features: old balance minus new balance, computed
# separately for the origin and destination balance columns.
df['balanceChangeOrg'] = df['oldbalanceOrg'].sub(df['newbalanceOrig'])
df['balanceChangeDest'] = df['oldbalanceDest'].sub(df['newbalanceDest'])

In [15]:
# 3. Swap the skewed 'amount' column for log(1 + amount); log1p maps a zero
# amount to 0 instead of -inf.
log_amount = np.log1p(df['amount'])
df['log_amount'] = log_amount
del df['amount']

In [16]:
# 4. Keep only numeric-dtype columns (the model needs numeric inputs) and
# take a copy so df no longer shares data with the previous frame.
numeric_cols = df.select_dtypes(include='number').columns
df = df.loc[:, numeric_cols].copy()

In [17]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

# 1. Target is the 'isFraud' column; the features are every other column
y = df['isFraud']
X = df.drop('isFraud', axis='columns')

In [18]:
# 2. Guard: every feature column must have a numeric dtype. On failure the
# message lists the offending columns.
dtype_is_numeric = (np.issubdtype(col_dtype, np.number) for col_dtype in X.dtypes)
assert all(dtype_is_numeric), (
    "Non-numeric columns detected: "
    + str(X.select_dtypes(exclude='number').columns.tolist())
)

In [19]:
# 3. Hold out 30% of the rows for testing. Stratifying on y preserves the
# fraud/non-fraud proportion in both parts; the seed fixes the shuffle.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [20]:
# 4. Fit a LightGBM binary classifier; class_weight='balanced' compensates
# for the heavy class imbalance in the target.
lgbm_params = {
    'objective': 'binary',
    'class_weight': 'balanced',
    'n_estimators': 500,
    'learning_rate': 0.05,
    'random_state': 42,
}
model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 5749, number of negative: 4448085
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.087348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 4453834, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [21]:
# 5. Evaluate: per-class precision / recall / F1 on the held-out test set
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906322
           1       0.48      0.99      0.64      2464

    accuracy                           1.00   1908786
   macro avg       0.74      0.99      0.82   1908786
weighted avg       1.00      1.00      1.00   1908786

