
# Task 8: Feature Engineering and Model Tuning

## Objective
Improve model performance through feature engineering and hyperparameter tuning, and build a decision tree to detect fraudulent transactions.


In [5]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Silence scikit-learn's UndefinedMetricWarning for the whole session.
# NOTE(review): the classification_report calls in this notebook already pass
# zero_division=0, so this filter mainly affects other metric computations
# (presumably the 'f1' scoring used during the grid search) — confirm it is still wanted.
warnings.simplefilter("ignore", category=UndefinedMetricWarning)


In [6]:

# Read the transactions CSV (path is relative to the working directory)
# and preview its first rows.
DATA_FILE = "fraud_detection.csv"
df = pd.read_csv(DATA_FILE)
df.head()


Unnamed: 0,TransactionID,Amount,Type,IsFraud
0,1,100,credit,0
1,2,200,debit,1
2,3,150,credit,0
3,4,1000,debit,1
4,5,50,credit,0


In [7]:

# Count the nulls in each column before any transformation
null_counts = df.isna().sum()
print("Missing values:\n", null_counts)

# Replace the 'Type' strings with integer codes; in the rows displayed below
# credit is encoded as 0 and debit as 1.
le = LabelEncoder()
le.fit(df['Type'])
df['Type'] = le.transform(df['Type'])
df.head()


Missing values:
 TransactionID    0
Amount           0
Type             0
IsFraud          0
dtype: int64


Unnamed: 0,TransactionID,Amount,Type,IsFraud
0,1,100,0,0
1,2,200,1,1
2,3,150,0,0
3,4,1000,1,1
4,5,50,0,0


In [8]:

# Feature engineering: express each Amount relative to the mean Amount of the
# rows that share its Type.
# NOTE(review): the per-Type means are taken over every row of df, so rows that
# end up in the test split later also contribute to them.
type_avg = df['Amount'].groupby(df['Type']).transform('mean')
df['RelativeAmount'] = df['Amount'].div(type_avg)
df.head()


Unnamed: 0,TransactionID,Amount,Type,IsFraud,RelativeAmount
0,1,100,0,0,0.320513
1,2,200,1,1,0.421941
2,3,150,0,0,0.480769
3,4,1000,1,1,2.109705
4,5,50,0,0,0.160256


In [9]:

# Separate predictors from the target; TransactionID is dropped together with
# the label so it is not used as a feature.
X = df.drop(['TransactionID', 'IsFraud'], axis=1)
y = df['IsFraud']

# Use stratify to ensure both classes are represented in the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# The RelativeAmount column carried over from df was derived from per-Type means
# computed on every row, so held-out rows influenced the training features.
# Rebuild it on both splits from statistics of the training rows only.
train_type_avg = X_train.groupby('Type')['Amount'].mean()
# Fallback denominator for a Type value that never occurs in the training rows.
train_overall_avg = X_train['Amount'].mean()

# .assign returns new frames, leaving df and X untouched; RelativeAmount stays
# the last column, so the feature layout seen by the models is unchanged.
X_train = X_train.assign(
    RelativeAmount=X_train['Amount']
    / X_train['Type'].map(train_type_avg).fillna(train_overall_avg))
X_test = X_test.assign(
    RelativeAmount=X_test['Amount']
    / X_test['Type'].map(train_type_avg).fillna(train_overall_avg))


In [10]:

# Baseline: an untuned decision tree (fixed seed) fitted on the training split
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out split; zero_division=0 reports 0 for
# metrics whose denominator is zero.
y_pred = clf.predict(X_test)
baseline_report = classification_report(y_test, y_pred, zero_division=0)
print(baseline_report)


              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [11]:

# Candidate hyperparameters for the decision tree
param_grid = dict(
    max_depth=[2, 3, 4, 5],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# Exhaustive search over the grid with 3-fold cross-validation on the training
# split, selecting by F1 score.
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=3,
)
grid.fit(X_train, y_train)

# Report the winning combination, then evaluate the refitted best estimator
# on the held-out split.
print("Best Parameters:", grid.best_params_)
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)
tuned_report = classification_report(y_test, y_pred_best, zero_division=0)
print(tuned_report)


Best Parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

