In [96]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [97]:
df = pd.read_csv("data.csv")

### Encoding the categorical variables 

In [98]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_cols = ["gender", "location", "payment_status", "product_category", "merchant_name"]

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [99]:
df

Unnamed: 0,gender,age,location,purchase_amount,down_payment,installment_amount,installments,payment_status,product_category,merchant_name,customer_income,credit_score,late_payments,fraud_flag
0,1,33,6,1462.06,167.00,177.87,1,0,4,7,2554.81,371,2,0
1,2,28,2,842.93,72.07,117.19,2,0,4,0,4656.38,411,1,0
2,2,45,6,781.59,244.76,210.26,8,2,1,6,5317.14,799,0,0
3,1,63,5,1007.49,159.57,268.36,1,0,4,3,4290.49,754,1,0
4,0,51,4,1915.33,362.62,30.25,11,0,5,5,2233.52,453,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1,32,1,1768.58,425.50,174.31,7,1,2,5,7473.97,741,0,0
19996,0,30,0,946.71,196.16,237.81,7,2,3,4,7064.35,469,1,0
19997,2,23,1,1112.73,146.51,53.47,9,0,5,5,5562.88,332,1,1
19998,1,40,5,1055.47,359.62,105.07,8,0,1,7,6865.42,376,0,0


### Handelling data imbalance with oversampling:
Undersampling did not provide enough records for the model to learn effectively. Since the correlation between features is relatively low, oversampling was better suited for this dataset.

In [100]:
df_majority = df[df['fraud_flag'] == 0] 
df_minority = df[df['fraud_flag'] == 1] 

In [101]:
df_minority_oversampled = resample(
    df_minority, 
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

df_balanced = pd.concat([df_majority, df_minority_oversampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['fraud_flag'].value_counts())


fraud_flag
1    19044
0    19044
Name: count, dtype: int64


In [102]:
X = df_balanced.drop(columns= "fraud_flag")
y = df_balanced["fraud_flag"]

### Train Test Split

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Scaling the Data

In [104]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression

In [105]:
model = LogisticRegression(max_iter=1000)  # increase iterations to avoid convergence warnings
model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [106]:
y_pred = model.predict(X_test_scaled)

In [107]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5057757941716986

Confusion Matrix:
 [[1896 1913]
 [1852 1957]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.50      0.50      3809
           1       0.51      0.51      0.51      3809

    accuracy                           0.51      7618
   macro avg       0.51      0.51      0.51      7618
weighted avg       0.51      0.51      0.51      7618



### The above model performed very poorly on the data which might be possible due to very less correlation in among the features

# Decision Tree

In [109]:
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
dt_model.fit(X_train_scaled, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [110]:
y_pred = dt_model.predict(X_test)



In [111]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5093200315043318

Confusion Matrix:
 [[ 962 2847]
 [ 891 2918]]

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.25      0.34      3809
           1       0.51      0.77      0.61      3809

    accuracy                           0.51      7618
   macro avg       0.51      0.51      0.47      7618
weighted avg       0.51      0.51      0.47      7618



### Trying hyperparameter tuning by grid search cv

In [112]:
dt = DecisionTreeClassifier(random_state=42)
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"]
}

grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)

best_dt = grid_search.best_estimator_

Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}


In [113]:
y_pred = best_dt.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97      3809
           1       0.94      1.00      0.97      3809

    accuracy                           0.97      7618
   macro avg       0.97      0.97      0.97      7618
weighted avg       0.97      0.97      0.97      7618

