In [5]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the fraud dataset (the file name suggests it is already encoded and
# labelled). The path is relative, so it resolves against the kernel's
# working directory.
data = pd.read_csv("../data/labeled_data/fraud_encoded_labeled.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,0,0,1,4,1,4.330733
1,1,1,1,2,0,7.170881
2,2,10,1,5,0,7.09091
3,2,8,0,4,0,5.860786
4,2,32,0,3,1,8.071219


In [6]:
def split_and_prepare_data(
    df: DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 1
):
    """Split a labelled DataFrame into train/test features and targets.

    Whole rows are split first so that features and labels stay aligned;
    the target column is then separated from each partition.

    Parameters:
        df (DataFrame): Table holding both the feature columns and the target.
        target_column (str): Name of the column to use as the label.
        test_size (float): Forwarded to ``train_test_split``.
        random_state (int): Forwarded to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``. The ``X_*`` items are
        DataFrames without the target column and with a fresh 0-based index;
        the ``y_*`` items are the target column's ``.values``.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
        ValueError: If ``target_column`` is not one of ``df``'s columns.
    """
    # Guard clauses: fail early with a clear message on bad input.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input data must be a pandas DataFrame.")
    if target_column not in df.columns:
        raise ValueError(
            f"The target column '{target_column}' is not in the DataFrame."
        )

    # Split complete rows, then renumber each partition from zero.
    train_rows, test_rows = (
        part.reset_index(drop=True)
        for part in train_test_split(
            df, test_size=test_size, random_state=random_state
        )
    )

    # Separate labels from features, train partition first, then test.
    features, targets = [], []
    for rows in (train_rows, test_rows):
        targets.append(rows[target_column].values)
        features.append(rows.drop(columns=[target_column]))

    return (*features, *targets)

In [7]:
# Split using the function's defaults (test_size=0.2, random_state=1), with
# `suspected_fraud` as the label column.
X_train, X_test, y_train, y_test = split_and_prepare_data(
    data, target_column="suspected_fraud"
)

In [9]:
# Sanity check: feature/label row counts must match within each partition.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((796, 5), (199, 5), (796,), (199,))

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def train_and_evaluate_decision_tree(X_train, y_train, X_test, y_test, max_depth=3):
    """
    Train a Decision Tree Classifier and print its test-set metrics.

    Parameters:
        X_train (array-like): Training features
        y_train (array-like): Training labels
        X_test (array-like): Testing features
        y_test (array-like): Testing labels
        max_depth (int or None): Maximum depth of the decision tree.
            Defaults to 3 (the value previously hard-coded here).

    Returns:
        DecisionTreeClassifier: The fitted model. Accuracy, precision,
        recall and F1 are printed to stdout, not returned.
    """

    # Fixed seed so repeated runs build the same tree.
    model = DecisionTreeClassifier(
        max_depth=max_depth, min_samples_leaf=1, min_samples_split=2, random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate model. Per-class scores are averaged weighted by class support.
    # zero_division=0 scores a class with no predicted samples as 0 without
    # emitting a warning; it is applied to all three metrics consistently.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")

    return model


# Example usage: fit the tree on the training split and print its test metrics.
# `result` holds the fitted model returned by the function.
result = train_and_evaluate_decision_tree(X_train, y_train, X_test, y_test)

Accuracy: 0.688
Precision: 0.546
Recall: 0.688
F1 Score: 0.590


In [16]:
from sklearn.model_selection import GridSearchCV

# Candidate tree-size settings: 4 x 3 x 3 = 36 combinations.
param_grid = {
    "max_depth": [3, 5, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# 5-fold cross-validated search over the training split, ranked by accuracy.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring="accuracy"
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and show the winning settings.
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search space: split criterion, depth limit, split/leaf minimums and the
# number of features considered per split (2 x 4 x 3 x 3 x 3 = 216 combinations).
param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=[None, "sqrt", "log2"],
)

# 5-fold search scored on F1, run on all available cores; `fit` returns the
# fitted search object itself.
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=clf, param_grid=param_grid, scoring="f1", cv=5, n_jobs=-1
).fit(X_train, y_train)

# Report the winning combination.
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [None]:
# Recorded output of the F1 grid search above:
# Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}

In [24]:
# Winning settings, copied by hand from the F1 grid-search printout.
best_params = dict(
    criterion="entropy",
    max_depth=None,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
)

In [25]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier

# Fit a single tree with the chosen hyperparameters and predict the held-out split.
best_model = DecisionTreeClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Score the predictions; each metric function is called with its default
# arguments, in the order the results should be displayed.
metrics = {
    label: score(y_test, y_pred)
    for label, score in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}

print(metrics)

{'Accuracy': 0.6381909547738693, 'Precision': 0.2857142857142857, 'Recall': 0.17543859649122806, 'F1 Score': 0.21739130434782608}


In [26]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Wider search space than the earlier F1 search: deeper trees and larger
# leaf minimums (2 x 4 x 3 x 4 x 3 = 288 combinations).
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [5, 10, 20, None],
    "max_features": [None, "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 5, 10],
    "min_samples_split": [2, 5, 10],
}

# 5-fold cross-validated search on the training split, ranked by F1.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring="f1"
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# GridSearchCV refits the winning configuration on the full training split
# by default (refit=True), so reuse that estimator instead of constructing
# and fitting an identical tree a second time.
best_model = grid_search.best_estimator_

# Evaluate on the held-out split.
y_pred = best_model.predict(X_test)

metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1 Score": f1_score(y_test, y_pred),
}

print(metrics)

Best Parameters: {'criterion': 'entropy', 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
{'Accuracy': 0.6381909547738693, 'Precision': 0.2857142857142857, 'Recall': 0.17543859649122806, 'F1 Score': 0.21739130434782608}
