In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [3]:
import sys, os

sys.path.append(os.path.abspath(os.path.join("../scripts")))

In [4]:
root_path = os.path.abspath("..")
if root_path not in sys.path:
    sys.path.insert(0, root_path)

from data_preprocessing import DataPreprocessing

In [5]:
my_data = DataPreprocessing()

In [6]:
my_data.read_data("../data/processed_data/synthetic_product_listings_gpt_4o_mini.csv")

Unnamed: 0,product_id,category,brand,condition,price,seller_reputation,description,suspected_fraud,log_price
0,2,Clothing,Adidas,used,75.00,4,Brand new Adidas running shorts,True,4.330733
1,3,Electronics,Apple,used,1299.99,2,Brand new Apple iPhone 14 Pro with 256GB storage,False,7.170881
2,4,Furniture,Joybird,used,1200.00,5,Stylish Joybird mid-century modern sofa with p...,False,7.090910
3,5,Furniture,IKEA,New,350.00,4,Brand new IKEA Hemnes dresser with 6 drawers.,False,5.860786
4,6,Furniture,West Elm,New,3200.00,3,Brand new West Elm mid-century dining table in...,True,8.071219
...,...,...,...,...,...,...,...,...,...
990,996,Furniture,West Elm,used,1750.00,3,Brand new West Elm mid-century style dining ta...,False,7.467942
991,997,Electronics,LG,New,1200.00,1,Brand new LG OLED55CXPUA Alexa Built-In OLED TV,False,7.090910
992,998,Clothing,Adidas,New,75.00,5,"Brand new Adidas Ultraboost running shoes, per...",True,4.330733
993,999,Electronics,Dell,New,1200.00,3,Brand new Dell XPS 13 laptop with 16GB RAM and...,False,7.090910


In [7]:
my_data.data.columns

Index(['product_id', 'category', 'brand', 'condition', 'price',
       'seller_reputation', 'description', 'suspected_fraud', 'log_price'],
      dtype='object')

In [8]:
my_data.drop_columns(columns=["product_id", "price", "description"])

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,Clothing,Adidas,used,4,True,4.330733
1,Electronics,Apple,used,2,False,7.170881
2,Furniture,Joybird,used,5,False,7.090910
3,Furniture,IKEA,New,4,False,5.860786
4,Furniture,West Elm,New,3,True,8.071219
...,...,...,...,...,...,...
990,Furniture,West Elm,used,3,False,7.467942
991,Electronics,LG,New,1,False,7.090910
992,Clothing,Adidas,New,5,True,4.330733
993,Electronics,Dell,New,3,False,7.090910


In [9]:
my_data.data.head()

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,Clothing,Adidas,used,4,True,4.330733
1,Electronics,Apple,used,2,False,7.170881
2,Furniture,Joybird,used,5,False,7.09091
3,Furniture,IKEA,New,4,False,5.860786
4,Furniture,West Elm,New,3,True,8.071219


In [10]:
col = ["category", "brand", "condition", "suspected_fraud"]

In [11]:
my_data.label_encoding(col)

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,0,0,1,4,True,4.330733
1,1,1,1,2,False,7.170881
2,2,10,1,5,False,7.090910
3,2,8,0,4,False,5.860786
4,2,32,0,3,True,8.071219
...,...,...,...,...,...,...
990,2,32,1,3,False,7.467942
991,1,12,0,1,False,7.090910
992,0,0,0,5,True,4.330733
993,1,5,0,3,False,7.090910


In [15]:
col = ["suspected_fraud"]
my_data.label_encoding(col)

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,0,0,1,4,True,4.330733
1,1,1,1,2,False,7.170881
2,2,10,1,5,False,7.090910
3,2,8,0,4,False,5.860786
4,2,32,0,3,True,8.071219
...,...,...,...,...,...,...
990,2,32,1,3,False,7.467942
991,1,12,0,1,False,7.090910
992,0,0,0,5,True,4.330733
993,1,5,0,3,False,7.090910


In [17]:
data = my_data.data

In [None]:
fraud_map = {True: 1, False: 0}
for col in data.columns:
    if col == "suspected_fraud":
        data[col] = data[col].map(fraud_map)

In [19]:
data.head()

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,0,0,1,4,1,4.330733
1,1,1,1,2,0,7.170881
2,2,10,1,5,0,7.09091
3,2,8,0,4,0,5.860786
4,2,32,0,3,1,8.071219


In [20]:
data.to_csv("../data/labeled_data/fraud_encoded_labeled.csv", index=False)

In [4]:
data = pd.read_csv("../data/labeled_data/fraud_encoded_labeled.csv")
data.head()

Unnamed: 0,category,brand,condition,seller_reputation,suspected_fraud,log_price
0,0,0,1,4,1,4.330733
1,1,1,1,2,0,7.170881
2,2,10,1,5,0,7.09091
3,2,8,0,4,0,5.860786
4,2,32,0,3,1,8.071219


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Assume 'Class' column contains 0 (legit) and 1 (fraud), and other columns are features
X = data.drop(columns=["suspected_fraud"])
y = data["suspected_fraud"]


# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features (important for logistic regression)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6985
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
import numpy as np

fraud_ratio = np.sum(y) / len(y)
print(f"Fraud cases: {np.sum(y)}, Legit cases: {len(y) - np.sum(y)}")
print(f"Fraud ratio: {fraud_ratio:.4f}")

Fraud cases: 299, Legit cases: 696
Fraud ratio: 0.3005


In [7]:
import numpy as np

y_pred = model.predict(X_test)
print("Unique predictions:", np.unique(y_pred, return_counts=True))

Unique predictions: (array([0]), array([199]))


In [9]:
y_prob = model.predict_proba(X_test)[:, 1]  # Get fraud probabilities
threshold = 0.3  # Instead of default 0.5
y_pred = (y_prob > threshold).astype(int)
print("Unique predictions with threshold:", np.unique(y_pred, return_counts=True))

Unique predictions with threshold: (array([0, 1]), array([109,  90]))


In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [20]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5427
Precision: 0.2817
Recall: 0.3333
F1 Score: 0.3053
Confusion Matrix:
[[88 51]
 [40 20]]


In [21]:
from xgboost import XGBClassifier

model = XGBClassifier(scale_pos_weight=2, eval_metric="logloss", random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)


# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.5427
Precision: 0.2817
Recall: 0.3333
F1 Score: 0.3053
ROC-AUC Score: 0.4832
Confusion Matrix:
[[88 51]
 [40 20]]


In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)
from imblearn.over_sampling import SMOTE

In [6]:
# Assuming X, y are already defined (feature matrix and target variable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
catboost_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=3,
    loss_function="Logloss",
    eval_metric="F1",
    random_seed=42,
    verbose=100,
)

catboost_model.fit(
    X_train_resampled,
    y_train_resampled,
    eval_set=(X_test, y_test),
    early_stopping_rounds=50,
)

0:	learn: 0.6499596	test: 0.4157303	best: 0.4157303 (0)	total: 68.6ms	remaining: 34.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4197530864
bestIteration = 12

Shrink model to first 13 iterations.


<catboost.core.CatBoostClassifier at 0x282369d50>

In [8]:
y_pred = catboost_model.predict(X_test)

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, catboost_model.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.5276
Precision: 0.3333
Recall: 0.5667
F1 Score: 0.4198
ROC-AUC Score: 0.5380
Confusion Matrix:
[[71 68]
 [26 34]]


In [9]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Parameter grid for CatBoost
param_dist = {
    "iterations": [100, 200, 500, 1000],  # Number of trees
    "learning_rate": [0.01, 0.05, 0.1, 0.3],  # Step size at each iteration
    "depth": [4, 6, 8, 10],  # Depth of trees
    "l2_leaf_reg": [1, 3, 5, 10],  # Regularization parameter
    "border_count": [32, 64, 128],  # Number of splits for numerical features
    "bagging_temperature": [0, 0.2, 0.5, 1],  # Controls bagging
}

# Initialize CatBoostClassifier
catboost_model = CatBoostClassifier(silent=True)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    catboost_model,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings to sample
    cv=3,  # Cross-validation folds
    verbose=3,
    random_state=42,
    n_jobs=-1,
)

# Fit RandomizedSearchCV with your data (X_train, y_train)
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best hyperparameters found: ", best_params)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 3/3] END bagging_temperature=0.2, border_count=128, depth=4, iterations=100, l2_leaf_reg=10, learning_rate=0.1;, score=0.698 total time=   0.1s
[CV 2/3] END bagging_temperature=0.2, border_count=128, depth=4, iterations=100, l2_leaf_reg=10, learning_rate=0.1;, score=0.683 total time=   0.1s
[CV 1/3] END bagging_temperature=0.2, border_count=128, depth=4, iterations=100, l2_leaf_reg=10, learning_rate=0.1;, score=0.699 total time=   0.1s
[CV 1/3] END bagging_temperature=0.2, border_count=64, depth=6, iterations=100, l2_leaf_reg=3, learning_rate=0.3;, score=0.654 total time=   0.0s
[CV 2/3] END bagging_temperature=0.2, border_count=64, depth=6, iterations=100, l2_leaf_reg=3, learning_rate=0.3;, score=0.657 total time=   0.0s
[CV 3/3] END bagging_temperature=0.2, border_count=32, depth=6, iterations=200, l2_leaf_reg=10, learning_rate=0.01;, score=0.698 total time=   0.1s
[CV 2/3] END bagging_temperature=0.2, border_count=32, 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
catboost_model = CatBoostClassifier(
    learning_rate=0.01,
    l2_leaf_reg=10,
    iterations=200,
    depth=6,
    border_count=32,
    bagging_temperature=0.2,
    class_weights=[1, 5],
)