In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier

## Prediction with an XGBoost model

In [None]:
# Read the turtle dataset into `df`; the modelling cell below uses `df` as its
# input frame. The path is relative to the notebook's working directory.
df = pd.read_csv("data/turtles_cleaned.csv")

In [None]:


# ------------------------------------------------------------
# 1. Define features and target
# ------------------------------------------------------------
# `df` is the frame loaded in the previous cell; nothing is read from disk here.
target = "CaptureSiteCategory"
X = df.drop(columns=[target])
y = df[target]
# NOTE: X still contains "CaptureSite" at this point. The leakage section
# further down in this notebook examines that column, so treat the score
# printed by this cell as a baseline to be questioned, not a final result.

# ------------------------------------------------------------
# 2. Encode target labels into integers
# ------------------------------------------------------------
# XGBClassifier is given integer class ids; `le` is kept so predictions can be
# mapped back to the original label strings.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ------------------------------------------------------------
# 3. Identify numeric & categorical columns
# ------------------------------------------------------------
# "number" matches every numeric width, and every remaining column is treated
# as categorical. Listing exact dtypes ("float64", "int64", "object") would let
# a column of any other dtype (e.g. int32, category, a dedicated string dtype)
# fall out of both lists, and ColumnTransformer drops unlisted columns by
# default.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

# ------------------------------------------------------------
# 4. Preprocessing pipelines
# ------------------------------------------------------------
# Numeric columns: fill missing values with the column mean.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Categorical columns: fill missing values with the mode, then one-hot encode.
# handle_unknown="ignore" lets predict() accept categories unseen during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# ------------------------------------------------------------
# 5. XGBoost model inside a pipeline
# ------------------------------------------------------------
model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

# Keeping preprocessing inside the pipeline means the imputers and the encoder
# are fitted on the training split only.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# ------------------------------------------------------------
# 6. Train-test split
# ------------------------------------------------------------
# Stratify so each class keeps its share of rows in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# ------------------------------------------------------------
# 7. Train model
# ------------------------------------------------------------
clf.fit(X_train, y_train)

# ------------------------------------------------------------
# 8. Predictions & evaluation
# ------------------------------------------------------------
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# ------------------------------------------------------------
# 9. Example prediction on new data
# ------------------------------------------------------------
# A single hand-written row using the same feature column names as X.
example = pd.DataFrame([{
    "CaptureSite": "CaptureSite_0",
    "ForagingGround": "Ocean",
    "Species": "Species_6",
    "CCL_cm": 60.0,
    "CCW_cm": 58.0,
    "Weight_Kg": 25.0,
    "CalendarWeek": 32
}])

pred_encoded = clf.predict(example)
# Map the integer class id back to its original label string.
pred_label = le.inverse_transform(pred_encoded)

print("Predicted CaptureSiteCategory:", pred_label[0])


Accuracy: 0.9994464433988375

Classification Report:

                       precision    recall  f1-score   support

CaptureSiteCategory_0       1.00      1.00      1.00      1694
CaptureSiteCategory_1       1.00      1.00      1.00       463
CaptureSiteCategory_2       1.00      1.00      1.00       852
CaptureSiteCategory_3       1.00      1.00      1.00        15
CaptureSiteCategory_4       1.00      1.00      1.00       589

             accuracy                           1.00      3613
            macro avg       1.00      1.00      1.00      3613
         weighted avg       1.00      1.00      1.00      3613

Predicted CaptureSiteCategory: CaptureSiteCategory_2


# Feature leakage evaluation 

In [15]:
# Count the distinct values of every other column within each
# CaptureSiteCategory.
# NOTE(review): this shows how many sites each category contains, not whether a
# site pins down its category. To confirm the leak directly, check the reverse
# mapping, e.g.
#   df.groupby("CaptureSite")["CaptureSiteCategory"].nunique().max() == 1
df.groupby("CaptureSiteCategory").nunique()


Unnamed: 0_level_0,CaptureSite,ForagingGround,Species,CCL_cm,CCW_cm,Weight_Kg,CalendarWeek
CaptureSiteCategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CaptureSiteCategory_0,8,2,8,880,819,1414,53
CaptureSiteCategory_1,3,2,4,642,609,833,53
CaptureSiteCategory_2,10,2,5,755,733,1092,53
CaptureSiteCategory_3,1,2,3,70,70,43,41
CaptureSiteCategory_4,7,2,3,598,563,805,53


# Removing CaptureSite to reduce feature leakage

In [None]:


# ------------------------------------------------------------
# 1. Load the dataset
# ------------------------------------------------------------
# Re-read the CSV so this cell starts from the file contents regardless of what
# earlier cells did to `df`.
df = pd.read_csv("data/turtles_cleaned.csv")

# ------------------------------------------------------------
# 2. Define target and remove ONLY CaptureSite
# ------------------------------------------------------------
target = "CaptureSiteCategory"

# CaptureSite is dropped together with the target; every other column stays.
X = df.drop(columns=["CaptureSite", target])
y = df[target]

# ------------------------------------------------------------
# 3. Encode the target
# ------------------------------------------------------------
# `le` is kept so class ids can be reported under their original label strings.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ------------------------------------------------------------
# 4. Detect numeric and categorical columns
# ------------------------------------------------------------
# "number" matches every numeric width, and every remaining column is treated
# as categorical. Listing exact dtypes ("float64", "int64", "object") would let
# a column of any other dtype (e.g. int32, category, a dedicated string dtype)
# fall out of both lists, and ColumnTransformer drops unlisted columns by
# default.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

# ------------------------------------------------------------
# 5. Preprocessing for numeric + categorical features
# ------------------------------------------------------------
# Numeric columns: fill missing values with the column mean.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Categorical columns: fill missing values with the mode, then one-hot encode.
# handle_unknown="ignore" lets predict() accept categories unseen during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# ------------------------------------------------------------
# 6. XGBoost model
# ------------------------------------------------------------
model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.08,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

# Keeping preprocessing inside the pipeline means the imputers and the encoder
# are fitted on the training split only.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# ------------------------------------------------------------
# 7. Train-test split
# ------------------------------------------------------------
# Stratify so each class keeps its share of rows in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# ------------------------------------------------------------
# 8. Train model
# ------------------------------------------------------------
clf.fit(X_train, y_train)

# ------------------------------------------------------------
# 9. Evaluate
# ------------------------------------------------------------
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
# A class that is never predicted has undefined precision. zero_division=0
# reports it as 0 (the same number the default prints) without emitting an
# UndefinedMetricWarning.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=le.classes_,
        zero_division=0,
    )
)


Accuracy: 0.6983116523664544

Classification Report:

                       precision    recall  f1-score   support

CaptureSiteCategory_0       0.74      0.98      0.84      1694
CaptureSiteCategory_1       0.50      0.39      0.44       463
CaptureSiteCategory_2       0.70      0.79      0.74       852
CaptureSiteCategory_3       0.00      0.00      0.00        15
CaptureSiteCategory_4       0.30      0.02      0.04       589

             accuracy                           0.70      3613
            macro avg       0.45      0.44      0.41      3613
         weighted avg       0.62      0.70      0.63      3613

