In [1]:
import pandas as pd
import numpy as np

## Load Data 

In [2]:
# Read the competition metadata tables.
# NOTE(review): the recorded output of this cell shows a warning pointing at
# the train read (presumably pandas' mixed-type DtypeWarning from chunked
# dtype inference). Parsing the file in one pass with low_memory=False lets
# pandas infer a single dtype per column and avoids that warning.
df_train = pd.read_csv(
    "/kaggle/input/isic-2024-challenge/train-metadata.csv",
    low_memory=False,
)
df_test = pd.read_csv("/kaggle/input/isic-2024-challenge/test-metadata.csv")

  df_train = pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv")


### Feature Engineering

In [3]:
# Define feature engineering function
def feature_engineering(df):
    """Add hand-crafted lesion descriptors to ``df`` and report their names.

    The frame is modified in place: 26 numeric columns and one string column
    (``combined_anatomical_site``) are appended, computed only from columns
    that ``df`` already carries. Callers that need the untouched input should
    pass a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Lesion metadata holding the ``tbp_lv_*`` measurements,
        ``clin_size_long_diam_mm``, ``age_approx``, ``anatom_site_general``
        and ``tbp_lv_location``.

    Returns
    -------
    tuple
        ``(df, new_num_cols, new_cat_cols)`` - the same DataFrame object that
        was passed in, the names of the added numeric columns, and the names
        of the added categorical columns.

    Notes
    -----
    Ratios are plain divisions, so a zero denominator yields ``inf``/``NaN``
    rather than raising.
    """
    # Raw measurements, aliased once so the formulas below stay readable.
    long_diam = df["clin_size_long_diam_mm"]
    area = df["tbp_lv_areaMM2"]
    perimeter = df["tbp_lv_perimeterMM"]
    hue, hue_ext = df["tbp_lv_H"], df["tbp_lv_Hext"]
    lum_ext = df["tbp_lv_Lext"]
    delta_a, delta_b, delta_l = df["tbp_lv_deltaA"], df["tbp_lv_deltaB"], df["tbp_lv_deltaL"]
    delta_lb_norm = df["tbp_lv_deltaLBnorm"]
    norm_border = df["tbp_lv_norm_border"]
    norm_color = df["tbp_lv_norm_color"]
    symmetry = df["tbp_lv_symm_2axis"]
    color_std = df["tbp_lv_color_std_mean"]
    eccentricity = df["tbp_lv_eccentricity"]
    area_perim_ratio = df["tbp_lv_area_perim_ratio"]
    pos_x, pos_y, pos_z = df["tbp_lv_x"], df["tbp_lv_y"], df["tbp_lv_z"]
    age = df["age_approx"]

    # Intermediate quantities shared by several features.
    delta_sq_sum = delta_a ** 2 + delta_b ** 2 + delta_l ** 2
    delta_sum = delta_a + delta_b + delta_l
    border_complexity = norm_border + symmetry
    lesion_shape_index = area / (perimeter ** 2)

    # Insertion order of this mapping is the order in which the columns are
    # appended to the frame.
    derived = {
        "lesion_size_ratio": df["tbp_lv_minorAxisMM"] / long_diam,
        "lesion_shape_index": lesion_shape_index,
        "hue_contrast": (hue - hue_ext).abs(),
        "luminance_contrast": (df["tbp_lv_L"] - lum_ext).abs(),
        "lesion_color_difference": np.sqrt(delta_sq_sum),
        "border_complexity": border_complexity,
        "color_uniformity": color_std / df["tbp_lv_radial_color_std_max"],
        "3d_position_distance": np.sqrt(pos_x ** 2 + pos_y ** 2 + pos_z ** 2),
        "perimeter_to_area_ratio": perimeter / area,
        "lesion_visibility_score": delta_lb_norm + norm_color,
        "combined_anatomical_site": df["anatom_site_general"] + "_" + df["tbp_lv_location"],
        "symmetry_border_consistency": symmetry * norm_border,
        "color_consistency": df["tbp_lv_stdL"] / lum_ext,
        "size_age_interaction": long_diam * age,
        "hue_color_std_interaction": hue * color_std,
        "lesion_severity_index": (norm_border + norm_color + eccentricity) / 3,
        "shape_complexity_index": border_complexity + lesion_shape_index,
        "color_contrast_index": delta_sum + delta_lb_norm,
        "log_lesion_area": np.log(area + 1),
        "normalized_lesion_size": long_diam / age,
        # Despite the name, this is the mean of the inner and outer hue.
        "mean_hue_difference": (hue + hue_ext) / 2,
        "std_dev_contrast": np.sqrt(delta_sq_sum / 3),
        "color_shape_composite_index": (color_std + area_perim_ratio + symmetry) / 3,
        "3d_lesion_orientation": np.arctan2(pos_y, pos_x),
        "overall_color_difference": delta_sum / 3,
        "symmetry_perimeter_interaction": symmetry * perimeter,
        "comprehensive_lesion_index": (area_perim_ratio + eccentricity + norm_color + symmetry) / 4,
    }

    for column_name, column_values in derived.items():
        df[column_name] = column_values

    new_cat_cols = ["combined_anatomical_site"]
    new_num_cols = [name for name in derived if name not in new_cat_cols]
    return df, new_num_cols, new_cat_cols



In [4]:
from sklearn.preprocessing import OrdinalEncoder

# Run the feature engineering on copies so the frames loaded from disk are
# not mutated by the in-place column additions.
df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
df_test, _, _ = feature_engineering(df_test.copy())

# Numeric model inputs: raw metadata columns first, engineered ones last.
num_cols = [
    'age_approx',
    'clin_size_long_diam_mm',
    'tbp_lv_A',
    'tbp_lv_Aext',
    'tbp_lv_B',
    'tbp_lv_Bext',
    'tbp_lv_C',
    'tbp_lv_Cext',
    'tbp_lv_H',
    'tbp_lv_Hext',
    'tbp_lv_L',
    'tbp_lv_Lext',
    'tbp_lv_areaMM2',
    'tbp_lv_area_perim_ratio',
    'tbp_lv_color_std_mean',
    'tbp_lv_deltaA',
    'tbp_lv_deltaB',
    'tbp_lv_deltaL',
    'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm',
    'tbp_lv_eccentricity',
    'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence',
    'tbp_lv_norm_border',
    'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',
    'tbp_lv_stdL',
    'tbp_lv_stdLExt',
    'tbp_lv_symm_2axis',
    'tbp_lv_symm_2axis_angle',
    'tbp_lv_x',
    'tbp_lv_y',
    'tbp_lv_z',
    *new_num_cols,
]

# Categorical model inputs, again with the engineered column appended.
cat_cols = [
    "sex",
    "tbp_tile_type",
    "tbp_lv_location",
    "tbp_lv_location_simple",
    *new_cat_cols,
]

# Full feature list in the order the model is trained on.
train_cols = [*num_cols, *cat_cols]

# Map category labels to integer codes. The encoder is fitted on the training
# frame only; at transform time a label it has not seen becomes -2 and a
# missing value becomes -1.
category_encoder = OrdinalEncoder(
    categories='auto',
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-2,
    encoded_missing_value=-1,
)

# Fit on train and overwrite each categorical column with its codes.
X_cat = category_encoder.fit_transform(df_train[cat_cols])
for cat_col, codes in zip(cat_cols, X_cat.T):
    df_train[cat_col] = codes

# Apply the already-fitted mapping to the test frame.
X_cat_test = category_encoder.transform(df_test[cat_cols])
for cat_col, codes in zip(cat_cols, X_cat_test.T):
    df_test[cat_col] = codes


In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

import lightgbm as lgb

#### Custom Training Metric


In [6]:
from sklearn.metrics import make_scorer, roc_auc_score

# Custom scoring function
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str = '', min_tpr: float = 0.80):
    """Partial area under the ROC curve restricted to TPR >= ``min_tpr``.

    Labels and scores are both flipped (``|y - 1|`` and ``1 - p``), which
    turns the high-TPR region of the original ROC curve into the low-FPR
    region ``[0, 1 - min_tpr]`` of the flipped problem. That region is what
    ``roc_auc_score(..., max_fpr=...)`` integrates.

    Parameters
    ----------
    solution : pandas.DataFrame
        Ground-truth binary labels (0/1).
    submission : pandas.DataFrame
        Predicted scores, row-aligned with ``solution``.
    row_id_column_name : str, optional
        Not used by this function; presumably kept so the signature matches
        the competition's metric template.
    min_tpr : float, optional
        Lower bound of the true-positive-rate range to score (default 0.80).

    Returns
    -------
    float
        The un-normalised partial AUC, which lies in ``[0, 1 - min_tpr]``.
    """
    v_gt = np.abs(np.asarray(solution.values) - 1)
    # Vectorised flip of the scores; equivalent to subtracting row by row
    # but without a Python-level loop over every prediction.
    v_pred = 1.0 - np.asarray(submission.values)
    max_fpr = abs(1 - min_tpr)
    # With max_fpr set, roc_auc_score returns the standardised partial AUC
    # (rescaled so that 0.5 is chance and 1.0 is perfect).
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # Undo that standardisation to recover the raw area under the curve
    # segment: min_area + (max_area - min_area) * (scaled - 0.5) / 0.5,
    # where min_area = 0.5 * max_fpr**2 and max_area = max_fpr.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

# Wrapper function to be used with make_scorer
def custom_scorer(y_true, y_pred):
    """Adapt ``comp_score`` to the ``(y_true, y_pred)`` call used by ``make_scorer``.

    Both inputs are wrapped in DataFrames because ``comp_score`` reads its
    arguments through ``.values``.
    """
    solution_frame = pd.DataFrame(y_true)
    submission_frame = pd.DataFrame(y_pred)
    return comp_score(solution_frame, submission_frame)

# Scorer object for the hyper-parameter search; a larger partial AUC is better.
scorer = make_scorer(
    score_func=custom_scorer,
    greater_is_better=True,
)


### Main Training Loop

In [7]:
from sklearn.model_selection import GroupKFold, GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

# Wider search space, kept for reference (assign it to param_dist to run a
# real search):
# param_dist = {
#     'n_estimators': [500, 1000],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'num_leaves': [31, 50, 100],
#     'feature_fraction': [0.8, 0.9, 1.0],
#     'bagging_fraction': [0.8, 0.9, 1.0],
#     'lambda_l1': [0.0, 0.8, 1.0],
#     'lambda_l2': [0.0, 0.8, 1.0]
# }

# Active search space: every hyper-parameter is pinned to one value, so the
# search below evaluates a single candidate.
param_dist = dict(
    n_estimators=[500],
    learning_rate=[0.01],
    num_leaves=[100],
    feature_fraction=[1.0],
    bagging_fraction=[1.0],
    lambda_l1=[0.8],
    lambda_l2=[0.8],
)


# Base estimator for the search. It is a regressor fitted with the binary
# objective, so predict() yields the continuous scores that the ranking
# metric needs (the saved submission shows values between 0 and 1).
model = lgb.LGBMRegressor(objective='binary', random_state=42)

# Patient-grouped cross-validation: all lesions of one patient fall into the
# same fold, so no patient appears on both sides of a split.
gkf = GroupKFold(n_splits=5)

# Record the validation fold of every row. split() yields positional indices,
# so they are written into a positional array; label-based .loc would only be
# correct while df_train still has a default RangeIndex.
fold_ids = np.full(len(df_train), -1, dtype=int)
for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
    fold_ids[val_idx] = idx
df_train["fold"] = fold_ids

# Randomized search with the grouped CV. When every entry of param_dist is a
# list, scikit-learn caps the number of candidates at the grid size, so
# n_iter=50 is only an upper bound. random_state makes the sampled candidates
# reproducible once the space is wider than a single point.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    cv=gkf,
    scoring=scorer,
    refit=True,
    random_state=42,
    verbose=1,
    n_jobs=-1,
)
random_search.fit(df_train[train_cols], df_train["target"], groups=df_train["patient_id"])

# Print the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

# With refit=True the search has already retrained the best candidate (same
# objective and random_state as `model`) on the full training data, so reuse
# that estimator instead of fitting an identical model a second time.
best_params = random_search.best_params_
final_model = random_search.best_estimator_





Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Number of positive: 393, number of negative: 400666
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.330421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14898
[LightGBM] [Info] Number of data points in the train set: 401059, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000980 -> initscore=-6.927074
[LightGBM] [Info] Start training from score -6.927074
Best parameters found:  {'num_leaves': 100, 'n_estimators': 500, 'learning_rate': 0.01, 'lambda_l2': 0.8, 'lambda_l1': 0.8, 'feature_fraction': 1.0, 'bagging_fraction': 1.0}
Best cross-validation score:  0.15500282932198106
[LightGBM] [Info] Number of positive: 393, number of negative: 400666
[LightGBM] [Info] Number of positive: 313, number of negative: 320535
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing 

In [8]:
# Optional check (left disabled): score the final model on df_test when labels are available.
# # df_test needs a 'target' column; add it first if it is missing.
# # This assumes df_test_target holds the true target values for the test set.
# df_test['target'] = df_test_target  # Replace df_test_target with actual target values for df_test

# # Validate the final model
# final_preds = final_model.predict(df_test[train_cols])
# final_score = comp_score(df_test[["target"]], pd.DataFrame(final_preds, columns=["prediction"]))
# print(f"Final Partial AUC Score: {final_score:.5f}")



In [9]:
# Persist the fitted estimator to the working directory so that inference
# can be run from the file without retraining.
import joblib

joblib.dump(value=final_model, filename='final_model.pkl')
print("Model saved as final_model.pkl")

Model saved as final_model.pkl


In [10]:
# Locations of the persisted model and of the competition's submission template.
model_path = 'final_model.pkl'
submission_path = '/kaggle/input/isic-2024-challenge/sample_submission.csv'

# Restore the estimator written by the previous cell. joblib files are
# pickles, so only load files this notebook produced itself.
final_model = joblib.load(model_path)
print("Model loaded from final_model.pkl")

# The template supplies the ids (and their order) expected in the submission.
submission = pd.read_csv(submission_path)



Model loaded from final_model.pkl


In [11]:
# Ids to score, in the order the submission file lists them.
isic_ids = submission['isic_id']

# Keep only the rows whose id appears in the submission, then reorder them to
# follow the submission's id order; 'isic_id' comes back as a regular column.
df_test = (
    df_test[df_test['isic_id'].isin(isic_ids)]
    .set_index('isic_id')
    .loc[isic_ids]
    .reset_index()
)

# Select exactly the columns the model was trained on, in training order.
df_test_features = df_test[train_cols]

# Score every test lesion.
predictions = final_model.predict(df_test_features)

# Write the scores into the template and export it without the row index.
submission['target'] = predictions
submission.to_csv(path_or_buf='submission.csv', index=False)

print("Predictions saved to submission.csv")

Predictions saved to submission.csv


In [12]:
# Display the submission frame for a visual check of the ids and scores.
submission

Unnamed: 0,isic_id,target
0,ISIC_0015657,6.4e-05
1,ISIC_0015729,2.6e-05
2,ISIC_0015740,8.7e-05
