In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Training part

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
# Load the training table, then separate the label from the predictors.
train = pd.read_csv("/kaggle/input/pink-ai-breast-cancer-risk-prediction-challenge/train.csv")

y = train["target"]
X = train.drop(columns="target")

# When an ID column is present it is set aside so it is not used as a predictor.
if "ID" in X.columns:
    X_ids = X.pop("ID")


In [None]:
# Feature engineering on the training predictors.
# Every column is first coerced to numeric (unparseable entries become NaN),
# then the derived columns are appended in a fixed order. Comparisons against
# NaN evaluate to False, so each 0/1 flag is 0 wherever its input is missing.
X = X.apply(pd.to_numeric, errors="coerce").assign(
    # Offsets and threshold flag derived from feature_0
    years_since_2005=lambda d: d["feature_0"] - 2005,
    recent_screening=lambda d: d["feature_0"].ge(2015).astype(int),
    # Product of feature_2 and feature_4
    age_risk_interaction=lambda d: d["feature_2"] * d["feature_4"],
    # 1 only when all three conditions hold on the same row
    high_risk_combo=lambda d: (
        d["feature_4"].ge(2) & d["feature_7"].eq(1) & d["feature_10"].eq(1)
    ).astype(int),
    # Sum and threshold flag built from feature_5 and feature_8
    severity_stage=lambda d: d["feature_5"] + d["feature_8"],
    advanced_case=lambda d: (d["feature_5"].ge(3) | d["feature_8"].ge(2)).astype(int),
    # Threshold flag and product built from feature_9
    frequent_screener=lambda d: d["feature_9"].ge(3).astype(int),
    age_screening_match=lambda d: d["feature_2"] * d["feature_9"],
    # Threshold flag on feature_1
    high_risk_category=lambda d: d["feature_1"].ge(10).astype(int),
)

print(f"Total features after engineering: {X.shape[1]}")


In [None]:
# Missing-value handling for the training frame.

# One 0/1 "<column>_isna" indicator per existing column, appended on the right.
isna_indicators = X.isna().astype(int).add_suffix("_isna")
X = pd.concat([X, isna_indicators], axis=1)

# int64/float64 columns get their own median in place of NaN. Any other dtype
# is cast to str; the cast runs before fillna, so NaN has already become the
# string "nan" by the time fillna("makanch") is applied.
MEDIAN_FILLED_DTYPES = ("int64", "float64")
for name in X.columns:
    column = X[name]
    if column.dtype in MEDIAN_FILLED_DTYPES:
        X[name] = column.fillna(column.median())
    else:
        X[name] = column.astype(str).fillna("makanch")


In [None]:

# Columns to hand to the model as categoricals; names absent from X are skipped.
categorical_candidates = [
    'feature_1', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
    'feature_8', 'feature_10', 'recent_screening', 'high_risk_combo',
    'advanced_case', 'frequent_screener', 'high_risk_category',
]
categorical_columns = [name for name in categorical_candidates if name in X.columns]

print(f"Converting these to categorical: {categorical_columns}")

# Convert all selected columns to pandas 'category' dtype in a single call.
X = X.astype({name: 'category' for name in categorical_columns})


In [None]:
# Standardise feature_0. The scaler is fitted on the training column and kept
# in `scaler` so the same transform can be applied to other frames later.
scaler = StandardScaler().fit(X[["feature_0"]])
X["feature_0"] = scaler.transform(X[["feature_0"]])


In [None]:
# Record, for each category-typed column, the list of categories seen in X.
categorical_columns_final = X.select_dtypes(include=['category']).columns
cat_mappings = {
    name: X[name].cat.categories.tolist() for name in categorical_columns_final
}

print(f"Final categorical features: {list(categorical_columns_final)}")

# Predictors and labels must still be row-aligned after preprocessing.
assert len(X) == len(y)


In [None]:
# Hold out 20% of the rows for validation, stratified on the label, fixed seed.

from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [None]:
# Model training with LightGBM.

# Wrap the train and validation splits; both declare the same categorical columns.
dtrain, dval = (
    lgb.Dataset(features, label=labels, categorical_feature=list(categorical_columns_final))
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

# Booster hyper-parameters.
params = dict(
    objective="binary",
    boosting_type="gbdt",
    metric=["auc", "binary_logloss"],
    num_leaves=63,
    learning_rate=0.02,
    feature_fraction=0.7,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_child_samples=20,
    seed=42,
    verbose=-1,
)

# Early stopping after 100 rounds; evaluation results are logged every 200 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(period=200),
]

# Up to 5000 boosting rounds, evaluated on both the training and validation sets.
model = lgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    valid_sets=[dtrain, dval],
    callbacks=training_callbacks,
)


In [None]:
# Score the validation split and report the metrics.

y_pred_proba = model.predict(X_val)
# Hard labels: 1 where the predicted score exceeds 0.5, otherwise 0.
y_pred_class = np.where(y_pred_proba > 0.5, 1, 0)

print("\nValidation Metrics:")
# (label, metric function, predictions it is computed on); evaluated in order.
metric_report = (
    ("AUC:", roc_auc_score, y_pred_proba),
    ("Log Loss:", log_loss, y_pred_proba),
    ("Accuracy:", accuracy_score, y_pred_class),
)
for label, metric_fn, predictions in metric_report:
    print(label, metric_fn(y_val, predictions))

In [None]:
# Plot the 15 features with the largest gain-based importance.

importance_ax = lgb.plot_importance(
    model, importance_type="gain", max_num_features=15, figsize=(8, 5)
)
importance_ax.set_title("Top 15 Feature Importances")
plt.show()

print("done")

# Test data part

In [None]:
# Load the test table and apply the same feature engineering as the training set.

test = pd.read_csv("/kaggle/input/pink-ai-breast-cancer-risk-prediction-challenge/test.csv")
test_ids = test["ID"]

# Drop the identifier, coerce everything to numeric (unparseable entries become
# NaN) and append the derived columns in the same order as for training.
# Comparisons against NaN evaluate to False, so flags are 0 for missing inputs.
X_test = (
    test.drop(columns="ID")
    .apply(pd.to_numeric, errors="coerce")
    .assign(
        years_since_2005=lambda d: d["feature_0"] - 2005,
        recent_screening=lambda d: d["feature_0"].ge(2015).astype(int),
        age_risk_interaction=lambda d: d["feature_2"] * d["feature_4"],
        high_risk_combo=lambda d: (
            d["feature_4"].ge(2) & d["feature_7"].eq(1) & d["feature_10"].eq(1)
        ).astype(int),
        severity_stage=lambda d: d["feature_5"] + d["feature_8"],
        advanced_case=lambda d: (d["feature_5"].ge(3) | d["feature_8"].ge(2)).astype(int),
        frequent_screener=lambda d: d["feature_9"].ge(3).astype(int),
        age_screening_match=lambda d: d["feature_2"] * d["feature_9"],
        high_risk_category=lambda d: d["feature_1"].ge(10).astype(int),
    )
)


In [None]:
# Test-side cleaning, category encoding, scaling and column alignment.

# Missing-value indicators, computed before imputation so they record the
# original gaps (one 0/1 "<column>_isna" column per existing column).
missing_flags_test = X_test.isna().astype(int).add_suffix("_isna")
X_test = pd.concat([X_test, missing_flags_test], axis=1)

# NOTE(review): gaps are filled with medians computed on the test frame itself,
# whereas the training cell used medians of the training frame. Reusing the
# training medians would be more consistent, but they are not stored anywhere.
# The non-numeric branch is kept identical to the training cell: astype(str)
# runs first, so its fillna("makanch") never actually replaces anything.
for col in X_test.columns:
    if X_test[col].dtype in ["int64","float64"]:
        X_test[col] = X_test[col].fillna(X_test[col].median())
    else:
        X_test[col] = X_test[col].astype(str).fillna("makanch")

# Encode categorical columns with the category lists captured from training.
# The lists in cat_mappings hold the *numeric* values seen in training, so the
# test values must stay numeric here. Casting them to str first (as this cell
# previously did) makes every value look "unseen" and turns the whole column
# into NaN, because "1.0" never equals 1.0.
for c in cat_mappings:
    if c in X_test.columns:
        unseen_categories = set(X_test[c].dropna().unique()) - set(cat_mappings[c])
        if unseen_categories:
            print(f"Warning: Unseen categories in {c}: {unseen_categories}")
        # Values that are not in the training categories become NaN (missing).
        X_test[c] = pd.Categorical(X_test[c], categories=cat_mappings[c])

# Reuse the scaler fitted on the training feature_0 column.
X_test["feature_0"] = scaler.transform(X_test[["feature_0"]])

# Match the training column set and order. Columns that exist only in the
# training frame are created and filled with 0.
missing_cols = set(X.columns) - set(X_test.columns)
if missing_cols:
    print(f"Adding missing columns: {missing_cols}")

X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [None]:
# Prediction: score the aligned test frame with the booster trained above.
# The result is written to the submission as the `target` column.

test_pred_proba = model.predict(X_test)

In [2]:
# Build the submission table (ID plus predicted target) and write it to disk.

submission = test_ids.to_frame(name="ID").assign(target=test_pred_proba)

submission.to_csv("submission.csv", index=False)
print("saved")
print(submission.head())

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


Submission file created successfully!
Test predictions shape: (297860,)
