In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#load train dataset
train_df = pd.read_csv('../data/train.csv')
train_df.drop(columns=['PassengerId'], inplace=True)

In [3]:
#separate features and target variable
TARGET = 'Survived'
X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]

In [4]:
#split the data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
#create a pipeline with preprocessor (without scaling) and LightGBM classifier
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
import sys
sys.path.append('../')
from feature import PreProcessor

model1 = Pipeline(steps=[
    ('preprocessor', PreProcessor(scaling=False)),
    ('classifier', LGBMClassifier(
        objective="binary",
        boosting_type="gbdt",
        n_estimators=1000,          # let early stopping decide
        learning_rate=0.03,         # slow & stable
        max_depth=4,                # CRUCIAL for small data
        num_leaves=15,              # <= 2^max_depth
        
        min_child_samples=30,       # strong regularization
        min_child_weight=1e-3,
        
        subsample=0.8,
        colsample_bytree=0.8,
        
        reg_alpha=0.5,              # L1
        reg_lambda=1.0,             # L2
        
        random_state=42,
        n_jobs=-1
    ))
])

In [6]:
#train model1 using training data
model1.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 166
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('preprocessor', ...), ('classifier', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,scaling,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,15
,max_depth,4
,learning_rate,0.03
,n_estimators,1000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [7]:
#check model1 accuracy on validation data
val_accuracy = model1.score(X_val, y_val)
print(f"Validation Accuracy of Model 1: {val_accuracy:.4f}")

Validation Accuracy of Model 1: 0.8547




In [8]:
#load test dataset
test_df = pd.read_csv('../data/test.csv')
test_passenger_ids = test_df['PassengerId']
test_df.drop(columns=['PassengerId'], inplace=True)

In [9]:
#predict on test data
test_predictions = model1.predict(test_df)



In [10]:
#save predictions to ../output/lightgbm_predictions.csv
output_df = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Survived': test_predictions
})
output_df.to_csv('../output/lightgbm_predictions.csv', index=False)