In [None]:
from google.colab import drive

# Mount Google Drive so the /content/drive/... paths used by the cells below
# (data, model and submission files) can be read and written.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the encoded internal training split from Google Drive
file_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
train = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Print column names, dtypes and non-null counts as a quick sanity check
train.info()


In [None]:
! pip install optuna

### Apply Optuna and train with GradientBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import optuna

# Read the encoded internal training split that the search is tuned on
internal_train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
train = pd.read_csv(filepath_or_buffer=internal_train_data_path, low_memory=False)

# Separate the label column from the feature columns
target = 'Survived'
y_train = train[target]
X_train = train.drop(target, axis=1)


# Names of the columns treated as categorical variables.
# They are only declared in this cell; the objective below uses X_train
# exactly as it was loaded.
categorical_columns = [
    'Pclass',
    'Embarked',
    'Mapped_Title',
    'TicketPrefix',
    'TicketNumber',
    'TicketNumberLengthGroup',
    'TicketNumberPrefix',
    'Mapped_Sex',
    'AgeGroup',
    'Age_Pclass_Group',
    'FareGroup',
    'FamilySizeGroup',
]

# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial by 5-fold cross-validated accuracy.

    Samples a set of GradientBoostingClassifier hyperparameters from ``trial``,
    fits one model per stratified fold of the notebook-level ``X_train`` /
    ``y_train``, and returns the mean validation accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameter values.

    Returns
    -------
    float
        Mean accuracy over the five validation folds.
    """
    # Search space. `suggest_float` replaces the deprecated
    # `suggest_loguniform` / `suggest_uniform` helpers; parameter names and
    # ranges are unchanged, so `study.best_params` keeps the same keys.
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20)
    }

    # Fixed random_state: every trial is evaluated on the same five folds,
    # so scores are comparable across trials.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=31)
    accuracy_scores = []

    for train_index, val_index in kf.split(X_train, y_train):
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train a fresh model on this fold's training part
        model = GradientBoostingClassifier(**param, random_state=42)
        model.fit(X_cv_train, y_cv_train)

        # Score it on the fold's held-out part
        y_pred = model.predict(X_cv_val)
        accuracy = accuracy_score(y_cv_val, y_pred)
        accuracy_scores.append(accuracy)

    # Return the average accuracy from cross-validation
    return np.mean(accuracy_scores)

# Execute Optuna optimization.
# The sampler is seeded so that repeated runs propose the same sequence of
# trials and the reported best parameters can be reproduced. TPESampler is the
# sampler create_study uses by default for a single objective, so only the
# randomness is pinned here, not the search algorithm.
sampler = optuna.samplers.TPESampler(seed=31)
study = optuna.create_study(direction='maximize', sampler=sampler)  # Maximize accuracy
study.optimize(objective, n_trials=50)

# Display the optimal parameters and score
print(f"Best Accuracy Score: {study.best_value}")
print(f"Best Parameters: {study.best_params}")


### Train the final Gradient Boosting model with the best parameters and evaluate it on the validation data

In [None]:
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Locations of the encoded internal train / validation splits
internal_train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
internal_valid_data_path = '/content/drive/My Drive/titanic/data/internal_valid_data_1226_encoded.csv'

# Read both splits
train = pd.read_csv(filepath_or_buffer=internal_train_data_path, low_memory=False)
valid = pd.read_csv(filepath_or_buffer=internal_valid_data_path, low_memory=False)

# Names of the columns treated as categorical variables
categorical_columns = [
    'Pclass',
    'Embarked',
    'Mapped_Title',
    'TicketPrefix',
    'TicketNumber',
    'TicketNumberLengthGroup',
    'TicketNumberPrefix',
    'Mapped_Sex',
    'AgeGroup',
    'Age_Pclass_Group',
    'FareGroup',
    'FamilySizeGroup',
]

# Store those columns with pandas' 'category' dtype in each split.
# NOTE(review): the hyperparameter-search cell and the test-prediction cell
# feed the same kind of data to the model without this cast - confirm whether
# it is needed.
for split in (train, valid):
    split[categorical_columns] = split[categorical_columns].astype('category')

# Separate the label column from the feature columns of each split
target = 'Survived'
X_train, y_train = train.drop(target, axis=1), train[target]
X_valid, y_valid = valid.drop(target, axis=1), valid[target]

# Hyperparameters of the final model: fixed values, not a search range.
# Presumably the best parameters reported by the Optuna study above - confirm.
best_params = dict(
    n_estimators=513,
    max_depth=3,
    learning_rate=0.02552173769133039,
    subsample=0.8806867411520856,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=31,
)

# Fit the classifier on the internal training split
print("Training the model with the optimal parameters...")
model = GradientBoostingClassifier(**best_params).fit(X_train, y_train)

# Persist the fitted model to Google Drive
model_path = '/content/drive/My Drive/titanic/model/gradient_0129_2.pkl'
joblib.dump(model, model_path)
print(f"Model has been saved to: {model_path}")

# Score the model on the internal validation split
y_valid_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy on internal_valid_data: {accuracy:.4f}")


Training the model with the optimal parameters...
Model has been saved to: /content/drive/My Drive/titanic/model/gradient_0129_2.pkl
Validation Accuracy on internal_valid_data: 0.8380


In [None]:
# Sanity check: the training and validation frames must not be the same data
splits_identical = train.equals(valid)
print(splits_identical)



False


### Make predictions on the actual test data

In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingClassifier

# Restore the fitted Gradient Boosting model from the path the training cell saved it to
gb_model_path = '/content/drive/My Drive/titanic/model/gradient_0129_2.pkl'
gb_model = joblib.load(gb_model_path)

# Read the encoded test set
test_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test = pd.read_csv(filepath_or_buffer=test_path)

# Names of the columns treated as categorical variables
# (only declared here; the frame is passed to the model as loaded)
categorical_columns = [
    'Pclass',
    'Embarked',
    'Mapped_Title',
    'TicketPrefix',
    'TicketNumber',
    'TicketNumberLengthGroup',
    'TicketNumberPrefix',
    'Mapped_Sex',
    'AgeGroup',
    'Age_Pclass_Group',
    'FareGroup',
    'FamilySizeGroup',
]

# PassengerId is required for the submission file, so fail early without it
has_passenger_id = 'PassengerId' in test.columns
if not has_passenger_id:
    raise ValueError("Column 'PassengerId' is missing in the test dataset.")

# Set the ids aside, then drop the column before the frame goes to the model
passenger_ids = test['PassengerId']
test = test.drop('PassengerId', axis=1)

# Predicted probability from column index 1 of predict_proba for every row
gb_pred = gb_model.predict_proba(test)[:, 1]

# Label a row 1 when its probability is strictly above 0.5, otherwise 0
survived_mask = gb_pred > 0.5
final_test_predictions = survived_mask.astype(int)

# Assemble the submission table and write it to Google Drive
submission_columns = {
    'PassengerId': passenger_ids,
    'Survived': final_test_predictions
}
submission = pd.DataFrame(submission_columns)

submission_path = '/content/drive/My Drive/titanic/submission/gb_submission0129_2.csv'
submission.to_csv(submission_path, index=False)

print(f"\nFinal test predictions have been successfully saved to: {submission_path}")


In [None]:
# Calculate the proportion of survivors and deceased
survival_ratio = submission['Survived'].value_counts(normalize=True) * 100

# Display the results
survival_ratio

Unnamed: 0_level_0,proportion
Survived,Unnamed: 1_level_1
0,62.679426
1,37.320574
