In [None]:
# Mount Google Drive into the Colab filesystem; the datasets, model file and
# submission file used by later cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Encoded internal training split stored on the mounted Drive.
file_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'

# low_memory=False asks pandas to parse the file in one pass rather than in chunks.
train = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first five rows as a quick sanity check.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Mapped_Title,FamilySize,TicketPrefix,...,TicketNumberLengthGroup,TicketNumberPrefix,Mapped_Sex,AgeGroup,Age_Pclass_Group,FareGroup,FamilySizeGroup,Age_Pclass,family_size_pclass_total,family_size_pclass_survived
0,0,1,45.5,0,0,28.5,2,0,1,35,...,3,68,0,2,6,3,0,45.5,109,0.536585
1,0,2,23.0,0,0,13.0,2,0,1,35,...,1,55,0,0,1,1,0,46.0,104,0.364706
2,0,3,32.0,0,0,7.925,2,0,1,33,...,2,84,0,1,5,0,0,96.0,324,0.21374
3,0,3,26.0,1,0,7.8542,2,0,2,35,...,3,79,0,0,2,0,2,78.0,113,0.402174
4,0,3,6.0,4,2,31.275,2,1,7,35,...,3,78,1,6,19,5,1,18.0,54,0.068182


In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Survived                     712 non-null    int64  
 1   Pclass                       712 non-null    int64  
 2   Age                          712 non-null    float64
 3   SibSp                        712 non-null    int64  
 4   Parch                        712 non-null    int64  
 5   Fare                         712 non-null    float64
 6   Embarked                     712 non-null    int64  
 7   Mapped_Title                 712 non-null    int64  
 8   FamilySize                   712 non-null    int64  
 9   TicketPrefix                 712 non-null    int64  
 10  TicketNumber                 712 non-null    int64  
 11  TicketNumberLengthGroup      712 non-null    int64  
 12  TicketNumberPrefix           712 non-null    int64  
 13  Mapped_Sex          

In [None]:
!pip install optuna

In [None]:
!pip install catboost

### Apply Optuna and train with CatBoost

In [None]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Read the encoded internal training split from Drive.
internal_train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
internal_train_data = pd.read_csv(internal_train_data_path, low_memory=False)

# Columns to be treated as categorical; they are cast to the pandas
# 'category' dtype so they can be selected by dtype further down.
categorical_columns = [
    'Pclass', 'Embarked', 'Mapped_Title',
    'TicketPrefix', 'TicketNumber', 'TicketNumberLengthGroup',
    'TicketNumberPrefix', 'Mapped_Sex', 'AgeGroup',
    'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup',
]
internal_train_data = internal_train_data.astype(
    {column: 'category' for column in categorical_columns}
)

# Split the frame into the label column and the feature matrix.
target = 'Survived'
y_train = internal_train_data[target]
X_train = internal_train_data.drop(target, axis=1)

# Names of the category-typed feature columns (handed to CatBoost as cat_features).
categorical_features = list(X_train.select_dtypes(include='category').columns)

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of CatBoost.

    Reads the module-level ``X_train``, ``y_train`` and
    ``categorical_features`` defined earlier in this cell.

    Args:
        trial: Optuna trial object used to sample one hyperparameter set.

    Returns:
        The accuracy averaged over the five stratified folds.

    Note:
        Each fold's validation split is used both for early stopping and for
        scoring, so the returned accuracy is somewhat optimistic.
    """
    # Search space. suggest_float(..., log=True) and suggest_float(...) are the
    # current equivalents of the deprecated suggest_loguniform / suggest_uniform.
    param = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_seed': 42,
        'verbose': 0
    }

    # Fixed seed so every trial is evaluated on the same folds.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = []

    # Train and evaluate in each fold
    for train_index, val_index in kf.split(X_train, y_train):
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Wrap the fold data in CatBoost Pools with the categorical columns declared
        train_pool = Pool(X_cv_train, label=y_cv_train, cat_features=categorical_features)
        val_pool = Pool(X_cv_val, label=y_cv_val, cat_features=categorical_features)

        # Train with early stopping monitored on the validation fold
        model = CatBoostClassifier(**param)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100, verbose=False)

        # Score on the same Pool that was used as eval_set, so the categorical
        # handling at prediction time is exactly the one used during training.
        y_pred = model.predict(val_pool)
        accuracy_scores.append(accuracy_score(y_cv_val, y_pred))

    # Return the average accuracy from cross-validation
    return np.mean(accuracy_scores)

# Execute Optuna optimization.
# TPE is Optuna's default sampler; it is created explicitly only to give it a
# fixed seed, so hyperparameter sampling is reproducible like the seeded CV
# split and CatBoost model inside the objective.
study = optuna.create_study(
    direction='maximize',  # Maximize Accuracy
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Display the optimal parameters and score
print(f"Best Accuracy Score: {study.best_value}")
print(f"Best Parameters: {study.best_params}")


### Retrain the model with the parameters optimized by Optuna.

In [None]:

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
import pandas as pd

# Locations of the internal train / validation splits on Drive.
internal_train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
internal_valid_data_path = '/content/drive/My Drive/titanic/data/internal_valid_data_1226_encoded.csv'

train = pd.read_csv(internal_train_data_path, low_memory=False)
valid = pd.read_csv(internal_valid_data_path, low_memory=False)

# Columns to be treated as categorical; cast them to the 'category' dtype in
# both splits.
categorical_columns = [
    'Pclass', 'Embarked', 'Mapped_Title',
    'TicketPrefix', 'TicketNumber', 'TicketNumberLengthGroup',
    'TicketNumberPrefix', 'Mapped_Sex', 'AgeGroup',
    'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup',
]
train = train.astype({column: 'category' for column in categorical_columns})
valid = valid.astype({column: 'category' for column in categorical_columns})

# Split each frame into features and label.
target = 'Survived'
X_train, y_train = train.drop(target, axis=1), train[target]
X_valid, y_valid = valid.drop(target, axis=1), valid[target]

# Names of the category-typed feature columns (handed to CatBoost as cat_features).
categorical_features = list(X_train.select_dtypes(include='category').columns)

# Fixed hyperparameters taken from the Optuna search (these are concrete
# values, not a search range), plus the seed and verbosity settings.
best_params = dict(
    iterations=846,
    depth=4,
    learning_rate=0.07221368563991136,
    l2_leaf_reg=0.0010643732330796643,
    bagging_temperature=0.6161463802357633,
    random_seed=42,
    verbose=0,
)

import os  # needed only here, to make sure the model directory exists

# Wrap the train / validation data in CatBoost Pools with the categorical columns declared
final_train_pool = Pool(X_train, label=y_train, cat_features=categorical_features)
valid_pool = Pool(X_valid, label=y_valid, cat_features=categorical_features)

# Train the model; early stopping is monitored on the validation Pool
print("Training final model with internal_train_data...")
final_model = CatBoostClassifier(**best_params)
final_model.fit(final_train_pool, eval_set=valid_pool, early_stopping_rounds=100, verbose=True)

# Evaluate the performance on the validation data.
# NOTE: the same data drove early stopping above, so this figure is optimistic.
y_valid_pred = final_model.predict(X_valid)
final_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"\nFinal Validation Accuracy on internal_valid_data: {final_accuracy:.4f}")

# Save the model. Create the target directory first so a missing folder does
# not throw away the training run at the very last step.
model_path = '/content/drive/My Drive/titanic/model/cat_0129_1.cbm'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
final_model.save_model(model_path)
print(f"\nFinal CatBoost model has been successfully saved at: {model_path}")


In [None]:
# Read the encoded test data from Drive and print its column / dtype summary.
test_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test = pd.read_csv(filepath_or_buffer=test_path)
test.info()

### Make predictions on the actual test data

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool

# Restore the trained CatBoost model from its file on Drive.
cat_model_path = '/content/drive/My Drive/titanic/model/cat_0129_1.cbm'
cat_model = CatBoostClassifier()
cat_model.load_model(cat_model_path)

# Read the encoded test data.
test_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test = pd.read_csv(test_path)

# Set the passenger IDs aside for the submission file, then drop the column
# so that only model input features remain.
passenger_ids = test['PassengerId']
test = test.drop('PassengerId', axis=1)

# Specifying categorical variables (same list as used for training)
categorical_columns = ['Pclass', 'Embarked', 'Mapped_Title',
                       'TicketPrefix', 'TicketNumber', 'TicketNumberLengthGroup',
                       'TicketNumberPrefix', 'Mapped_Sex', 'AgeGroup',
                       'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup']

test[categorical_columns] = test[categorical_columns].astype('category')

# Replace missing values in categorical columns with an explicit 'missing'
# level (CatBoost is expected to reject NaN in categorical features — verify
# against the installed version). Only columns that actually contain NaN are
# touched, so fully populated columns keep their original categories.
for col in categorical_columns:
    if test[col].isna().any():
        test[col] = test[col].cat.add_categories('missing').fillna('missing')

# Create the data format for CatBoost
test_pool = Pool(test, cat_features=categorical_columns)

import os  # needed only here, to make sure the submission directory exists

# Make predictions with CatBoost (return class labels 0/1)
final_test_predictions = cat_model.predict(test_pool)

# Pair each passenger ID with its predicted label
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': final_test_predictions
})

# Save the results as a CSV file; create the target directory first so a
# missing folder does not make the write fail.
submission_path = '/content/drive/My Drive/titanic/submission/cat_submission0129_1.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nFinal test predictions have been successfully saved to: {submission_path}")


In [None]:
# Share of each predicted class in the submission, expressed as a percentage.
survival_ratio = submission['Survived'].value_counts(normalize=True).mul(100)

# Show the percentages as the cell output.
survival_ratio

Unnamed: 0_level_0,proportion
Survived,Unnamed: 1_level_1
0,62.200957
1,37.799043
