In [1]:
# Mount Google Drive at '/content/drive'; the model, data and submission paths
# used by the later cells all live under '/content/drive/My Drive/titanic/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os

import numpy as np
import pandas as pd
import joblib
import lightgbm as lgb

# Ensemble inference, variant 1: Gradient Boosting + LightGBM + Random Forest.
# Loads the three trained models, scores the encoded test set with each one,
# averages the positive-class probabilities, thresholds at 0.5 and writes a
# two-column (PassengerId, Survived) submission CSV.

# ✅ Load trained models
# SECURITY: joblib.load unpickles arbitrary Python objects. Only load model
# files that you produced yourself or otherwise fully trust.
gb_model_path = '/content/drive/My Drive/titanic/model/gradient_0129_2.pkl'
lgb_model_path = '/content/drive/My Drive/titanic/model/best_lightgbm_model_0129_1.txt'
rf_model_path = '/content/drive/My Drive/titanic/model/rf_staking_0129_1.pkl'

gb_model = joblib.load(gb_model_path)  # Gradient Boosting
lgb_model = lgb.Booster(model_file=lgb_model_path)  # LightGBM
rf_model = joblib.load(rf_model_path)  # Random Forest

# ✅ Load test dataset
test_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test = pd.read_csv(test_path)

# ✅ Keep PassengerId for CSV output (it is an identifier, not a feature)
if 'PassengerId' not in test.columns:
    raise ValueError("Column 'PassengerId' is missing in the test dataset.")
passenger_ids = test['PassengerId']
test = test.drop(columns=['PassengerId'])

# ✅ Define categorical columns (used in LightGBM)
categorical_columns = ['Pclass', 'Embarked', 'Mapped_Title',
                       'TicketPrefix', 'TicketNumber', 'TicketNumberLengthGroup',
                       'TicketNumberPrefix', 'Mapped_Sex', 'AgeGroup',
                       'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup']

X_test = test.copy()  # Work on a copy so `test` keeps the dtypes read from the CSV

# ✅ Ensure categorical features are correctly formatted for LightGBM
# Columns listed above but absent from the CSV are skipped silently.
# NOTE(review): the category levels here are derived from the test data alone;
# whether they line up with the levels seen during training depends on how the
# LightGBM model file was produced — confirm against the training notebook.
for col in categorical_columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

# ✅ Make predictions using each model (positive-class score per passenger)
gb_pred = gb_model.predict_proba(X_test)[:, 1]  # Gradient Boosting
# NOTE(review): `categorical_feature` is forwarded through predict()'s **kwargs;
# confirm that it has any effect at inference time.
lgb_pred = lgb_model.predict(X_test, categorical_feature=categorical_columns)  # LightGBM
rf_pred = rf_model.predict_proba(X_test)[:, 1]  # Random Forest

# ✅ Sanity check: every model must return exactly one score per passenger.
# Without this, a 2-D output would broadcast in the average below and only
# surface later as a confusing error (or not at all).
n_rows = len(passenger_ids)
for model_name, pred in (('Gradient Boosting', gb_pred),
                         ('LightGBM', lgb_pred),
                         ('Random Forest', rf_pred)):
    if np.shape(pred) != (n_rows,):
        raise ValueError(
            f"{model_name} returned predictions of shape {np.shape(pred)}; "
            f"expected ({n_rows},)."
        )

# ✅ Ensemble prediction (unweighted average of all three models)
ensemble_pred = (lgb_pred + rf_pred + gb_pred) / 3

# ✅ Convert probabilities to binary classification (0 or 1)
# Strictly greater than 0.5 counts as survived; exactly 0.5 maps to 0.
test_predictions = (ensemble_pred > 0.5).astype(int)

# ✅ Save the results as a CSV file
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_predictions
})

submission_path = '/content/drive/My Drive/titanic/submission/ensemble_submission0130_1.csv'
# Create the target folder if needed so the write cannot fail after inference has run.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nTest predictions have been successfully saved to: {submission_path}")



Test predictions have been successfully saved to: /content/drive/My Drive/titanic/submission/ensemble_submission0130_1.csv


In [5]:
# Percentage of passengers predicted as died (0) vs. survived (1) in the
# submission built by the previous cell.
survived_share = submission['Survived'].value_counts(normalize=True)
survival_ratio = survived_share.mul(100)
survival_ratio

Unnamed: 0_level_0,proportion
Survived,Unnamed: 1_level_1
0,65.07177
1,34.92823


In [6]:
import os

import numpy as np
import pandas as pd
import joblib
import lightgbm as lgb
from scipy.special import expit  # Import sigmoid function for SVC probability conversion

# Ensemble inference, variant 2: Gradient Boosting + LightGBM + SVC.
# Loads the three trained models and the SVC's scaler, scores the encoded test
# set with each model, averages the scores, thresholds at 0.5 and writes a
# two-column (PassengerId, Survived) submission CSV.

# ✅ Load trained models
# SECURITY: joblib.load unpickles arbitrary Python objects. Only load model
# files that you produced yourself or otherwise fully trust.
# NOTE(review): the captured output of this cell links to scikit-learn's
# model-persistence limitations page, which suggests a warning was raised while
# unpickling (possibly a library version mismatch) — confirm and re-export the
# affected artefacts if so.
gb_model_path = '/content/drive/My Drive/titanic/model/gradient_0129_2.pkl'
lgb_model_path = '/content/drive/My Drive/titanic/model/best_lightgbm_model_0129_1.txt'
svc_model_path = '/content/drive/My Drive/titanic/model/svc_0130_1.pkl'

gb_model = joblib.load(gb_model_path)  # Gradient Boosting
lgb_model = lgb.Booster(model_file=lgb_model_path)  # LightGBM
svc_model = joblib.load(svc_model_path)  # SVC

# ✅ Load test dataset
test_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test = pd.read_csv(test_path)

# ✅ Keep PassengerId for CSV output (it is an identifier, not a feature)
if 'PassengerId' not in test.columns:
    raise ValueError("Column 'PassengerId' is missing in the test dataset.")
passenger_ids = test['PassengerId']
test = test.drop(columns=['PassengerId'])

# ✅ Define categorical columns (used in LightGBM)
categorical_columns = ['Pclass', 'Embarked', 'Mapped_Title',
                       'TicketPrefix', 'TicketNumber', 'TicketNumberLengthGroup',
                       'TicketNumberPrefix', 'Mapped_Sex', 'AgeGroup',
                       'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup']

X_test = test.copy()  # Work on a copy so `test` keeps the dtypes read from the CSV

# ✅ Ensure categorical features are correctly formatted for LightGBM
# Columns listed above but absent from the CSV are skipped silently.
# NOTE(review): the category levels here are derived from the test data alone;
# whether they line up with the levels seen during training depends on how the
# LightGBM model file was produced — confirm against the training notebook.
for col in categorical_columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

# ✅ Load and apply scaler for SVC (other models don't need scaling)
scaler_path = '/content/drive/My Drive/titanic/model/scaler_0130.pkl'
scaler = joblib.load(scaler_path)  # same pickle caveat as the models above
X_test_scaled = scaler.transform(X_test)  # ✅ Only used for SVC

# ✅ Make predictions using each model (positive-class score per passenger)
gb_pred = gb_model.predict_proba(X_test)[:, 1]  # Gradient Boosting
# NOTE(review): `categorical_feature` is forwarded through predict()'s **kwargs;
# confirm that it has any effect at inference time.
lgb_pred = lgb_model.predict(X_test, categorical_feature=categorical_columns)  # LightGBM

# ✅ SVC: use decision_function() + sigmoid (instead of predict_proba)
# The sigmoid only squashes the raw margin into (0, 1): a margin of 0 maps to
# 0.5, so the SVC's own decision boundary is preserved. The result is an
# uncalibrated pseudo-probability, not a fitted probability estimate, so its
# scale is not directly comparable to the other two models' outputs.
svc_raw_scores = svc_model.decision_function(X_test_scaled)  # Signed distance to the boundary
svc_pred = expit(svc_raw_scores)  # Map margin to (0, 1)

# ✅ Sanity check: every model must return exactly one score per passenger.
# Without this, a 2-D output would broadcast in the average below and only
# surface later as a confusing error (or not at all).
n_rows = len(passenger_ids)
for model_name, pred in (('Gradient Boosting', gb_pred),
                         ('LightGBM', lgb_pred),
                         ('SVC', svc_pred)):
    if np.shape(pred) != (n_rows,):
        raise ValueError(
            f"{model_name} returned predictions of shape {np.shape(pred)}; "
            f"expected ({n_rows},)."
        )

# ✅ Ensemble prediction (unweighted average of all three models)
ensemble_pred = (gb_pred + lgb_pred + svc_pred) / 3

# ✅ Convert probabilities to binary classification (0 or 1)
# Strictly greater than 0.5 counts as survived; exactly 0.5 maps to 0.
test_predictions = (ensemble_pred > 0.5).astype(int)

# ✅ Save the results as a CSV file
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_predictions
})

submission_path = '/content/drive/My Drive/titanic/submission/ensemble_submission0130_2.csv'
# Create the target folder if needed so the write cannot fail after inference has run.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nTest predictions have been successfully saved to: {submission_path}")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



Test predictions have been successfully saved to: /content/drive/My Drive/titanic/submission/ensemble_submission0130_2.csv


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
# Percentage of passengers predicted as died (0) vs. survived (1) in the
# submission built by the previous cell.
survived_share = submission['Survived'].value_counts(normalize=True)
survival_ratio = survived_share.mul(100)
survival_ratio

Unnamed: 0_level_0,proportion
Survived,Unnamed: 1_level_1
0,63.875598
1,36.124402
