In [1]:
# Mount Google Drive at /content/drive; the cells below read data from and
# write models/submissions to paths under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install optuna

### Apply Optuna and train with SVC

In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import joblib

# ✅ Read the encoded internal training split from Drive
file_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
train = pd.read_csv(file_path, low_memory=False)

# ✅ Labels as a NumPy array; every other column is a feature
target = 'Survived'
y = train[target].values
feature_frame = train.drop(columns=[target])

# ✅ Standardize the features (SVMs are sensitive to feature scale).
# The fitted scaler is kept in `scaler` so it can be saved and reused later.
scaler = StandardScaler()
X = scaler.fit(feature_frame).transform(feature_frame)


# ✅ Define Optuna objective function
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of an SVC.

    Search space:
      * ``C``      - log-uniform in [1e-3, 1e3]
      * ``kernel`` - one of 'linear', 'poly', 'rbf', 'sigmoid'
      * ``gamma``  - 'scale' or 'auto', sampled only for 'rbf' / 'sigmoid'

    Reads the module-level ``X`` (already standardized feature matrix) and
    ``y`` (label array). The scaler was fitted on all rows of ``X`` before
    the folds are made, so every fold shares the same scaling statistics.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean validation accuracy over the five folds.
    """
    param = {
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    }

    # gamma is only sampled for 'rbf' and 'sigmoid'; 'poly' keeps SVC's default
    # and the linear kernel does not use it.
    if param['kernel'] in ['rbf', 'sigmoid']:
        param['gamma'] = trial.suggest_categorical('gamma', ['scale', 'auto'])

    # No np.random.seed() call: SVC with probability=False does not draw random
    # numbers, and the fold assignment is already fixed by random_state below.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def train_and_evaluate(train_index, val_index):
        """Fit on one training fold and return accuracy on its validation fold."""
        # Every kernel, including 'linear', is evaluated with SVC so that the
        # tuned C belongs to the same estimator the retraining cell builds via
        # SVC(**params). LinearSVC optimizes a different objective (squared
        # hinge loss, penalized intercept), so its best C would not transfer.
        model = SVC(**param)
        model.fit(X[train_index], y[train_index])
        y_pred = model.predict(X[val_index])
        return accuracy_score(y[val_index], y_pred)

    # Evaluate the folds in parallel worker processes.
    accuracy_scores = Parallel(n_jobs=-1)(
        delayed(train_and_evaluate)(train_index, val_index)
        for train_index, val_index in kf.split(X, y)
    )

    return np.mean(accuracy_scores)


# ✅ Run Optuna optimization
# The sampler is seeded so that it is not a source of run-to-run variation:
# the default (unseeded) sampler proposes different hyperparameters each run.
study = optuna.create_study(
    direction='maximize',  # Maximize accuracy
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)  # ✅ 20 trials keeps the search short

# ✅ Display best parameters and score
print("Best Accuracy Score:", study.best_value)
print("Best Parameters:", study.best_params)

# ✅ Save the fitted scaler so later cells can apply the same transformation
scaler_path = '/content/drive/My Drive/titanic/model/scaler_0130.pkl'
joblib.dump(scaler, scaler_path)


### Retrain the model with the parameters optimized by Optuna.

In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib

# ✅ Encoded internal train / validation splits on Drive
internal_train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
internal_valid_data_path = '/content/drive/My Drive/titanic/data/internal_valid_data_1226_encoded.csv'

train = pd.read_csv(internal_train_data_path, low_memory=False)
valid = pd.read_csv(internal_valid_data_path, low_memory=False)

# ✅ Reuse the scaler saved by the tuning cell (a new one is NOT fitted here)
scaler_path = '/content/drive/My Drive/titanic/model/scaler_0130.pkl'
scaler = joblib.load(scaler_path)

# ✅ Training labels (NumPy array) and standardized training features
target = 'Survived'
y = train[target].values
X = scaler.transform(train.drop(columns=[target]))

# ✅ Hyperparameters chosen by the Optuna study
params = {'C': 2.3505881236914092, 'kernel': 'linear'}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _fold_accuracy(fit_idx, holdout_idx):
    """Train a fresh SVC on one fold's training rows and score its held-out rows."""
    fold_model = SVC(**params)
    fold_model.fit(X[fit_idx], y[fit_idx])
    return accuracy_score(y[holdout_idx], fold_model.predict(X[holdout_idx]))


# ✅ Cross-validate: one accuracy per fold, then report the mean
accuracy_scores = [
    _fold_accuracy(fit_idx, holdout_idx)
    for fit_idx, holdout_idx in kf.split(X, y)
]
mean_accuracy = np.mean(accuracy_scores)
print(f"Mean Cross-Validation Accuracy: {mean_accuracy:.4f}")

# ✅ Final model: fit on the whole training split (fit() returns the estimator)
svc_model = SVC(**params).fit(X, y)

# ✅ Score on the held-out validation split, scaled with the same scaler
y_valid = valid[target].values
X_valid = scaler.transform(valid.drop(columns=[target]))

y_valid_pred = svc_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy on internal_valid_data: {valid_accuracy:.4f}")

# ✅ Persist the fitted model for the prediction cell
model_path = '/content/drive/My Drive/titanic/model/svc_0130_1.pkl'
joblib.dump(svc_model, model_path)

print(f"Model saved at: {model_path}")


Mean Cross-Validation Accuracy: 0.8385
Validation Accuracy on internal_valid_data: 0.8268
Model saved at: /content/drive/My Drive/titanic/model/svc_0130_1.pkl


### Make predictions on the actual test data

In [6]:
import numpy as np
import pandas as pd
import joblib
from scipy.special import expit  # Import sigmoid function for probability conversion

# ✅ Load the trained SVC model
svc_model_path = '/content/drive/My Drive/titanic/model/svc_0130_1.pkl'
svc_model = joblib.load(svc_model_path)

# ✅ Load the test dataset
test_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test = pd.read_csv(test_path)

# ✅ Keep PassengerId for CSV output and remove it from the feature columns
if 'PassengerId' in test.columns:
    passenger_ids = test['PassengerId']
    test = test.drop(columns=['PassengerId'])
else:
    print("Warning: 'PassengerId' column is missing. IDs will not be included in the output.")
    passenger_ids = np.arange(len(test))  # Assign dummy IDs

# ✅ Load scaler and apply the same transformation used during training
scaler_path = '/content/drive/My Drive/titanic/model/scaler_0130.pkl'
scaler = joblib.load(scaler_path)
test = scaler.transform(test)

# ✅ Predict the class label (0 = not survived, 1 = survived) directly.
# Passing decision_function() through a sigmoid and thresholding at 0.5 only
# tests the sign of the margin, which is what predict() already does; the
# sigmoid of an SVM margin is not a calibrated probability, so it is not used.
final_test_predictions = svc_model.predict(test).astype(int)

# ✅ Save the results as a CSV file
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': final_test_predictions
})

submission_path = '/content/drive/My Drive/titanic/submission/svc_submission0130_1.csv'
submission.to_csv(submission_path, index=False)

print(f"\nFinal test predictions have been successfully saved to: {submission_path}")



Final test predictions have been successfully saved to: /content/drive/My Drive/titanic/submission/svc_submission0130_1.csv


In [7]:
# Percentage of passengers predicted as non-survivors (0) and survivors (1)
predicted_share = submission['Survived'].value_counts(normalize=True)
survival_ratio = predicted_share.mul(100)

# Last expression of the cell: rendered as the cell output
survival_ratio

Unnamed: 0_level_0,proportion
Survived,Unnamed: 1_level_1
0,61.004785
1,38.995215
