# In [6]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


# Load the two Titanic CSV splits into DataFrames (paths are relative to the
# working directory the script/notebook is run from).
train_data, test_data = (
    pd.read_csv('../../dataset/titanic/train.csv'),
    pd.read_csv('../../dataset/titanic/test.csv'),
)


# Preprocessing and model definition.

# Numerical columns: fill missing values with the column median, then
# standardize with StandardScaler.
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_features = ['Sex', 'Embarked']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer. Columns that appear in
# neither list are not passed on to the classifier (ColumnTransformer's
# default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a random forest with a fixed seed
# so repeated runs give the same model.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Prepare the data: separate the target and drop the columns that are not in
# the preprocessor's feature lists.
X = train_data.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
y = train_data['Survived']

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (fits the preprocessing steps and the classifier together).
model.fit(X_train, y_train)

# Validate the model on hard class predictions.
y_pred = model.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_pred)

# Predicted probability for the second class in model.classes_
# (presumably Survived == 1 -- confirm the label encoding).
y_pred_proba = model.predict_proba(X_val)[:, 1]

# ROC AUC computed from the predicted probabilities.
roc_auc = roc_auc_score(y_val, y_pred_proba)

# ROC AUC computed from the hard 0/1 predictions instead of probabilities.
# NOTE(review): `roc_auc2` was referenced in the final expression but never
# assigned in this cell, so a fresh run raised NameError. This definition is
# an assumption about what the missing line computed -- confirm.
roc_auc2 = roc_auc_score(y_val, y_pred)

# Prepare test data (same dropped columns, no target present) and predict.
X_test = test_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
test_predictions = model.predict(X_test)

# Notebook-style display of the validation metrics.
validation_accuracy, roc_auc, roc_auc2


# Out[6]: (0.8100558659217877, 0.8922136422136423, 0.8001930501930502)