In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Load the training data from train.csv
train_data = pd.read_csv('train.csv')

In [5]:
# Set male to 1 and female to 0
train_data['Sex'] = train_data['Sex'].map({'male': 1, 'female': 0})

In [6]:
# Exclude non-numeric columns by selecting only numeric columns
numeric_data = train_data.select_dtypes(include=[np.number])

# Compute correlation matrix
correlation_matrix = numeric_data.corr()

# Display correlation of Survived with other columns
survived_correlation = correlation_matrix['Survived'].sort_values(ascending=False)
print(survived_correlation)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Sex           -0.543351
Name: Survived, dtype: float64


# Ensemble Method

In [10]:
# Select features for model
X = train_data[['Fare', 'Pclass', 'Sex']]
y = train_data['Survived']

# Encode categorical data
le = LabelEncoder()
X.loc[:, 'Sex'] = le.fit_transform(X['Sex'])

# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Instances
logreg = LogisticRegression()
rf = RandomForestClassifier()
svm = SVC(probability=True)
xgb = XGBClassifier()

# Ensemble - Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('lr', logreg), 
    ('rf', rf), 
    ('svm', svm),
    ('xgb', xgb)],
    voting='soft')

# Train ensemble
voting_clf.fit(X_train, y_train)

# Evaluate ensemble
accuracy = voting_clf.score(X_test, y_test)
print(f'Ensemble accuracy: {accuracy}')

Ensemble accuracy: 0.8268156424581006


# Grid Search

In [14]:
# Test different hyper parameters to see if model can be optimized further
from sklearn.model_selection import GridSearchCV

param_grid = {
    'lr__C': [0.1, 1, 10],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 5, 10],
    'svm__C': [0.1, 1, 10],
    'xgb__n_estimators': [100, 200],
    'xgb__learning_rate': [0.01, 0.1]
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters: {'lr__C': 0.1, 'rf__max_depth': 10, 'rf__n_estimators': 200, 'svm__C': 1, 'xgb__learning_rate': 0.01, 'xgb__n_estimators': 200}
Best score: 0.8173643258150299
