In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.metrics import accuracy_score
import re

# Load the data
train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        
        # Extract title from Name
        X_['Title'] = X_['Name'].apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))
        title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7, "Major": 7, "Mlle": 8, "Countess": 8, "Ms": 2, "Lady": 8, "Jonkheer": 8, "Don": 9, "Dona": 9, "Mme": 8,"Capt": 7, "Sir": 9}
        X_['Title'] = X_['Title'].map(title_mapping)
        
        # Family size
        X_['FamilySize'] = X_['SibSp'] + X_['Parch'] + 1
        
        # Is alone
        X_['IsAlone'] = (X_['FamilySize'] == 1).astype(int)
        
        # Bin fare
        X_['FareBin'] = pd.qcut(X_['Fare'], 5, labels=False)
        
        # Bin age
        X_['AgeBin'] = pd.cut(X_['Age'], 5, labels=False)
        
        # Drop unnecessary columns
        X_ = X_.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
        
        return X_

class DataPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        
        # Convert categorical variables to numeric
        X_['Sex'] = X_['Sex'].map({'female': 0, 'male': 1})
        X_['Embarked'] = X_['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
        
        return X_

class EnsembleClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.rf = RandomForestClassifier(n_estimators=100, random_state=42)
        self.gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
        self.svm = SVC(probability=True, random_state=42)
        self.lr = LogisticRegression(random_state=42)

    def fit(self, X, y):
        self.rf.fit(X, y)
        self.gb.fit(X, y)
        self.svm.fit(X, y)
        self.lr.fit(X, y)
        return self

    def predict(self, X):
        predictions = np.column_stack([
            self.rf.predict_proba(X)[:, 1],
            self.gb.predict_proba(X)[:, 1],
            self.svm.predict_proba(X)[:, 1],
            self.lr.predict_proba(X)[:, 1]
        ])
        return (predictions.mean(axis=1) > 0.5).astype(int)

# Create the pipeline
pipeline = Pipeline([
    ('feature_engineer', FeatureEngineer()),
    ('preprocessor', DataPreprocessor()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('feature_selector', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('classifier', EnsembleClassifier())
])

# Prepare the data
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# Perform cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Fit the pipeline on all training data
pipeline.fit(X, y)

# Make predictions on test data
test_predictions = pipeline.predict(test_data)

# Create submission file
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": test_predictions
})
submission.to_csv('submission.csv', index=False)
print("Submission file created.")

Cross-validation scores: [0.81005587 0.80898876 0.82022472 0.7752809  0.82022472]
Mean accuracy: 0.8070 (+/- 0.0331)
Submission file created.
