In [109]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [110]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from typing import List, Tuple

class DataLoader:
    def __init__(self, train_path: str, test_path: str):
        self.train_path = train_path
        self.test_path = test_path
    
    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        print("Loading data...", end=" ")
        train_data = pd.read_csv(self.train_path)
        test_data = pd.read_csv(self.test_path)
        print("Done.")
        return train_data, test_data

class FeatureEngineer:
    @staticmethod
    def extract_title(name: str) -> str:
        return name.split(',')[1].split('.')[0].strip()

    @staticmethod
    def group_titles(title: str) -> str:
        if title in ['Mr', 'Miss', 'Mrs', 'Master']:
            return title
        elif title in ['Dr', 'Rev', 'Col', 'Major', 'Mlle', 'Ms', 'Lady', 'Sir', 'Mme', 'Capt', 'Countess', 'Don', 'Jonkheer']:
            return 'Rare'
        else:
            return 'Other'
    
    @staticmethod
    def engineer_features(data: pd.DataFrame) -> pd.DataFrame:
        print("Starting feature engineering...", end=" ")
        data['Title'] = data['Name'].apply(FeatureEngineer.extract_title).apply(FeatureEngineer.group_titles)
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
        data['TicketFrequency'] = data.groupby('Ticket')['Ticket'].transform('count')
        data['FarePerPerson'] = data['Fare'] / data['FamilySize']
        
        # Age and Fare Binning
        data['AgeBin'] = pd.cut(data['Age'], bins=[0, 12, 18, 50, 80], labels=['Child', 'Teen', 'Adult', 'Senior'])
        data['FareBin'] = pd.qcut(data['Fare'], 4, labels=['Low', 'Medium', 'High', 'VeryHigh'])
        
        # Extract Deck from Cabin
        data['Deck'] = data['Cabin'].str[0]
        
        # Extract Ticket Prefix
        data['TicketPrefix'] = data['Ticket'].apply(lambda x: x.split()[0] if not x.split()[0].isdigit() else 'None')
        
        print("Done.")
        return data

class Preprocessor:
    @staticmethod
    def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
        print("Handling missing values...", end=" ")
        for column in ['Age', 'Embarked', 'Fare', 'Deck']:
            if column in data.columns:
                if column == 'Age':
                    data[column] = data[column].fillna(data[column].median())
                elif column == 'Embarked':
                    data[column] = data[column].fillna(data[column].mode()[0])
                elif column == 'Fare':
                    data[column] = data[column].fillna(data[column].median())
                elif column == 'Deck':
                    data[column] = data[column].fillna('Unknown')
        data = data.drop(columns=['Cabin', 'Name', 'Ticket'], errors='ignore')
        print("Done.")
        return data
    
    @staticmethod
    def encode_categorical(data: pd.DataFrame) -> pd.DataFrame:
        print("Encoding categorical variables...", end=" ")
        data = pd.get_dummies(data, columns=['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin', 'Deck', 'TicketPrefix'], dummy_na=True)
        print("Done.")
        return data

class ModelTrainer:
    def __init__(self, model, param_grid: dict):
        self.model = model
        self.param_grid = param_grid
        self.best_model = None
    
    def train(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        print("Starting model training with GridSearchCV...")
        grid_search = GridSearchCV(self.model, self.param_grid, cv=5, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        self.best_model = grid_search.best_estimator_
        print("Model training completed.")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_}")
    
    def validate(self, X_val: np.ndarray, y_val: np.ndarray) -> float:
        print("Validating model...", end=" ")
        y_pred = self.best_model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        print(f"Done. Validation Accuracy: {accuracy}")
        return accuracy
    
    def get_mispredictions(self, X_val: np.ndarray, y_val: np.ndarray, feature_names: List[str]) -> pd.DataFrame:
        print("Getting mispredictions...")
        y_pred = self.best_model.predict(X_val)
        X_val_df = pd.DataFrame(X_val, columns=feature_names)
        X_val_df['True'] = y_val
        X_val_df['Pred'] = y_pred
        mispredictions = X_val_df[X_val_df['True'] != X_val_df['Pred']]
        print("Mispredictions obtained.")
        return mispredictions

class TitanicPipeline:
    def __init__(self, train_path: str, test_path: str, output_path: str):
        self.train_path = train_path
        self.test_path = test_path
        self.output_path = output_path
        self.train_data = None
        self.test_data = None
        self.model_trainer = None
        self.scaler = StandardScaler()
    
    def run(self):
        print("Pipeline started...")
        # Load data
        data_loader = DataLoader(self.train_path, self.test_path)
        self.train_data, self.test_data = data_loader.load_data()
        
        # Feature engineering
        self.train_data = FeatureEngineer.engineer_features(self.train_data)
        self.test_data = FeatureEngineer.engineer_features(self.test_data)
        
        # Preprocess data
        self.train_data = Preprocessor.handle_missing_values(self.train_data)
        self.train_data = Preprocessor.encode_categorical(self.train_data)
        self.test_data = Preprocessor.handle_missing_values(self.test_data)
        self.test_data = Preprocessor.encode_categorical(self.test_data)
        
        # Align data
        print("Aligning train and test data...", end=" ")
        self.train_data, self.test_data = self.train_data.align(self.test_data, join='left', axis=1, fill_value=0)
        print("Done.")
        
        # Ensure no missing values after alignment
        print("Checking for missing values after alignment...", end=" ")
        if self.train_data.isnull().values.any():
            print("Found missing values in training data. Handling missing values again...", end=" ")
            self.train_data = self.train_data.fillna(0)
            print("Done.")
        if self.test_data.isnull().values.any():
            print("Found missing values in test data. Handling missing values again...", end=" ")
            self.test_data = self.test_data.fillna(0)
            print("Done.")
        print("No missing values found.")
        
        # Select features
        print("Selecting features...", end=" ")
        features = [col for col in self.train_data.columns if col not in ['PassengerId', 'Survived']]
        X = self.train_data[features]
        y = self.train_data['Survived']
        print("Done.")
        
        # Split data
        print("Splitting data into training and validation sets...", end=" ")
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        print("Done.")
        
        # Scale features
        print("Scaling features...", end=" ")
        X_train = self.scaler.fit_transform(X_train)
        X_val = self.scaler.transform(X_val)
        X_test = self.scaler.transform(self.test_data[features])
        print("Done.")
        
        # Train model with GridSearchCV
        param_grid = {
            'n_estimators': [200],
            'max_depth': [20],
            'min_samples_split': [2],
            'min_samples_leaf': [2],
            'max_features': ['log2'],
            'bootstrap': [False],
            'criterion': ['gini']
        }
        self.model_trainer = ModelTrainer(RandomForestClassifier(random_state=42), param_grid)
        self.model_trainer.train(X_train, y_train)
        self.model_trainer.validate(X_val, y_val)
        
        # Ensure no missing values in test data
        if np.isnan(X_test).any():
            print("Columns with NaN values in test data:", self.test_data.columns[self.test_data.isna().any()].tolist())
            raise ValueError("Test data contains NaN values after preprocessing.")

        # Predict on test data
        print("Generating predictions on test data...", end=" ")
        test_predictions = self.model_trainer.best_model.predict(X_test)
        print("Done.")
        
        # Save submission
        self.save_submission(test_predictions)
        print("Pipeline completed.")
        
        # Analyze mispredictions
        analysis_tool = AnalysisTool()
        mispredictions = analysis_tool.get_mispredictions(X_val, y_val, self.model_trainer, features)
        print("Mispredictions: \n", mispredictions)
    
    def save_submission(self, predictions: np.ndarray) -> None:
        print(f"Saving submission to {self.output_path}...", end=" ")
        submission = pd.DataFrame({
            'PassengerId': self.test_data['PassengerId'],
            'Survived': predictions
        })
        submission.to_csv(self.output_path, index=False)
        print(f"Done. Submission file saved as {self.output_path}")

class AnalysisTool:
    @staticmethod
    def get_mispredictions(X_val: np.ndarray, y_val: np.ndarray, model_trainer: ModelTrainer, feature_names: List[str]) -> pd.DataFrame:
        print("Analyzing mispredictions...")
        mispredictions = model_trainer.get_mispredictions(X_val, y_val, feature_names)
        return mispredictions

# Usage
pipeline = TitanicPipeline('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv', '/kaggle/working/submission.csv')
pipeline.run()


Pipeline started...
Loading data... Done.
Starting feature engineering... Done.
Starting feature engineering... Done.
Handling missing values... Done.
Encoding categorical variables... Done.
Handling missing values... Done.
Encoding categorical variables... Done.
Aligning train and test data... Done.
Checking for missing values after alignment... Found missing values in test data. Handling missing values again... Done.
No missing values found.
Selecting features... Done.
Splitting data into training and validation sets... Done.
Scaling features... Done.
Starting model training with GridSearchCV...
Model training completed.
Best parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score: 0.8356643356643356
Validating model... Done. Validation Accuracy: 0.8268156424581006
Generating predictions on test data... Done.
Saving submission to /kaggle/working/submi