# In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Load the training set from disk
data = pd.read_csv('train.csv')

# 'PassengerId' is not used as a feature
data = data.drop(columns=['PassengerId'])

# 'Title' is the text between the first comma and the following period of
# 'Name', with surrounding whitespace removed
after_comma = data['Name'].str.split(',').str[1]
data['Title'] = after_comma.str.split('.').str[0].str.strip()

# Missing cabins and the literal cabin "T" are both relabelled "X";
# every cabin is then reduced to its first character
unknown_cabin = data['Cabin'].isna() | data['Cabin'].eq("T")
data['Cabin'] = data['Cabin'].mask(unknown_cabin, "X").str[0]

# Define function to impute missing age values
def impute_age(cols):
    """Return the age from a row, filling a missing age from the class.

    Written for ``DataFrame.apply(impute_age, axis=1)`` over the columns
    ``['Age', 'Pclass']``.

    Args:
        cols: A two-item row with the age at position 0 and the passenger
            class at position 1. May be a pandas Series (any labels) or a
            plain positionally indexable sequence such as a list or tuple.

    Returns:
        The age unchanged when it is not null; otherwise 39 for class 1,
        29 for class 2 and 24 for any other class.
    """
    # ``cols[0]`` on a Series labelled 'Age'/'Pclass' relies on the
    # positional fallback of Series.__getitem__, deprecated in pandas 2.x.
    # ``.iloc`` is always positional; plain sequences have no ``.iloc`` and
    # are indexed directly.
    row = getattr(cols, 'iloc', cols)
    age = row[0]
    pclass = row[1]

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 39
    if pclass == 2:
        return 29
    return 24

# Apply impute_age function
data['Age'] = data[['Age', 'Pclass']].apply(impute_age, axis=1)

# Normalize Fare: divide by (SibSp + Parch + 1)
data['Fare'] = data['Fare'] / (data['SibSp'] + data['Parch'] + 1)

# Collapse the raw titles extracted from 'Name' into the groups listed in
# title_categorical below. Without this, OrdinalEncoder rejects raw values
# such as 'Master' or 'Dr' as unknown categories.
# NOTE(review): this grouping is an assumption about the intended buckets -
# confirm. Titles not listed here become NaN and are filled by the
# categorical pipeline's most-frequent imputer.
title_groups = {
    'Mr': 'Mr',
    'Miss': 'Miss', 'Mlle': 'Miss',
    'Mrs': 'Mrs', 'Mme': 'Mrs', 'Ms': 'Mrs',
    'Master': 'Kid',
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
    'Dr': 'Officer', 'Rev': 'Officer',
    'Don': 'Royalty', 'Dona': 'Royalty', 'Sir': 'Royalty',
    'Lady': 'Royalty', 'Jonkheer': 'Royalty', 'the Countess': 'Royalty',
    # Identity entries keep already-grouped values intact.
    'Kid': 'Kid', 'Officer': 'Officer', 'Royalty': 'Royalty',
}
data['Title'] = data['Title'].map(title_groups)

# Drop unnecessary columns
data.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket'], inplace=True)

# Separate features and target variable
X = data.drop(labels='Survived', axis=1)
y = data[['Survived']]

# Define categorical and numerical columns
categorical_cols = X.select_dtypes(include='O').columns
numerical_cols = X.select_dtypes(exclude='O').columns

# Define custom ranking for ordinal variables
sex_categorical = ['male', 'female']
cabin_categorical = ['X', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
embarked_categorical = ['S', 'C', 'Q']
title_categorical = ['Mr', 'Miss', 'Mrs', 'Kid', 'Officer', 'Royalty']

# OrdinalEncoder expects one category list per input column, in the same
# order as the columns it receives. Passing only the cabin list for several
# columns raises "Shape mismatch: ... has to be of shape (n_features,)".
# Looking the lists up by column name keeps them aligned with
# categorical_cols, and fails loudly (KeyError) on an unexpected column.
category_orders = {
    'Sex': sex_categorical,
    'Cabin': cabin_categorical,
    'Embarked': embarked_categorical,
    'Title': title_categorical,
}
ordinal_categories = [category_orders[col] for col in categorical_cols]

# Define pipelines
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=ordinal_categories)),
    # The one-hot step is fitted on the training split only; a rare category
    # absent from that split would make a later transform() raise unless
    # unknown values are ignored.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit and transform data
X_train_transformed = preprocessor.fit_transform(X_train)


# Error originally raised by the fit_transform call above:
# ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).