In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_data = pd.read_csv(url)

# Display basic information about the dataset
print("Dataset Shape:", titanic_data.shape)
print("Columns:", titanic_data.columns)
titanic_data.head()

Dataset Shape: (891, 12)
Columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Define numerical and categorical columns
numerical_features = ["Age", "Fare"]
categorical_features = ["Sex", "Embarked", "Pclass"]

# Define transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [4]:
# Define the full pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


In [5]:
# Define the target and features
X = titanic_data[["Age", "Fare", "Sex", "Embarked", "Pclass"]]
y = titanic_data["Survived"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

Training Data Shape: (712, 5)
Testing Data Shape: (179, 5)


In [6]:
# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.79

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.83      0.82       105
           1       0.75      0.74      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179



In [7]:
# Extract feature names after preprocessing
feature_names = pipeline.named_steps["preprocessor"].transformers_[0][2] + \
                list(pipeline.named_steps["preprocessor"].transformers_[1][1]["onehot"].get_feature_names_out())

# Extract feature importances
importances = pipeline.named_steps["classifier"].feature_importances_

# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

# Display the feature importance
print(feature_importance_df)

     Feature  Importance
1       Fare    0.299551
0        Age    0.273678
2  x0_female    0.164147
3    x0_male    0.125234
9       x2_3    0.062640
7       x2_1    0.024349
8       x2_2    0.017658
6       x1_S    0.014603
4       x1_C    0.010730
5       x1_Q    0.007410


In [8]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

Cross-Validation Accuracy: 0.81 ± 0.02


In [9]:
# Manual preprocessing
# Handle missing values
X_manual = X.copy()
X_manual["Age"].fillna(X_manual["Age"].median(), inplace=True)
X_manual["Embarked"].fillna(X_manual["Embarked"].mode()[0], inplace=True)

# Encode categorical variables
X_manual = pd.get_dummies(X_manual, columns=categorical_features, drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
X_manual[numerical_features] = scaler.fit_transform(X_manual[numerical_features])

# Split the manually preprocessed data
X_train_manual, X_test_manual, y_train_manual, y_test_manual = train_test_split(
    X_manual, y, test_size=0.2, random_state=42
)

# Train a RandomForestClassifier
manual_model = RandomForestClassifier(random_state=42)
manual_model.fit(X_train_manual, y_train_manual)

# Make predictions
y_pred_manual = manual_model.predict(X_test_manual)

# Evaluate the manually implemented pipeline
manual_accuracy = accuracy_score(y_test_manual, y_pred_manual)
print(f"Manual Pipeline Accuracy: {manual_accuracy:.2f}")

# Compare with the existing pipeline
print(f"Existing Pipeline Accuracy: {accuracy:.2f}")

Manual Pipeline Accuracy: 0.78
Existing Pipeline Accuracy: 0.79


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_manual["Age"].fillna(X_manual["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_manual["Embarked"].fillna(X_manual["Embarked"].mode()[0], inplace=True)
