In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Titanic preprocessing script: load the raw CSV, fill selected gaps, encode
# columns, drop age outliers, rescale, derive a feature, and save the result.

# Load dataset (read straight from the URL, so this step needs network access)
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
df = pd.read_csv(url)

# Handle missing values: median for age, most frequent value for embarked,
# and an explicit 'Unknown' label for deck. Only these three columns are
# filled; missing values in any other column are left as they are.
df.fillna({
    'age': df['age'].median(),
    'embarked': df['embarked'].mode()[0],
    'deck': 'Unknown'
}, inplace=True)

# Convert categorical columns to numerical.
# pclass is replaced by its category codes (integer positions of each value
# in the column's category list), so the original labels are not kept.
df['pclass'] = df['pclass'].astype('category').cat.codes
df['survived'] = df['survived'].astype(int)

# Remove outliers using Z-score: keep only rows whose age is strictly less
# than 3 standard deviations away from the mean age.
z_scores = np.abs((df['age'] - df['age'].mean()) / df['age'].std())
# .copy() turns the filtered selection into an independent DataFrame. Without
# it, the column assignments below write to a selection of the previous frame
# and pandas emits SettingWithCopyWarning.
df = df[z_scores < 3].copy()

# Normalize numeric columns: min-max scale fare and age. The scaler is fitted
# after the outlier filter, so its min/max come from the retained rows only.
scaler = MinMaxScaler()
df[['fare', 'age']] = scaler.fit_transform(df[['fare', 'age']])

# Feature Engineering: Create a new feature
# family_size = sibsp + parch + 1 (the + 1 presumably counts the passenger
# themselves — confirm against the dataset's column definitions).
df['family_size'] = df['sibsp'] + df['parch'] + 1

# Save preprocessed dataset (index=False keeps the row index out of the CSV)
df.to_csv('preprocessed_titanic.csv', index=False)
print("Preprocessing Completed and saved to preprocessed_titanic.csv")


Preprocessing Completed and saved to preprocessed_titanic.csv
