In [None]:
# 📌 TASK 1: ETL Pipeline using Titanic Dataset

# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
# Step 2: Load Dataset
# The CSV is read directly from the remote URL, so this cell needs network
# access. `df` is the working DataFrame used by every later step.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)
print("✅ Data loaded successfully")

# Preview the first five rows as the cell's displayed output.
df.head(5)


In [None]:
# Step 3: Check for Missing Values
# Per-column count of missing entries; displayed as the cell output.
df.isna().sum()


In [None]:
# Step 4: Fill Missing Values (Safe way without FutureWarning)
# Each column gets its own replacement value: the median for Age, the most
# frequent value for Embarked, and a fixed placeholder for Cabin.
# All replacements are computed up front, before any column is modified.
fill_values = {
    'Age': df['Age'].median(),
    'Embarked': df['Embarked'].mode()[0],
    'Cabin': 'Unknown',
}

# The filled Series is assigned back to the column rather than filled in place.
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

print("✅ Missing values handled")


In [None]:
# Step 5: Encode Categorical Features
# Each column gets its OWN LabelEncoder, stored in `encoders`, so the fitted
# label <-> integer mapping of every column stays available afterwards
# (e.g. encoders['Sex'].classes_ or encoders['Sex'].inverse_transform(...)).
# Re-fitting a single shared encoder would keep only the last column's mapping.
#
# NOTE: this cell expects the three columns to still hold strings. Running it
# a second time on the already-encoded `df` fails at the `.str` step because
# Cabin is numeric by then; reload `df` before re-running.

# Reduce Cabin to its first character before encoding (the 'Unknown'
# placeholder used for missing cabins therefore becomes 'U').
df['Cabin'] = df['Cabin'].str[0]

encoders = {}
for col in ['Sex', 'Embarked', 'Cabin']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# `le` remains bound to the encoder fitted last (Cabin) so that any later
# code referring to `le` keeps working.
le = encoders['Cabin']

print("✅ Categorical columns encoded")


In [None]:
# Step 6: Scale Numeric Features
# Standardize the listed columns with a StandardScaler (default settings).
# The fitted `scaler` is kept so the same transformation can be reused later.
numeric_cols = ['Age', 'Fare']
scaler = StandardScaler()

# Learn the scaling parameters from the columns, then write the scaled
# values back over the originals.
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

print("✅ Numeric columns scaled")


In [None]:
# Step 7: Export Cleaned Data
# Write the processed frame to a CSV in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv(path_or_buf="titanic_cleaned.csv", index=False)
print("✅ Cleaned data exported to titanic_cleaned.csv")
