# Task 1: Data Cleaning & Preprocessing

This notebook demonstrates how to clean and preprocess data using the Titanic dataset.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to every plot in this notebook.
# set_theme() is the preferred interface; sns.set() is only an alias for it
# that seaborn documents as a candidate for removal.
sns.set_theme(style='whitegrid')


In [None]:

# Read the Titanic passenger records into a DataFrame.
# The path is relative, so the CSV must sit in the same directory as the notebook.
df = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')

# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)


In [None]:

# Basic information and statistics
# info() writes its report (dtypes, non-null counts, memory usage) straight to
# stdout and returns None, so it is called bare: wrapping it in print() would
# add a stray "None" line after the report.
df.info()

# Summary statistics for every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq).
df.describe(include='all')


In [None]:

# Check and handle missing values
print(df.isnull().sum())

# Fill Age with its median and Embarked with its mode (most frequent value).
# Both fill values are computed before anything is imputed, and a single
# fillna() call applies them; the frame is rebound instead of mutated in place.
fill_values = {
    'Age': df['Age'].median(),
    'Embarked': df['Embarked'].mode()[0],
}
df = df.fillna(fill_values)

# Drop Cabin (too many missing values).
# errors='ignore' means a second run of this cell does not raise KeyError for
# the column that the first run already removed.
df = df.drop(columns=['Cabin'], errors='ignore')


In [None]:

# Encode 'Sex' and 'Embarked'
# Both steps are guarded so the cell can be re-run safely: mapping an
# already-encoded Sex column would turn every value into NaN (0/1 are not keys
# of the mapping), and get_dummies would raise KeyError once Embarked has been
# replaced by its dummy columns.
sex_codes = {'male': 0, 'female': 1}
if not pd.api.types.is_numeric_dtype(df['Sex']):
    # Values that are not keys of sex_codes become NaN.
    df['Sex'] = df['Sex'].map(sex_codes)

if 'Embarked' in df.columns:
    # One-hot encode Embarked; drop_first=True omits the first category's
    # column, which is implied when all remaining dummy columns are 0/False.
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)


In [None]:

from sklearn.preprocessing import StandardScaler

# Columns to standardise; later cells reuse this list for plotting and
# outlier filtering.
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']

# Learn each column's mean and standard deviation, then overwrite the columns
# with their standardised (zero-mean, unit-variance) values.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])


In [None]:

# Boxplot for outlier visualization: one box per column in numeric_cols,
# drawn on an explicitly created 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[numeric_cols], ax=ax)
ax.set_title('Boxplot for Numerical Features')
plt.show()


In [None]:

# Remove outliers using IQR
# A row is kept only if, for every column in numeric_cols, its value lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
#
# Every column's fences are derived from the same, unfiltered frame and the
# per-column checks are AND-ed into a single mask, so the result does not
# depend on the order of numeric_cols. Filtering inside the loop would make
# each later column's quartiles depend on the rows that earlier columns had
# already removed.
#
# NOTE(review): when a column's IQR is 0, both fences equal its quartile value
# and every row holding any other value is dropped. Check whether that is
# intended for discrete count columns such as SibSp / Parch.
IQR_FACTOR = 1.5
keep = pd.Series(True, index=df.index)
for col in numeric_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    # between() is inclusive on both ends; NaN compares as False and is dropped.
    keep &= df[col].between(q1 - IQR_FACTOR * iqr, q3 + IQR_FACTOR * iqr)
df = df[keep]


In [None]:

# Final dataset check: print the column dtypes and non-null counts that remain
# after cleaning, then display the first five rows of the processed frame.
df.info()
df.head(n=5)
