Data Cleaning Notebook

In [None]:
import pandas as pd
import numpy as np

# Display full tables: lift every pandas truncation limit (rows, columns,
# overall width, per-cell width) by setting each option to None.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [None]:
# Load the raw CSV and remember its original (rows, columns) shape so the
# final cell can report how many rows the cleaning steps removed.
RAW_CSV_PATH = "../data/raw_dataset.csv"
df = pd.read_csv(RAW_CSV_PATH)
before_shape = df.shape

print("Initial shape:", before_shape)
# Bare last expression: the first rows are rendered as the cell output.
df.head()

In [None]:
# Per-column dtype and non-null count summary of the current frame.
info_heading = "DataFrame Info:"
print(info_heading)
df.info()

In [None]:
# Compute the missing-value count and percentage for every column in one
# vectorised pass, then print one line per column in column order.
total_rows = df.shape[0]
missing_counts = df.isnull().sum()
missing_pcts = (missing_counts / total_rows * 100).round(2)

print("Missing values per column (full list):")
for col in df.columns:
    print(f"{col}: {missing_counts.loc[col]} missing ({missing_pcts.loc[col]}%)")

In [None]:
# Flag every row that exactly repeats an earlier row (the first occurrence
# is not flagged), then count the flags.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("Duplicate rows count:", duplicates)

In [None]:
# include='all' asks describe() to summarise every column, not only the
# numeric ones; the resulting table is rendered with rich display.
summary_stats = df.describe(include='all')
print("Summary statistics (numeric + object):")
display(summary_stats)

In [None]:
# Normalise placeholder cells to NaN. The pattern matches a cell that is
# empty, whitespace-only, or a single '?' with optional surrounding
# whitespace (e.g. ' ?'), so padded placeholders are caught as well instead
# of surviving as text until the later strip step. With regex=True only
# string values are examined; numeric cells are left untouched.
# Rebinding `df` (rather than inplace=True) keeps the step explicit.
MISSING_PLACEHOLDER_PATTERN = r'^\s*\??\s*$'
df = df.replace(MISSING_PLACEHOLDER_PATTERN, np.nan, regex=True)
print("Normalized '?' and empty strings to NaN (if present).")

In [None]:
# Fill numeric columns
if 'Age' in df.columns:
    # Coerce to numeric first: if the column was loaded as text (for example
    # because the raw file used a '?' placeholder), median() would fail.
    # errors='coerce' turns any unparseable entry into NaN, which is then
    # imputed with the median like every other missing value.
    age_numeric = pd.to_numeric(df['Age'], errors='coerce')
    df['Age'] = age_numeric.fillna(age_numeric.median())

# Fill categorical columns
if 'Embarked' in df.columns:
    embarked_modes = df['Embarked'].mode()
    # mode() ignores NaN, so it is empty when every value is missing; in that
    # case there is nothing to impute with and the column is left as-is.
    if not embarked_modes.empty:
        mode_value = embarked_modes.iloc[0]
        df['Embarked'] = df['Embarked'].fillna(mode_value)

# Drop 'Cabin' when more than half of its values are missing
# (only this column is checked; other sparse columns are kept).
if 'Cabin' in df.columns and df['Cabin'].isnull().mean() > 0.5:
    df = df.drop(columns=['Cabin'])
    print("Dropped 'Cabin' column (too many missing values).")

# Report what is still missing after imputation.
print("Remaining missing values per column:")
for col in df.columns:
    print(f"{col}: {df[col].isnull().sum()} missing")

In [None]:
# Strip whitespace and normalize strings.
# astype(str) would turn NaN into the literal text 'nan', hiding missing
# values from every later isnull() check, so the original missing positions
# are restored to NaN after stripping.
for c in df.select_dtypes(include=['object']).columns:
    stripped = df[c].astype(str).str.strip()
    df[c] = stripped.where(df[c].notna())

if 'Sex' in df.columns:
    df['Sex'] = df['Sex'].str.lower()
if 'Embarked' in df.columns:
    df['Embarked'] = df['Embarked'].str.upper()

# Remove duplicates only after normalisation, so rows that differ solely in
# surrounding whitespace or letter case are recognised as duplicates.
pre_rows = df.shape[0]
df = df.drop_duplicates()
print("Duplicates removed:", pre_rows - df.shape[0])

In [None]:
# Clip every numeric column to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
skipped_cols = []

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # With IQR == 0 both fences equal Q1, so clipping would overwrite the
    # whole column with a single constant. `not IQR > 0` is also true when
    # the IQR is NaN (all-missing column). Leave such columns unchanged.
    if not IQR > 0:
        skipped_cols.append(col)
        continue
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower, upper=upper)

print("Applied IQR clipping to numeric columns.")
if skipped_cols:
    print("Skipped columns with zero or undefined IQR:", skipped_cols)

In [None]:
# Compare the row count recorded at load time with the current one, then
# write the cleaned frame to disk without the index column.
rows_before = before_shape[0]
rows_after = len(df)
print("Rows before cleaning:", rows_before)
print("Rows after cleaning:", rows_after)

CLEANED_CSV_PATH = "../data/cleaned_dataset.csv"
df.to_csv(CLEANED_CSV_PATH, index=False)
print(f"Saved cleaned dataset to {CLEANED_CSV_PATH}")