In [1]:
import pandas as pd
import numpy as np

In [6]:
import pandas as pd
import seaborn as sns

# Load Titanic dataset from seaborn
df = sns.load_dataset("titanic")

print("Original Dataset Shape:", df.shape)
print(df.head())

# --- 1. Handle Missing Values ---
print("\nMissing values before cleaning:\n", df.isnull().sum())

# Fill numeric missing values (Age, Fare) with median
df["age"].fillna(df["age"].median(), inplace=True)
df["fare"].fillna(df["fare"].median(), inplace=True)

# Fill categorical missing values (embarked, embark_town) with mode
df["embarked"].fillna(df["embarked"].mode()[0], inplace=True)
df["embark_town"].fillna(df["embark_town"].mode()[0], inplace=True)

# Handle deck column: convert to string & replace NaN with "Unknown"
df["deck"] = df["deck"].astype(str).replace("nan", "Unknown")

# Drop irrelevant columns (optional, depends on your project)
df.drop(columns=["alive", "adult_male", "who", "alone"], inplace=True)

print("\nMissing values after cleaning:\n", df.isnull().sum())

# --- 2. Convert Categorical to Numeric ---
# Label Encoding (binary category: sex)
df["sex"] = df["sex"].map({"male": 0, "female": 1})

# One-Hot Encoding (multi-category: embarked)
df = pd.get_dummies(df, columns=["embarked"], drop_first=True)

print("\nDataset after encoding:\n", df.head())

# --- 3. Final Cleaned Dataset Info ---
print("\nFinal Dataset Shape:", df.shape)
print(df.info())

# --- 4. Save Cleaned Dataset ---
df.to_csv("titanic_cleaned.csv", index=False)
print("\nCleaned dataset saved as 'titanic_cleaned.csv'")


Original Dataset Shape: (891, 15)
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Missing values before cleaning:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["fare"].fillna(df["fare"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett