In [3]:
# Cell 1: Imports and Loading Data
import pandas as pd
import numpy as np

# Fetch the Titanic passenger data straight from the raw CSV hosted on GitHub.
url = 'https://raw.githubusercontent.com/adityatomar2004/Task-5-Data-Cleaning/main/Titanic-Dataset.csv'
df = pd.read_csv(url)

# Preview the leading five records as a rich table.
print("--- First 5 Rows ---")
display(df.head(n=5))

# Structural summary: column names, non-null counts, dtypes and memory usage.
print("\n--- Dataset Info ---")
df.info()

--- First 5 Rows ---


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Cell 2: Check for Missing Values
# Tally the null entries per column, then report only the columns that
# actually contain at least one.
print("--- Missing Values Count ---")
missing_values = df.isna().sum()
print(missing_values.loc[lambda counts: counts > 0])

--- Missing Values Count ---
Age         177
Cabin       687
Embarked      2
dtype: int64


In [9]:
# Cell 3: Handling Missing Data
# The whole cleaning step is expressed as one non-mutating chain, so the cell
# gives the same result however many times it is executed.

# 'Embarked' is categorical: missing entries take the mode (most common value).
mode_embarked = df['Embarked'].mode()[0]

df = (
    df
    .assign(
        # 'Age' is numeric: missing entries take the median of the observed ages.
        Age=df['Age'].fillna(df['Age'].median()),
        Embarked=df['Embarked'].fillna(mode_embarked),
    )
    # 'Cabin' has too many missing values to be useful for basic analysis.
    # errors='ignore' makes the drop a no-op when the column is already gone
    # (e.g. on a re-run), so no explicit membership check is needed.
    .drop(columns=['Cabin'], errors='ignore')
)

# Verify cleaning
print("Remaining missing values:", df.isnull().sum().sum())

Remaining missing values: 0


In [10]:
# Cell 4: Removing Duplicates
# The raw dataset might not contain any repeated rows, so a copy of the first
# row is appended purely to give drop_duplicates() something to remove.
df = pd.concat(objs=[df, df.iloc[[0]]], ignore_index=True)

print(f"Row count before removing duplicates: {df.shape[0]}")

# Keep the first occurrence of every fully identical row and discard the rest.
df = df.drop_duplicates()

print(f"Row count after removing duplicates: {df.shape[0]}")

Row count before removing duplicates: 892
Row count after removing duplicates: 891


In [11]:
# Cell 5: Datatype Conversion
# Cast 'Age' from float to a whole-number dtype (e.g., 22.0 -> 22).
# Note: an integer cast discards any fractional part rather than rounding, and
# it requires the column to be free of missing values (filled in Cell 3).
df = df.astype({'Age': int})

# Confirm the new dtype, then show the first rows as the cell's output.
print(df.dtypes['Age'])
df.head()

int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,S


In [12]:
# Cell 6: Feature Engineering
# FamilySize counts the passenger plus everyone travelling with them:
# SibSp (siblings/spouse) + Parch (parents/children) + 1 for the passenger.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# Show the new column next to the inputs it was derived from.
family_columns = ['Name', 'SibSp', 'Parch', 'FamilySize']
display(df.loc[:, family_columns].head())

Unnamed: 0,Name,SibSp,Parch,FamilySize
0,"Braund, Mr. Owen Harris",1,0,2
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,2
2,"Heikkinen, Miss. Laina",0,0,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,2
4,"Allen, Mr. William Henry",0,0,1


In [13]:
# Cell 7: Save to CSV
# Write the cleaned frame to a relative path; index=False keeps the row index
# out of the file.
output_path = 'cleaned_titanic_data.csv'
df.to_csv(output_path, index=False)
print("File saved successfully!")

File saved successfully!
