In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [11]:
# Load the dataset into your preferred data analysis tool
df = pd.read_csv(r"/content/Titanic_Machine Learning from Disaster.csv")

## Task 1 – Data Cleaning

In [12]:
# Display the first five rows of the dataset to get an overview of the available columns and data types.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
#  Identify the total number of missing values in each column and provide a summary.
df.isnull().sum()

# Out of total 891 rows of Data, Age has 177 Null Values and Cabin has 687 Null Values along with 2 Null in Embarked Column.

'\nOut of total 891 rows of Data, Age has 177 Null Values and Cabin has 687 Null Values along with 2 Null in Embarked Column.\n'

In [15]:
# Data Cleaning:
"a. Handle missing values:"
# Determine the percentage of missing values in the "Age" column and decide on an appropriate strategy to handle them.

missing_age_percentage = round((df['Age'].isnull().sum() / len(df)) * 100,2)
print("Before Imputing",missing_age_percentage,'%')


"""
Here are a few common strategies:

Imputation with Mean/Median: If the percentage of missing values is relatively low, you can impute the missing ages with the mean or median age of the non-missing values. This is a simple approach to maintain data integrity.

Predictive Modeling: For a more advanced approach, you can use machine learning models to predict missing age values based on other features. This can capture more complex relationships in the data.

Dropping Rows: If the percentage of missing values is very high, you might consider dropping rows with missing age values. However, be cautious with this approach, as it can lead to a loss of valuable data if not done judiciously.

Categorization: You can categorize passengers into age groups (e.g., children, adults, seniors) and assign them an appropriate category based on available information.
"""
# I choose Impute with Median as it does no skew the data like mean.

# Calculate the median age
median_age = df['Age'].median()

# Impute missing values with the median age
df['Age'].fillna(median_age, inplace=True)

missing_age_percentage = round((df['Age'].isnull().sum() / len(df)) * 100,2)
print("After Imputing",missing_age_percentage,'%')

Before Imputing 19.87 %
After Imputing 0.0 %


In [16]:
# Replace missing values in the "Cabin" column with "Unknown."
df['Cabin'].fillna('Unknown', inplace=True)

# Verify that missing values have been replaced
print("Missing values in 'Cabin' column after replacement:", df['Cabin'].isnull().sum())

Missing values in 'Cabin' column after replacement: 0


In [17]:
# Remove rows with missing values in the "Embarked" column.
df.dropna(subset=['Embarked'], inplace=True)

# Verify that missing value rows have been removed
print("Missing values in 'Embarked' column after removal:", df['Embarked'].isnull().sum())

Missing values in 'Embarked' column after removal: 0


In [22]:
# Identify any inconsistencies in the "Sex" column and resolve them if necessary.
# Check unique values in the "Sex" column
unique_sex_values = df['Sex'].unique()
print("Unique values in 'Sex' column:", unique_sex_values)

df['Sex'].value_counts()
# Observation -  Males are More then Females

Unique values in 'Sex' column: ['male' 'female']


male      577
female    312
Name: Sex, dtype: int64

In [23]:
# Convert the "Fare" column to a numeric data type if it is currently stored as a string.

# Check the data type of the "Fare" column
print("Data type of 'Fare' column before conversion:", df['Fare'].dtype)

# Convert the "Fare" column to a numeric data type
df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')

# Check the data type of the "Fare" column after conversion
print("Data type of 'Fare' column after conversion:", df['Fare'].dtype)

Data type of 'Fare' column before conversion: float64
Data type of 'Fare' column after conversion: float64


In [25]:
# Group the "Age" column into bins (e.g., 0-9, 10-19, 20-29, etc.) and create a new column called "AgeGroup" to store the bin labels

# Define the age bins and labels
age_bins = [0, 9, 19, 29, 39, 49, 59, 69, 100]  # Define your age bins as needed
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+']  # Define labels for each bin

# Create a new column "AgeGroup" with the bin labels
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Display the DataFrame with the "AgeGroup" column
df[['Age', 'AgeGroup']]

Unnamed: 0,Age,AgeGroup
0,22.0,20-29
1,38.0,30-39
2,26.0,20-29
3,35.0,30-39
4,35.0,30-39
...,...,...
886,27.0,20-29
887,19.0,20-29
888,28.0,20-29
889,26.0,20-29
