In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

titanic_data = pd.read_csv('titanic.csv')

titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
#checking missing values
titanic_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Age : 86 missing values
Fare : 1 missing value
Cabin : 327 missing values

In [7]:
#Handling 'Age' missing values
titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)

#Handling 'Cabin' missing values
titanic_data.drop(columns=['Cabin'], inplace=True)

#Handling 'Fare' missing values
titanic_data['Fare'].fillna(titanic_data['Fare'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)


KeyError: "['Cabin'] not found in axis"

In [8]:
# Check for duplicates
duplicate_rows = titanic_data.duplicated().sum()

# Remove duplicates if any
titanic_data.drop_duplicates(inplace=True)

# Display number of duplicate rows removed
duplicate_rows

0

In [9]:
# Survival rate
survival_rate = titanic_data['Survived'].mean() * 100
survival_rate

36.36363636363637

36.36% of passengers survived the Titanic disaster.

In [10]:
# Summary statistics for Age
age_stats = titanic_data['Age'].describe()
age_stats

count    418.000000
mean      29.599282
std       12.703770
min        0.170000
25%       23.000000
50%       27.000000
75%       35.750000
max       76.000000
Name: Age, dtype: float64

The average age of passengers is around 29 years.
Most passengers are aged between 20 and 40.

In [11]:
# Gender distribution in percentages
gender_distribution = titanic_data['Sex'].value_counts(normalize=True) * 100
gender_distribution

Sex
male      63.636364
female    36.363636
Name: proportion, dtype: float64

63.63% of passengers were male, and 36.36% were female.

In [12]:
# Embarkation distribution in percentages
embarked_distribution = titanic_data['Embarked'].value_counts(normalize=True) * 100
embarked_distribution

Embarked
S    64.593301
C    24.401914
Q    11.004785
Name: proportion, dtype: float64

Most passengers boarded at Southampton (64.59%).

In [13]:
# Survival rate by class
survival_rate_by_class = titanic_data.groupby('Pclass')['Survived'].mean() * 100
survival_rate_by_class

Pclass
1    46.728972
2    32.258065
3    33.027523
Name: Survived, dtype: float64

1st class passengers had the highest survival rate of 46.72%.
2nd class passengers had the lowest survival rate of 32.25%.

In [14]:
# Survival rate by gender
survival_rate_by_gender = titanic_data.groupby('Sex')['Survived'].mean() * 100
survival_rate_by_gender

Sex
female    100.0
male        0.0
Name: Survived, dtype: float64

All the Female passengers have survived and no male passengers have survived

In [15]:
# Survival rate by port of embarkation
survival_rate_by_embarkation = titanic_data.groupby('Embarked')['Survived'].mean() * 100
survival_rate_by_embarkation

Embarked
C    39.215686
Q    52.173913
S    32.592593
Name: Survived, dtype: float64

Passengers who boarded at Queenstown (Q) had the highest survival rate of 52.17%.

In [16]:
# Survival rate by age
survival_rate_by_age = titanic_data.groupby('Age')['Survived'].mean() * 100
survival_rate_by_age

Age
0.17     100.000000
0.33       0.000000
0.75       0.000000
0.83       0.000000
0.92     100.000000
            ...    
62.00      0.000000
63.00     50.000000
64.00     66.666667
67.00      0.000000
76.00    100.000000
Name: Survived, Length: 79, dtype: float64

Summary of Findings
Survival Rate: Overall, 36.36% of passengers survived the Titanic disaster.
Gender: All the Female passengers have survived and no male passengers have survived
Passenger Class:
1st class passengers had the highest survival rate of 46.72%.
2nd class passengers had the lowest survival rate of 32.25%.
Age: Younger passengers had a slightly higher survival rate.
Port of Embarkation: Passengers who boarded at Queenstown (Q) had the highest survival rate of 52.17%.