# Titanic Dataset EDA - Complete Analysis

## 1. Import Libraries

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Notebook-wide display defaults: grid-backed seaborn theme, 10x6 inch figures,
# and no truncation of wide DataFrames when they are rendered.
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(10, 6))
pd.options.display.max_columns = None

## 2. Load and Inspect Data

In [ ]:
# Fetch seaborn's bundled Titanic dataset and preview the first five rows.
titanic = sns.load_dataset(name='titanic')
titanic.head(n=5)

In [ ]:
# Report the (rows, columns) dimensions, then the per-column dtype and
# non-null summary.
n_rows, n_cols = titanic.shape
print("Dataset shape:", (n_rows, n_cols))
titanic.info()

In [ ]:
# Summary statistics for every column; include='all' covers the non-numeric
# columns as well as the numeric ones.
titanic.describe(include='all')

In [ ]:
# Number of missing values in each column.
titanic.isna().sum()

## 3. Data Cleaning

In [ ]:
# Median age for each (pclass, sex) combination; the result is a Series with a
# two-level index that serves as the lookup table for filling missing ages.
age_by_class_and_sex = titanic.groupby(['pclass', 'sex'])['age']
median_ages = age_by_class_and_sex.median()

def fill_age(row, medians=None):
    """Return the row's age, substituting a group median when it is missing.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) exposing the keys
            'age', 'pclass' and 'sex'.
        medians: Optional Series of median ages indexed by (pclass, sex).
            When omitted, the notebook-level ``median_ages`` Series is used,
            so ``titanic.apply(fill_age, axis=1)`` keeps working unchanged.

    Returns:
        ``row['age']`` when it is present, otherwise the median age stored
        for the row's (pclass, sex) pair.

    Raises:
        KeyError: If the (pclass, sex) pair is absent from the lookup table.
    """
    if not pd.isnull(row['age']):
        return row['age']
    if medians is None:
        medians = median_ages
    # One label-based lookup with the full key avoids building an intermediate
    # per-class Series through chained [] indexing on every row.
    return medians.loc[(row['pclass'], row['sex'])]

# Replace missing ages row by row using fill_age.
titanic['age'] = titanic.apply(fill_age, axis=1)

# Missing embarkation ports take the most frequent port.
embarked_mode = titanic['embarked'].mode()[0]
titanic['embarked'] = titanic['embarked'].fillna(embarked_mode)

# Remove 'deck' and 'embark_town'. errors='ignore' keeps this cell re-runnable:
# without it, a second execution raises KeyError because the columns are
# already gone.
titanic.drop(['deck', 'embark_town'], axis=1, inplace=True, errors='ignore')

# Confirm what is still missing after cleaning.
titanic.isnull().sum()

## 4. Univariate Analysis

In [ ]:
# Passenger counts for each survival outcome.
survived_ax = sns.countplot(data=titanic, x='survived')
survived_ax.set_title('Survival Distribution (0 = Died, 1 = Survived)')
plt.show()

In [ ]:
# Passenger counts for each ticket class.
pclass_ax = sns.countplot(data=titanic, x='pclass')
pclass_ax.set_title('Passenger Class Distribution')
plt.show()

In [ ]:
# Passenger counts for each value of the 'sex' column.
sex_ax = sns.countplot(data=titanic, x='sex')
sex_ax.set_title('Gender Distribution')
plt.show()

In [ ]:
# 30-bin histogram of age with a kernel-density overlay.
age_ax = sns.histplot(titanic['age'], kde=True, bins=30)
age_ax.set_title('Age Distribution')
plt.show()

In [ ]:
# 30-bin histogram of fare with a kernel-density overlay.
fare_ax = sns.histplot(titanic['fare'], kde=True, bins=30)
fare_ax.set_title('Fare Distribution')
plt.show()

## 5. Bivariate Analysis

In [ ]:
# Bar height is the mean of the 0/1 'survived' flag, i.e. the survival rate,
# for each ticket class.
class_rate_ax = sns.barplot(data=titanic, x='pclass', y='survived')
class_rate_ax.set_title('Survival Rate by Passenger Class')
class_rate_ax.set_ylabel('Survival Rate')
plt.show()

In [ ]:
# Bar height is the mean of the 0/1 'survived' flag for each value of 'sex'.
sex_rate_ax = sns.barplot(data=titanic, x='sex', y='survived')
sex_rate_ax.set_title('Survival Rate by Gender')
sex_rate_ax.set_ylabel('Survival Rate')
plt.show()

In [ ]:
# Compare the age distribution of the two survival outcomes.
age_box_ax = sns.boxplot(data=titanic, x='survived', y='age')
age_box_ax.set_title('Age Distribution by Survival Status')
plt.show()

In [ ]:
# Compare the fare distribution of the two survival outcomes.
fare_box_ax = sns.boxplot(data=titanic, x='survived', y='fare')
fare_box_ax.set_title('Fare Distribution by Survival Status')
plt.show()

## 6. Multivariate Analysis

In [ ]:
# Build a numerically encoded copy for the correlation matrix; `titanic` itself
# keeps its string-valued 'sex' and 'embarked' columns.
# NOTE: the 'embarked' codes are arbitrary labels, so the sign and size of its
# correlation coefficients depend on this particular numbering.
titanic_encoded = titanic.assign(
    sex=titanic['sex'].map({'male': 0, 'female': 1}),
    embarked=titanic['embarked'].map({'S': 0, 'C': 1, 'Q': 2}),
)

numerical_cols = [
    'survived', 'pclass', 'age', 'sibsp',
    'parch', 'fare', 'sex', 'embarked',
]
numeric_view = titanic_encoded[numerical_cols]
corr_matrix = numeric_view.corr()

# Diverging palette centred on zero, with each cell annotated to two decimals.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap')
plt.show()

In [ ]:
# Pairwise scatter/density grid of the numeric columns, coloured by survival.
# The columns are listed here explicitly instead of slicing `numerical_cols`
# from the correlation cell, so this cell runs on its own and does not change
# silently if that list is reordered.
pairplot_cols = ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
sns.pairplot(titanic[pairplot_cols], hue='survived')
# y=1.02 places the figure-level title just above the grid.
plt.suptitle('Pairplot of Numerical Features Colored by Survival', y=1.02)
plt.show()

In [ ]:
# Survival rate (mean of the 0/1 flag) per ticket class, split by 'sex'.
class_sex_ax = sns.barplot(data=titanic, x='pclass', y='survived', hue='sex')
class_sex_ax.set_title('Survival Rate by Class and Gender')
class_sex_ax.set_ylabel('Survival Rate')
plt.show()

In [ ]:
# Age distribution per ticket class, split by survival outcome.
class_age_ax = sns.boxplot(data=titanic, x='pclass', y='age', hue='survived')
class_age_ax.set_title('Age Distribution by Class and Survival Status')
plt.show()

## 7. Feature Engineering

In [ ]:
# Family size counts siblings/spouses and parents/children plus the passenger.
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1

# Age bands: up to 12, 13-18, 19-30, 31-50 and 51-100 (right-inclusive bins).
# include_lowest=True closes the first bin on the left, so an age of exactly 0
# is labelled 'Child' instead of falling outside every bin and becoming NaN.
titanic['age_group'] = pd.cut(titanic['age'],
                             bins=[0, 12, 18, 30, 50, 100],
                             labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'],
                             include_lowest=True)

# Fare quartiles, labelled from cheapest to most expensive.
titanic['fare_group'] = pd.qcut(titanic['fare'], 4,
                               labels=['Low', 'Medium', 'High', 'Very High'])

In [ ]:
# Preview the three engineered columns for the first five rows.
titanic.loc[:, ['family_size', 'age_group', 'fare_group']].head()

In [ ]:
# Survival rate (mean of the 0/1 flag) for each family size.
family_ax = sns.barplot(data=titanic, x='family_size', y='survived')
family_ax.set_title('Survival Rate by Family Size')
family_ax.set_ylabel('Survival Rate')
plt.show()

In [ ]:
# Survival rate (mean of the 0/1 flag) for each age band.
age_group_ax = sns.barplot(data=titanic, x='age_group', y='survived')
age_group_ax.set_title('Survival Rate by Age Group')
age_group_ax.set_ylabel('Survival Rate')
plt.show()

## 8. Key Findings Summary

In [ ]:
# 'survived' is a 0/1 flag, so the mean over any subset is that subset's
# survival rate.
outcome = titanic['survived']

survival_rate = outcome.mean()
female_survival = outcome[titanic['sex'] == 'female'].mean()
male_survival = outcome[titanic['sex'] == 'male'].mean()
class1_survival = outcome[titanic['pclass'] == 1].mean()
child_survival = outcome[titanic['age_group'] == 'Child'].mean()

# Headline rates, formatted as percentages with two decimals.
print(f"Overall survival rate: {survival_rate:.2%}")
print(f"Female survival rate: {female_survival:.2%}")
print(f"Male survival rate: {male_survival:.2%}")
print(f"First class survival rate: {class1_survival:.2%}")
print(f"Child survival rate: {child_survival:.2%}")

### Main Insights:
1. **Overall Survival**: Only 38% of passengers survived  
2. **Gender Difference**: Females had 74% survival vs 19% for males  
3. **Class Impact**: 1st class had 63% survival vs 24% for 3rd class  
4. **Age Factor**: Children (age 12 and under) had the highest survival rate of any age group (59%)  
5. **Family Size**: Passengers with a family size of 2-4 had better survival than those travelling alone or in larger families  
6. **Fare Impact**: Higher fare groups had better survival rates  
7. **Correlations**: Strongest correlations with survival were sex (0.54, with female encoded as 1) and pclass (-0.34)