In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV at the URL below into a DataFrame
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(filepath_or_buffer=url)

# Preview the top rows as a quick sanity check on the load
preview = data.head()
print(preview)

# Data Cleaning
# Report the number of missing values per column before any imputation
print(data.isnull().sum())

# Filling missing values
# The filled Series is assigned back to its column instead of calling
# fillna(inplace=True) on data[...]: the chained in-place form operates on an
# intermediate object, emits a FutureWarning in pandas 2.x and no longer
# updates `data` once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())
# mode() returns a Series (there can be ties); [0] takes its first entry
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
# 'Cabin' is removed entirely rather than imputed
data.drop(columns=['Cabin'], inplace=True)

# Verify that missing values are handled
print(data.isnull().sum())

# Exploratory Data Analysis (EDA)
# Print the descriptive statistics produced by describe()
summary_stats = data.describe()
print(summary_stats)

# One histogram with a KDE overlay per column, each on its own figure
for column_name, plot_title in (
    ('Age', 'Age Distribution'),
    ('Fare', 'Fare Distribution'),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(data[column_name], kde=True)
    plt.title(plot_title)
    plt.show()

# Count plots for the categorical columns, one figure per column
categorical_titles = {
    'Sex': 'Gender Distribution',
    'Pclass': 'Passenger Class Distribution',
    'Embarked': 'Port of Embarkation Distribution',
}
for category, heading in categorical_titles.items():
    plt.figure(figsize=(10, 6))
    sns.countplot(x=category, data=data)
    plt.title(heading)
    plt.show()

# Explore relationships between variables
# Bar plots of 'Survived' grouped by gender, then by passenger class
for group_col, chart_title in (
    ('Sex', 'Survival Rate by Gender'),
    ('Pclass', 'Survival Rate by Passenger Class'),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=group_col, y='Survived', data=data)
    plt.title(chart_title)
    plt.show()

# Overlaid age histograms for the Survived == 1 and Survived == 0 rows
survivor_ages = data.loc[data['Survived'] == 1, 'Age']
non_survivor_ages = data.loc[data['Survived'] == 0, 'Age']
plt.figure(figsize=(10, 6))
sns.histplot(survivor_ages, kde=True, color='green', label='Survived')
sns.histplot(non_survivor_ages, kde=True, color='red', label='Not Survived')
plt.title('Survival Rate by Age')
plt.legend()
plt.show()

# Correlation heatmap
# Correlation is only defined for numeric columns. DataFrame.corr() raises a
# ValueError on non-numeric columns in pandas >= 2.0 (numeric_only now
# defaults to False), so restrict the frame to numeric dtypes first.
numeric_data = data.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()