In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Load the dataset
titanic = pd.read_csv('train.csv')

# Display the first few rows of the dataset
html_text = "<h1>Sample Data from the Dataset</h1>"
display(HTML(html_text))
display(titanic.head().style.set_table_styles(
    [
        {'selector': 'thead th', 'props': [('background-color', '#4CAF50'), ('color', 'white'), ('font-size', '14px')]},
        {'selector': 'tbody tr:nth-child(odd)', 'props': [('background-color', '#f2f2f2')]},
        {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', 'white')]},
        {'selector': 'td', 'props': [('padding', '10px'), ('text-align', 'center'), ('border', '1px solid #ddd')]},
    ]
).set_properties(**{'font-family': 'Arial', 'font-size': '12px'}))

# Data Cleaning
# Check for missing values
html_text = "<h1>Missing Values</h1>"
display(HTML(html_text))
missing_values = titanic.isnull().sum()

# Create a DataFrame for missing values
missing_df = pd.DataFrame({'Column Name': missing_values.index, 'Missing values': missing_values.values})
display(missing_df.style.set_table_styles(
    [
        {'selector': 'thead th', 'props': [('background-color', '#4CAF50'), ('color', 'white'), ('font-size', '14px')]},
        {'selector': 'tbody tr:nth-child(odd)', 'props': [('background-color', '#f2f2f2')]},
        {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', 'white')]},
        {'selector': 'td', 'props': [('padding', '10px'), ('text-align', 'center'), ('border', '1px solid #ddd')]},
    ]
).set_properties(**{'font-family': 'Arial', 'font-size': '12px'}))

# Fill missing values
titanic['Age'].fillna(titanic['Age'].median(), inplace=True)
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)
titanic.drop(columns=['Cabin'], inplace=True)  # Dropping 'Cabin' due to too many missing values

# Verify that there are no more missing values
html_text = "<h1>Missing Values after cleaning</h1>"
display(HTML(html_text))
missing_values = titanic.isnull().sum()

# Create a DataFrame for missing values
missing_df = pd.DataFrame({'Column Name': missing_values.index, 'Missing values': missing_values.values})

display(missing_df.style.set_table_styles(
    [
        {'selector': 'thead th', 'props': [('background-color', '#4CAF50'), ('color', 'white'), ('font-size', '14px')]},
        {'selector': 'tbody tr:nth-child(odd)', 'props': [('background-color', '#f2f2f2')]},
        {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', 'white')]},
        {'selector': 'td', 'props': [('padding', '10px'), ('text-align', 'center'), ('border', '1px solid #ddd')]},
    ]
).set_properties(**{'font-family': 'Arial', 'font-size': '12px'}))


# Exploratory Data Analysis (EDA)
# Summary statistics
html_text = "<h1>Descriptive Statistics</h1>"
display(HTML(html_text))
display(titanic.describe().style.set_table_styles(
    [
        {'selector': 'thead th', 'props': [('background-color', '#4CAF50'), ('color', 'white'), ('font-size', '14px')]},
        {'selector': 'tbody tr:nth-child(odd)', 'props': [('background-color', '#f2f2f2')]},
        {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', 'white')]},
        {'selector': 'td', 'props': [('padding', '10px'), ('text-align', 'center'), ('border', '1px solid #ddd')]},
    ]
).set_properties(**{'font-family': 'Arial', 'font-size': '12px'}))
html_text = "<h1>Exploratory Data Analysis (EDA)</h1>"
display(HTML(html_text))
# Visualize the distribution of numerical features
sns.histplot(titanic['Age'], kde=True)
plt.title('Age Distribution')
plt.show()

sns.histplot(titanic['Fare'], kde=True)
plt.title('Fare Distribution')
plt.show()

# Visualize the relationship between variables
sns.countplot(x='Survived', data=titanic)
plt.title('Survival Count')
plt.show()

sns.countplot(x='Pclass', hue='Survived', data=titanic)
plt.title('Survival Count by Passenger Class')
plt.show()

sns.countplot(x='Sex', hue='Survived', data=titanic)
plt.title('Survival Count by Gender')
plt.show()

# Correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(titanic.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


ImportError: Missing optional dependency 'Jinja2'. DataFrame.style requires jinja2. Use pip or conda to install Jinja2.