# Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis on the processed dataset: it loads the data, shows summary statistics, plots the distribution of each numerical feature, and draws a correlation heatmap.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style.
# set_theme() is seaborn's preferred interface; sns.set() is only a legacy
# alias for it, so the resulting style is unchanged.
sns.set_theme(style='whitegrid')

In [2]:
# Read the processed dataset from disk into a DataFrame
data_path = '../data/processed/your_processed_data.csv'  # Update with your processed data path
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as the cell output
df.head(n=5)

In [3]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the dataframe
df.describe(percentiles=[0.25, 0.5, 0.75])

In [4]:
# Visualize distributions of numerical features.
# 'number' selects every numeric dtype (int32, float32, ...), not only the
# 64-bit ones, so no numeric column is silently skipped.
numerical_features = df.select_dtypes(include='number').columns
for feature in numerical_features:
    # One explicit figure/axes pair per feature instead of pyplot's implicit
    # "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[feature], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()
    # Release the figure so that datasets with many columns do not pile up
    # open figures in memory.
    plt.close(fig)

In [5]:
# Correlation heatmap.
# Correlate only numeric/boolean columns. Since pandas 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises ValueError if the frame contains
# string columns; selecting the columns explicitly works on old and new pandas.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

if correlation_matrix.empty:
    # Nothing to correlate; sns.heatmap would fail on an empty matrix.
    print('No numeric columns available for a correlation heatmap.')
else:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        square=True,
        ax=ax,
    )
    ax.set_title('Correlation Heatmap')
    plt.show()