# Data Exploration Notebook

This notebook provides an example of how to explore and analyze data using the AI/ML Project Template.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import project modules
import sys
sys.path.append('../src')

from data.ingestion import load_data
from data.preprocessing import explore_data

# Set plotting style.
# matplotlib 3.6 deprecated the bare 'seaborn' style name in favour of
# 'seaborn-v0_8', and newer releases no longer accept the old name. Pick
# whichever name this installation actually provides so the cell runs on
# both old and new matplotlib versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('deep')

# Configure pandas display options:
# show every column, cap printed rows at 100, and let pandas auto-detect
# the output width instead of wrapping wide frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

In [None]:
# Load the raw dataset into `df`, the DataFrame used by every later cell.
# Replace 'your_data.csv' with the actual path to your data file
raw_data_path = '../data/raw/your_data.csv'
df = load_data(raw_data_path)

# Report the dataset's dimensions and its in-memory footprint
# (deep=True also counts the contents of object columns), then preview
# the first rows as the cell output.
print(f'Dataset shape: {df.shape}')
print(f'Dataset memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB')
df.head()

In [None]:
# Display dataset information
# DataFrame.info() prints a concise summary: the index range, each column's
# name, non-null count and dtype, plus the frame's memory usage.
df.info()

In [None]:
# Statistical summary of numerical columns
# DataFrame.describe() reports count, mean, std, min, quartiles and max.
# As the last expression in the cell, the resulting table is rendered as
# the cell output.
df.describe()

In [None]:
# Check for missing values: count the nulls per column and express each
# count as a percentage of the total number of rows.
row_count = len(df)
missing_values = df.isnull().sum()
missing_percent = 100 * missing_values / row_count

# One row per column, worst-affected columns first.
missing_data = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percent}
)
missing_data = missing_data.sort_values('Percentage', ascending=False)

# Show only the columns that actually contain missing values.
has_missing = missing_data['Missing Values'] > 0
missing_data[has_missing]

In [None]:
# Visualize missing values: one heatmap cell per DataFrame entry, coloured
# by whether the value is null. Row labels are suppressed (yticklabels=False).
null_mask = df.isnull()

plt.figure(figsize=(12, 6))
sns.heatmap(null_mask, cmap='viridis', cbar=True, yticklabels=False)
plt.title('Missing Values Heatmap')
plt.show()

In [None]:
# Distribution of numerical features: for every numeric column draw a
# histogram (with KDE overlay) next to a box plot.
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

if numerical_columns:
    # squeeze=False keeps `axes` two-dimensional even when there is exactly
    # one numerical column; without it plt.subplots(1, 2) returns a 1-D
    # array and axes[i, 0] raises IndexError.
    fig, axes = plt.subplots(
        len(numerical_columns),
        2,
        figsize=(15, 5 * len(numerical_columns)),
        squeeze=False,
    )

    for i, column in enumerate(numerical_columns):
        # Histogram
        sns.histplot(df[column], kde=True, ax=axes[i, 0])
        axes[i, 0].set_title(f'Distribution of {column}')

        # Box plot
        sns.boxplot(y=df[column], ax=axes[i, 1])
        axes[i, 1].set_title(f'Box Plot of {column}')

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(0, 2) is invalid, so skip plotting when the dataset has
    # no numerical columns instead of failing the cell.
    print('No numerical columns to plot.')

In [None]:
# Distribution of categorical features: for every object-dtype column draw
# a bar chart of its value frequencies and print the underlying counts.
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

for column in categorical_columns:
    plt.figure(figsize=(12, 6))
    value_counts = df[column].value_counts()
    sns.barplot(x=value_counts.index, y=value_counts.values)
    plt.title(f'Distribution of {column}')
    plt.xticks(rotation=45)
    plt.show()

    # The leading '\n' separates this text from the preceding output. It
    # must be an escape sequence: a literal line break inside a
    # single-quoted f-string is a SyntaxError.
    print(f'\n{column} value counts:')
    print(value_counts)

In [None]:
# Correlation matrix
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises on string/object columns.
# select_dtypes works on every pandas version and matches how the
# numerical columns are chosen elsewhere in this notebook.
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 10))
# center=0 anchors the diverging colormap at zero correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Pairplot for selected features (select a subset if you have many features)
# Only the first few numerical columns are used, because the grid grows
# quadratically with the number of features.
max_pairplot_features = 5  # Adjust as needed
selected_features = numerical_columns[:max_pairplot_features]

pairplot_data = df[selected_features]
sns.pairplot(pairplot_data)
plt.show()

In [None]:
# Save cleaned data (optional)
# Uncomment the line below to write the DataFrame to the processed-data
# folder; index=False omits the row index from the CSV.
# df.to_csv('../data/processed/cleaned_data.csv', index=False)

# Signal that the notebook ran through to the end.
print('Data exploration completed!')