# Exploratory Data Analysis (EDA) Template

This notebook provides a reusable template for performing EDA on any dataset, particularly suitable for Kaggle datasets.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

sns.set(style='whitegrid')
%matplotlib inline

## Load the Dataset

In [None]:
# Load the raw data into a DataFrame.
# NOTE: 'your_dataset.csv' is a placeholder -- substitute the path to your own file.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')
# Preview the first five rows (the default for `head`).
df.head(n=5)

## Basic Dataset Information

In [None]:
# Print the per-column overview (non-null counts and dtypes).
df.info()
# Summary statistics, transposed so that each row describes one column.
df.describe().transpose()

## Missing Values

In [None]:
# Count missing entries per column and keep only the columns that have at least one.
missing = df.isna().sum().loc[lambda counts: counts > 0]
# Display them from the most to the fewest missing values.
missing.sort_values(ascending=False)

## Data Types Overview

In [None]:
# Count how many columns there are of each dtype.
df.dtypes.value_counts()

## Data Cleaning and Preprocessing

### Handle Duplicate Rows
# Report how many rows are flagged as duplicates, then drop them from `df` in place.
n_duplicates = df.duplicated().sum()
print(f'Number of duplicate rows: {n_duplicates}')
df.drop_duplicates(inplace=True)
print(f'Number of rows after dropping duplicates: {len(df)}')

### Outlier Detection and Treatment (Numerical Features)
# Identify the column groups used by the analysis sections that follow.
# They have to exist before their first use, otherwise running the notebook
# top-to-bottom stops here with a NameError.
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(include='object').columns

# Using Box Plots for visualization: one figure per numerical column,
# so points beyond the whiskers (potential outliers) are easy to spot.
for col in num_cols:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df[col])
    plt.title(f'Box Plot of {col}')
    plt.show()

# Example: Outlier treatment using IQR method (for a specific column)
# Q1 = df['your_numerical_column'].quantile(0.25)
# Q3 = df['your_numerical_column'].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# df = df[(df['your_numerical_column'] >= lower_bound) & (df['your_numerical_column'] <= upper_bound)]

### Data Type Conversion (if needed)
# Example: df['column_name'] = pd.to_datetime(df['column_name'])

## Advanced Univariate Analysis

### Numerical Feature Statistics
# Print the skewness, then the kurtosis, of every numerical column under one heading.
numeric_frame = df[num_cols]
print('\nSkewness and Kurtosis for Numerical Features:')
print(numeric_frame.skew())
print(numeric_frame.kurtosis())

### Categorical Feature Distributions
for col in cat_cols:
    # Order the bars from the most to the least frequent category.
    category_order = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(y=df[col], order=category_order, palette='viridis', ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel('Count')
    ax.set_ylabel(col)
    plt.show()

    # The same distribution expressed as percentages instead of raw counts.
    print(f'\nPercentage distribution for {col}:')
    print(df[col].value_counts(normalize=True).mul(100))

## Bivariate Analysis

### Numerical vs. Numerical Features
# Pair plot for selected numerical features
if len(num_cols) >= 2:
    # Slicing clamps to the columns available, so at most five are plotted
    # (kept small for readability).
    pairplot_cols = num_cols[:5]
    sns.pairplot(df[pairplot_cols])
    plt.suptitle('Pair Plot of Numerical Features', y=1.02)
    plt.show()

# Scatter plots with regression line (first numerical column against the second)
if len(num_cols) >= 2:
    x_col, y_col = num_cols[0], num_cols[1]
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.regplot(x=df[x_col], y=df[y_col], ax=ax)
    ax.set_title(f'Scatter Plot with Regression Line: {x_col} vs {y_col}')
    plt.show()

### Categorical vs. Numerical Features
for cat_col in cat_cols:
    for num_col in num_cols:
        # Draw the same category/value pairing twice:
        # first as a box plot, then as a violin plot.
        plot_specs = (
            (sns.boxplot, f'{num_col} by {cat_col}'),
            (sns.violinplot, f'Violin Plot of {num_col} by {cat_col}'),
        )
        for draw, title in plot_specs:
            plt.figure(figsize=(10, 6))
            draw(x=df[cat_col], y=df[num_col], palette='pastel')
            plt.title(title)
            # Tilt the category labels so long names do not overlap.
            plt.xticks(rotation=45)
            plt.show()

### Categorical vs. Categorical Features
for position, cat1 in enumerate(cat_cols):
    # Pair each column only with the ones after it, so every unordered pair
    # of categorical columns is analysed exactly once.
    for cat2 in cat_cols[position + 1:]:
        # Cross-tabulation
        print(f'\nCross-tabulation of {cat1} and {cat2}:')
        cross_tab = pd.crosstab(df[cat1], df[cat2])
        print(cross_tab)

        # Stacked Bar Plot: one bar per `cat1` value, segmented by `cat2` value
        cross_tab.plot.bar(stacked=True, figsize=(10, 6))
        plt.title(f'Stacked Bar Plot of {cat1} by {cat2}')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.show()

## Multivariate Analysis

### Pair Plot with Hue
# Example: Pair plot with a categorical variable as hue
if len(cat_cols) >= 1 and len(num_cols) >= 2:
    hue_col = cat_cols[0]
    # Up to four numerical columns, plus the hue column so seaborn can colour by it.
    pairplot_frame = df[[*num_cols[:4], hue_col]]
    sns.pairplot(pairplot_frame, hue=hue_col)
    plt.suptitle('Pair Plot with Categorical Hue', y=1.02)
    plt.show()

### 3D Scatter Plot (requires more than 2 numerical features)
if len(num_cols) >= 3:
    # The first three numerical columns become the x, y and z axes.
    x_col, y_col, z_col = num_cols[:3]
    # Colour the points by the first categorical column when there is one.
    color_col = cat_cols[0] if len(cat_cols) > 0 else None
    fig = px.scatter_3d(
        df,
        x=x_col,
        y=y_col,
        z=z_col,
        color=color_col,
        title='3D Scatter Plot',
    )
    fig.show()

## Time Series Analysis (Optional)

### Convert to Datetime (if applicable)
# df['date_column'] = pd.to_datetime(df['date_column'])
# df.set_index('date_column', inplace=True)

### Resampling and Aggregation
# Example: Daily mean of a numerical column
# df['numerical_column'].resample('D').mean().plot(figsize=(12, 6))
# plt.title('Daily Mean of Numerical Column')
# plt.show()

### Trend and Seasonality
# from statsmodels.tsa.seasonal import seasonal_decompose
# result = seasonal_decompose(df['numerical_column'].dropna(), model='additive', period=365) # Adjust period as needed
# result.plot()
# plt.show()

### Autocorrelation and Partial Autocorrelation Plots
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# plot_acf(df['numerical_column'].dropna(), lags=50)
# plt.show()
# plot_pacf(df['numerical_column'].dropna(), lags=50)
# plt.show()

## Feature Engineering (Basic)

### Creating New Features
# Example: Combining existing features
# df['new_feature'] = df['feature1'] + df['feature2']

# Example: Extracting information from datetime columns
# df['year'] = df.index.year
# df['month'] = df.index.month
# df['day_of_week'] = df.index.dayofweek

## Summary of Findings

### Key Insights
*   Summarize the main observations from the EDA.
*   Mention any significant patterns, correlations, or anomalies found.
*   Discuss the quality of the data (missing values, outliers, data types).

### Next Steps
*   Suggest potential feature engineering ideas.
*   Propose modeling approaches based on insights.
*   Recommend further data collection or cleaning if necessary.

## Distribution of Numerical Features

In [None]:
# Collect the numerical columns, then draw a 30-bin histogram for each of them.
num_cols = df.select_dtypes(include='number').columns
df.loc[:, num_cols].hist(bins=30, figsize=(15, 10), edgecolor='black')
# Re-space the subplots so titles and tick labels do not collide.
plt.tight_layout()

## Correlation Heatmap

In [None]:
# Heatmap of the pairwise correlations between numerical columns,
# with each cell annotated to two decimals.
plt.figure(figsize=(12, 8))
correlations = df[num_cols].corr()
sns.heatmap(correlations, cmap='coolwarm', annot=True, fmt='.2f')

## Feature Distributions with Seaborn

In [None]:
# Example: Distribution of a specific column
# (here the first numerical column, with a kernel density estimate overlaid).
example_col = num_cols[0]
sns.histplot(df[example_col], kde=True)

## Interactive Visualization with Plotly

In [None]:
# Example: Interactive scatter plot of the first two numerical columns,
# coloured by a third numerical column when one exists.
# The plot needs two numerical columns, so guard the indexing instead of
# failing with an IndexError on narrower datasets.
if len(num_cols) > 1:
    fig = px.scatter(df, x=num_cols[0], y=num_cols[1],
                     color=num_cols[2] if len(num_cols) > 2 else None)
    # Show explicitly: inside an `if` block the figure is no longer the cell's
    # last expression, so the notebook would not display it automatically.
    fig.show()
else:
    print('Interactive scatter plot skipped: it needs at least two numerical columns.')

## Categorical Features Analysis

In [None]:
# Treat object-dtype columns as categorical and print how often each value occurs.
cat_cols = df.select_dtypes(include='object').columns
for col, values in df[cat_cols].items():
    print(f'\nValue counts for {col}:')
    print(values.value_counts())