In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV into a DataFrame (the path is relative to the working directory)
data = pd.read_csv('INDIAvi.csv')

# Preview the top of the table as a quick sanity check on the parse
print("First 5 Rows of Data:")
display(data.head(n=5))

# Step 1: Handle Missing Values
# Count the nulls in every column and show only the columns that have any
print("\nMissing Values per Column:")
missing_values = data.isnull().sum()
display(missing_values[missing_values > 0])

# Impute column by column, assigning the result back instead of using inplace=True:
#   - numeric columns are filled with the column mean
#   - all other columns (text / categorical) are filled with their most frequent value
# The numeric test uses pandas' dtype helper rather than `dtype == 'object'`, so text
# columns stored with a dedicated string dtype are not routed to mean(), which
# cannot be computed for them.
# Only columns that actually contain nulls are visited; the rest would be unchanged anyway.
for column in missing_values[missing_values > 0].index:
    if pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].fillna(data[column].mean())
        continue

    most_frequent = data[column].mode()
    if most_frequent.empty:
        # The column has no non-null value to impute from; leave it as it is
        # instead of failing on an empty mode() result.
        continue
    data[column] = data[column].fillna(most_frequent.iloc[0])

# Step 2: Summary Statistics
# include='all' makes describe() report on non-numeric columns as well as numeric ones
print("\nSummary Statistics:")
summary_stats = data.describe(include='all')
display(summary_stats)

# info() prints its report itself, so no display() wrapper is needed
print("\nDataset Info:")
data.info()

# Step 3: Univariate Analysis

# Numerical Variables: one 50-bin histogram with a KDE overlay per numeric column
num_columns = data.select_dtypes(include=[np.number]).columns
for numeric_name in num_columns:
    fig, axis = plt.subplots(figsize=(12, 6))
    sns.histplot(data[numeric_name], bins=50, kde=True, color='skyblue', ax=axis)
    # Set the title and both axis labels explicitly on the axes just drawn
    axis.set(
        title=f'Histogram and KDE for {numeric_name} (Detailed)',
        xlabel=numeric_name,
        ylabel='Frequency',
    )
    plt.show()

# Categorical Variables: Count Plots with all Values Visible
cat_columns = data.select_dtypes(include=['object']).columns
for column in cat_columns:
    # Bars are ordered from the most to the least frequent category
    category_order = data[column].value_counts().index
    plt.figure(figsize=(12, 6))
    # Pass the column explicitly as `x`. In current seaborn the first positional
    # parameter of countplot is `data`, so a bare Series in that position is not
    # read as the variable to count.
    # NOTE(review): seaborn >= 0.13 warns about `palette` without `hue`; the call is
    # left in this form so it stays valid on older releases -- confirm the target version.
    sns.countplot(x=data[column], palette="viridis", order=category_order)
    plt.title(f'Count Plot for {column} (Detailed)')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.xticks(rotation=45)  # slant the labels so long category names do not overlap
    plt.show()

# Step 4: Bivariate Analysis

# Pairplot for Detailed Relationships
# Scatter plots for every pair of numeric columns, with a KDE of each column on the diagonal.
# Skipped when the dataset has no numeric columns, because pairplot raises when it
# is given no variables to plot.
if len(num_columns) > 0:
    # alpha=0.5 and s=10 draw small, semi-transparent markers to reduce overplotting
    sns.pairplot(data[num_columns], diag_kind="kde", plot_kws={'alpha': 0.5, 's': 10})
    # y just above 1 places the figure title above the top row of the grid
    plt.suptitle("Pairplot to Explore Relationships (Detailed)", y=1.02)
    plt.show()

# Box Plots with all Values in Categorical vs Numerical Variables
# One figure per (categorical, numerical) column pair; the individual observations
# are drawn as jittered points on top of each box.
for cat_name in cat_columns:
    for num_name in num_columns:
        groups = data[cat_name]
        values = data[num_name]
        fig, axis = plt.subplots(figsize=(12, 6))
        sns.boxplot(x=groups, y=values, palette="Set3", width=0.6, ax=axis)
        sns.stripplot(x=groups, y=values, color='black', size=2, jitter=True, alpha=0.5, ax=axis)
        axis.set_title(f'Box Plot of {num_name} by {cat_name} (Detailed)')
        axis.set_xlabel(cat_name)
        axis.set_ylabel(num_name)
        plt.xticks(rotation=45)
        plt.show()

# Step 5: Correlation Analysis

# Detailed Correlation Heatmap
# Pairwise correlations between the numeric columns, annotated cell by cell.
# Skipped when the dataset has no numeric columns: the correlation matrix would be
# empty and there would be nothing for the heatmap to draw.
if len(num_columns) > 0:
    plt.figure(figsize=(14, 10))
    correlation_matrix = data[num_columns].corr()
    # Small annotation font so the numbers fit inside the square cells
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True, annot_kws={"size": 8})
    plt.title("Correlation Matrix of Numerical Variables (Detailed)")
    plt.show()

# Step 6: Advanced Visualizations

# Violin Plot with Detailed View for Categorical vs Numerical
# One figure per (categorical, numerical) column pair: a violin with quartile lines
# inside, plus every observation overlaid as a small jittered point.
for cat_name in cat_columns:
    category_values = data[cat_name]
    for num_name in num_columns:
        numeric_values = data[num_name]
        plt.figure(figsize=(12, 6))
        # Reuse the axes returned by violinplot so the points land on the same plot
        axis = sns.violinplot(x=category_values, y=numeric_values, inner="quartile", palette="pastel")
        sns.stripplot(
            x=category_values,
            y=numeric_values,
            color='black',
            size=2,
            jitter=True,
            alpha=0.5,
            ax=axis,
        )
        axis.set(
            title=f'Violin Plot of {num_name} by {cat_name} (Detailed)',
            xlabel=cat_name,
            ylabel=num_name,
        )
        plt.xticks(rotation=45)
        plt.show()

# Faceted Histogram for Detailed View
# Stacked 50-bin histogram of the first numeric column, coloured by the first
# categorical column; drawn only when the dataset has at least one column of each kind.
if len(num_columns) > 0 and len(cat_columns) > 0:
    hist_variable = num_columns[0]
    hue_variable = cat_columns[0]
    fig, axis = plt.subplots(figsize=(12, 6))
    sns.histplot(
        data=data,
        x=hist_variable,
        hue=hue_variable,
        multiple="stack",
        bins=50,
        palette="viridis",
        ax=axis,
    )
    axis.set_title(f'Faceted Histogram of {hist_variable} by {hue_variable} (Detailed)')
    axis.set_xlabel(hist_variable)
    axis.set_ylabel("Count")
    plt.show()

# Hypothesis Ideas:
# - Identify which numerical variables have strong correlations that might imply relationships worth exploring.
# - Investigate any apparent clusters or outliers in categorical vs numerical distributions.
# - Develop hypotheses around category distributions, mean/median differences, or variance across groups.

