# In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV into the working DataFrame used by every later step
data = pd.read_csv('INDIAvi.csv')

# Preview the leading rows as a quick sanity check on the load
first_rows = data.head()
print("First 5 Rows of Data:")
display(first_rows)

# Step 1: Handle Missing Values
# Count the missing entries in each column and show only the affected columns
print("\nMissing Values per Column:")
missing_values = data.isnull().sum()
display(missing_values[missing_values > 0])

# Basic imputation: numeric columns are filled with their mean, every other
# column with its most frequent value. Adjust this as per dataset requirements.
# Only columns that actually contain missing entries need to be visited.
for column in missing_values[missing_values > 0].index:
    observed = data[column].dropna()
    if observed.empty:
        # No observed value to derive a fill from (the mean would be NaN and
        # mode() would be empty), so leave the column as-is instead of raising.
        print(f"Skipping '{column}': no observed values to impute from.")
        continue

    if pd.api.types.is_numeric_dtype(data[column]):
        fill_value = observed.mean()
    else:
        fill_value = observed.mode().iloc[0]

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on data[column]: the in-place form operates on a temporary selection,
    # which pandas warns about and which no longer updates `data` when
    # Copy-on-Write is enabled.
    data[column] = data[column].fillna(fill_value)

# Step 2: Summary Statistics
# include='all' makes describe() cover every column, numerical and categorical
summary_table = data.describe(include='all')
print("\nSummary Statistics:")
display(summary_table)

# info() prints its own report, so it is called directly rather than displayed
print("\nDataset Info:")
data.info()

# Step 3: Univariate Analysis

# Numerical Variables: one histogram with a KDE overlay per numeric column
num_columns = data.select_dtypes(include=[np.number]).columns
for numeric_name in num_columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data[numeric_name], kde=True, color='skyblue', ax=ax)
    ax.set_title(f'Histogram and KDE for {numeric_name}')
    ax.set_xlabel(numeric_name)
    ax.set_ylabel('Frequency')
    plt.show()

# Categorical Variables: Count Plots
# One bar chart of category frequencies for every object-dtype column.
cat_columns = data.select_dtypes(include=['object']).columns
for column in cat_columns:
    plt.figure(figsize=(10, 5))
    # Name the column through `x=` explicitly: the first positional parameter
    # of countplot is `data` in current seaborn, so a Series passed
    # positionally is not treated as the variable to count.
    sns.countplot(data=data, x=column, palette="viridis")
    plt.title(f'Count Plot for {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    # Category labels can be long; tilt them so they do not overlap
    plt.xticks(rotation=45)
    plt.show()

# Step 4: Bivariate Analysis

# Pairwise scatter plots over the numerical columns, with KDE curves on the diagonal
numeric_frame = data[num_columns]
sns.pairplot(numeric_frame, diag_kind="kde")
# y=1.02 places the figure-level title just above the top of the grid
plt.suptitle("Pairplot to Explore Relationships", y=1.02)
plt.show()

# Box Plots to Explore Relationships Between Categorical and Numerical Variables
# One figure is drawn for every (categorical, numerical) column pairing.
column_pairs = [
    (cat_name, num_name)
    for cat_name in cat_columns
    for num_name in num_columns
]
for cat_name, num_name in column_pairs:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=data, x=cat_name, y=num_name, palette="Set3", ax=ax)
    ax.set_title(f'Box Plot of {num_name} by {cat_name}')
    ax.set_xlabel(cat_name)
    ax.set_ylabel(num_name)
    plt.xticks(rotation=45)
    plt.show()

# Step 5: Correlation Analysis

# Pairwise correlations between the numerical columns, drawn as an annotated heatmap
correlation_matrix = data[num_columns].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True, ax=ax)
ax.set_title("Correlation Matrix of Numerical Variables")
plt.show()

# Step 6: Advanced Visualizations

# Violin Plot for Categorical vs Numerical
# Each categorical column is paired with each numerical column in its own figure.
for cat_name in cat_columns:
    groups = data[cat_name]
    for num_name in num_columns:
        values = data[num_name]
        plt.figure(figsize=(10, 5))
        ax = sns.violinplot(x=groups, y=values, inner="quartile", palette="pastel")
        ax.set_title(f'Violin Plot of {num_name} by {cat_name}')
        ax.set_xlabel(cat_name)
        ax.set_ylabel(num_name)
        plt.xticks(rotation=45)
        plt.show()

# Stacked histogram of the first numerical column, split by the first categorical column.
# Skipped when the dataset lacks either kind of column.
if not cat_columns.empty and not num_columns.empty:
    first_num, first_cat = num_columns[0], cat_columns[0]
    plt.figure(figsize=(10, 5))
    ax = sns.histplot(data=data, x=first_num, hue=first_cat, multiple="stack", palette="viridis")
    ax.set_title(f'Faceted Histogram of {first_num} by {first_cat}')
    ax.set_xlabel(first_num)
    ax.set_ylabel("Count")
    plt.show()

# Hypothesis Ideas:
# - Identify which numerical variables have strong correlations that might imply relationships worth exploring.
# - Investigate any apparent clusters or outliers in categorical vs numerical distributions.
# - Develop hypotheses around category distributions, mean/median differences, or variance across groups.


# Notebook output: SyntaxError: unterminated string literal (detected at line 8) (3641966914.py, line 8)