# Exploratory Data Analysis (EDA)

## 1. Introduction
This notebook analyzes the student dropout dataset.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


## 2. Load Dataset


In [None]:
# Read the CSV (path is relative to the notebook's working directory)
# and preview the first rows as the cell output.
data_path = '../data/student_dropout_1000.csv'
df = pd.read_csv(data_path)
df.head()


## 3. Data Overview
Dataset shape, column info (dtypes and non-null counts), and the number of duplicate rows.


In [None]:
# Report the (rows, columns) shape first.
frame_shape = df.shape
print(f'Shape: {frame_shape}')

# Column names, dtypes and non-null counts (info() prints directly).
df.info()

# Count rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print(f'Duplicates: {duplicate_count}')


## 4. Null Value Report


In [None]:
# Per-column count of missing values (isna is the canonical alias of isnull).
df.isna().sum()


## 5. Statistical Summary


In [None]:
# Descriptive statistics for the DataFrame; as the cell's last expression
# the resulting table is rendered as the cell output.
df.describe()


## 6. Correlation Heatmap


In [None]:
# Correlate only the numeric columns, then render the matrix as a heatmap.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


## 7. Distribution Analysis


In [None]:
# Draw a grid of histograms, one per column that DataFrame.hist can plot.
hist_options = {'bins': 20, 'figsize': (15, 15)}
df.hist(**hist_options)

# Avoid overlapping subplot titles/labels before rendering.
plt.tight_layout()
plt.show()


## 8. Statistical Hypothesis Tests

### T-Test
Tests whether the mean of a numeric feature differs significantly between students who dropped out and those who did not (independent two-sample t-test).


In [None]:

# Use the 'Target' column when it exists; otherwise fall back to the last
# column of the DataFrame.
target = 'Target' if 'Target' in df.columns else df.columns[-1]

# All numeric columns, kept under the original name for any later cells.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Candidate features for the t-test. The target is excluded so that a
# numerically encoded target is never tested against itself.
numeric_feature_cols = [col for col in numeric_cols if col != target]

if numeric_feature_cols:
    col_to_test = numeric_feature_cols[0]

    # Drop missing values: a single NaN would make ttest_ind return NaN.
    group1 = df[df[target] == 'Dropout'][col_to_test].dropna()
    group2 = df[df[target] != 'Dropout'][col_to_test].dropna()  # Graduate/Enrolled

    # Each group needs at least two observations for a variance estimate.
    # An empty 'Dropout' group usually means the target labels are encoded
    # differently (e.g. 0/1), so report it instead of skipping silently.
    if len(group1) > 1 and len(group2) > 1:
        t_stat, p_val = stats.ttest_ind(group1, group2)
        print(f"T-Test for {col_to_test}: Stat={t_stat}, P-value={p_val}")
    else:
        print(
            f"T-Test skipped for {col_to_test}: need at least 2 non-null values "
            f"per group (Dropout={len(group1)}, other={len(group2)}). "
            f"Check that '{target}' contains the label 'Dropout'."
        )
else:
    print("T-Test skipped: no numeric feature columns found.")



### Chi-Square Test
Tests whether a categorical feature is independent of the target (chi-square test on their contingency table).


In [None]:

# Categorical (object-dtype) columns. `target` is defined in the t-test cell
# above, so that cell must be run first.
cat_cols = df.select_dtypes(include=['object']).columns

# Exclude the target explicitly rather than assuming it is one of the object
# columns: with a non-object target, a single categorical feature is enough.
cat_feature_cols = [col for col in cat_cols if col != target]

if cat_feature_cols:
    col_test = cat_feature_cols[0]
    contingency_table = pd.crosstab(df[col_test], df[target])

    # chi2_contingency raises on an empty table (e.g. all values missing).
    if contingency_table.size > 0:
        chi2, p, dof, ex = stats.chi2_contingency(contingency_table)
        print(f"Chi-Square Test for {col_test}: Stat={chi2}, P-value={p}")
    else:
        print(f"Chi-Square Test skipped for {col_test}: contingency table is empty.")
else:
    print("Chi-Square Test skipped: no categorical feature columns found.")

