In [5]:
# === 1. Load Raw Data ===
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/raw/student-data-raw.csv')

# === 2. Basic Inspection (Raw Data) ===
# Each entry pairs the label to print with the summary it introduces; the
# summaries are: dimensions, per-column dtypes, per-column null counts and
# the number of fully duplicated rows.
inspection_items = (
    ("Shape:", df.shape),
    ("\nData types:\n", df.dtypes),
    ("\nMissing values:\n", df.isnull().sum()),
    ("\nDuplicates:", df.duplicated().sum()),
)
for heading, summary in inspection_items:
    print(heading, summary)

# === 3. Class Balance (Target Variable) ===
# Count plot of the 'passed' column with the bars fixed in 'no', 'yes' order,
# written to disk instead of being shown.
target_fig, target_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='passed', order=['no', 'yes'], ax=target_ax)
target_ax.set_title('Class Distribution (Raw Data)')
target_fig.savefig('../docs/raw_target_dist.png')
plt.close(target_fig)  # Prevents display

# === 4. Numeric Features Analysis ===
# `numeric_cols` is reused by the correlation section further down.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_summary = df[numeric_cols].describe()
print("\nDescriptive Statistics (Numeric Features):\n", numeric_summary)

for feature in numeric_cols:
    values = df[feature]

    # Histogram with a KDE overlay
    hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
    sns.histplot(values, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {feature} (Raw Data)')
    hist_fig.savefig(f'../docs/raw_{feature}_hist.png')
    plt.close(hist_fig)

    # Boxplot (to identify outliers)
    box_fig, box_ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f'Boxplot of {feature} (Raw Data)')
    box_fig.savefig(f'../docs/raw_{feature}_boxplot.png')
    plt.close(box_fig)

# === 5. Categorical Features Analysis ===
# Every object-typed column except the target 'passed' gets its own count plot.
object_cols = df.select_dtypes(include=['object']).columns
categorical_cols = object_cols.drop('passed')

for feature in categorical_cols:
    cat_fig, cat_ax = plt.subplots(figsize=(10, 4))
    # value_counts() is ordered most-frequent first, so its index gives the bar order
    sns.countplot(
        data=df,
        x=feature,
        order=df[feature].value_counts().index,
        ax=cat_ax,
    )
    cat_ax.set_title(f'Distribution of {feature} (Sorted by Frequency)')
    plt.xticks(rotation=45)  # acts on cat_ax, which is the current axes here
    cat_fig.savefig(f'../docs/raw_{feature}_countplot.png')
    plt.close(cat_fig)

# === 6. Correlation Analysis (Numeric Features Only) ===
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap
# whose colour scale is centred on zero.
corr_matrix = df[numeric_cols].corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation Matrix (Raw Data)')
corr_fig.savefig('../docs/raw_correlation_matrix.png')
plt.close(corr_fig)

# === 7. Generate EDA Report (Improved) ===
# Every finding below is computed from `df` rather than typed in by hand, so
# the report cannot drift out of sync with the data it describes.
# Relies on `df`, `numeric_cols` and `categorical_cols` from the sections above.

IQR_FENCE = 1.5           # Tukey's rule: points beyond 1.5 * IQR from the quartiles
HIGH_CARDINALITY_MIN = 5  # a categorical with at least this many levels is flagged
RARE_SHARE = 0.10         # a level covering less than this share of rows is "rare"


def _quoted(names):
    """Return *names* as a comma-separated list of quoted labels, or 'none'."""
    return ', '.join(repr(name) for name in names) or 'none'


# Missing values, per column and in total.
missing_per_col = df.isnull().sum()
total_missing = missing_per_col.sum()

# Outliers: numeric columns with at least one value outside the IQR fences.
numeric_df = df[numeric_cols]
q1 = numeric_df.quantile(0.25)
q3 = numeric_df.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - IQR_FENCE * iqr
upper_fence = q3 + IQR_FENCE * iqr
is_outlier = (
    numeric_df.lt(lower_fence, axis='columns')
    | numeric_df.gt(upper_fence, axis='columns')
)
outlier_counts = is_outlier.sum()
outlier_cols = outlier_counts[outlier_counts > 0].index.tolist()

# High-cardinality categoricals and the rare levels inside each of them.
level_counts = df[categorical_cols].nunique()
high_card_cols = level_counts[level_counts >= HIGH_CARDINALITY_MIN].index.tolist()
rare_levels = {}
for cat_name in high_card_cols:
    shares = df[cat_name].value_counts(normalize=True)
    rare = shares[shares < RARE_SHARE].index.tolist()
    if rare:
        rare_levels[cat_name] = rare

print("\n=== EDA FINDINGS (RAW DATA) ===")
print(f"1. Dataset shape: {df.shape}")
print(f"2. Missing values: {total_missing} (None expected in this dataset)")
if total_missing:
    # Surface the surprise instead of letting the "none expected" note hide it.
    affected = missing_per_col[missing_per_col > 0].index.tolist()
    print(f"   WARNING: missing values found in: {_quoted(affected)}")
print(f"3. Duplicates: {df.duplicated().sum()}")
print(f"4. Class balance (passed):\n{df['passed'].value_counts(normalize=True)}")
print(f"5. Numeric features with outliers (check boxplots): {_quoted(outlier_cols)}")
print(f"6. High-cardinality categoricals (check countplots): {_quoted(high_card_cols)}")

# Optional: Suggest grouping for preprocessing
suggestions = [
    f"Consider grouping rare categories in '{cat_name}' "
    f"(under {RARE_SHARE:.0%} of rows: {_quoted(levels)})."
    for cat_name, levels in rare_levels.items()
]
if 'absences' in upper_fence.index:
    absences_max = df['absences'].max()
    # Only suggest clipping when the maximum really lies above the upper fence.
    if absences_max > upper_fence['absences']:
        suggestions.append(
            f"Clip 'absences' (max={absences_max} is an outlier; "
            f"75th percentile={q3['absences']:g})."
        )

print("\n=== PREPROCESSING SUGGESTIONS ===")
if suggestions:
    for position, suggestion in enumerate(suggestions, start=1):
        print(f"{position}. {suggestion}")
else:
    print("None.")

Shape: (395, 31)

Data types:
 school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
passed        object
dtype: object

Missing values:
 school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          