In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Exploratory data analysis for the student dataset: load the CSV, report
# structure and missing values, derive a binary pass/fail target from the
# 'G3' column, visualize the data, and one-hot encode categorical columns.

# Minimum 'G3' score that counts as a pass (scores >= PASS_MARK are labeled 1).
PASS_MARK = 10

# Set plot style
sns.set(style="whitegrid")

# Load data (path is relative to the notebook's working directory)
df = pd.read_csv('../data/student_data.csv')

# Show first few rows
print(" First 5 rows:")
display(df.head())

# Dataset shape
print(f"\n Dataset shape: {df.shape}")

# Data types and nulls
print("\n Info:")
df.info()

# Null value count
print("\n Missing values:")
print(df.isnull().sum())

# Handle missing values: drop every row that has at least one null.
# To forward-fill instead, use `df = df.ffill()`; the older
# `fillna(method='ffill')` spelling is deprecated in recent pandas releases.
df.dropna(inplace=True)

# Describe numeric columns
print("\n Statistics:")
display(df.describe())

# Correlation heatmap over the numeric columns only. This runs before the
# 'pass_fail' column is created, so the derived target is not in the matrix.
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f")
plt.title(" Correlation Heatmap")
plt.show()

# Create pass/fail label (Target): 1 when G3 >= PASS_MARK, otherwise 0.
# A vectorized comparison replaces the row-by-row lambda; the resulting
# labels are the same.
df['pass_fail'] = (df['G3'] >= PASS_MARK).astype(int)

# Plot target distribution
sns.countplot(x='pass_fail', data=df, palette='Set2')
plt.title(" Pass/Fail Distribution")
plt.xlabel("Target (0 = Fail, 1 = Pass)")
plt.ylabel("Count")
plt.show()

# Class balance ratio (fraction of rows per class)
print("\n Class Balance:")
print(df['pass_fail'].value_counts(normalize=True))

# Visualize numeric feature distributions. The column list is computed after
# the target was added, so 'pass_fail' gets a histogram as well.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
df[numeric_cols].hist(figsize=(15, 10), bins=20, color='skyblue', edgecolor='black')
plt.suptitle(" Feature Distributions", fontsize=16)
plt.show()

# Pairplot for selected features, colored by the pass/fail target
selected_features = ['G1', 'G2', 'absences', 'failures']
sns.pairplot(df, vars=selected_features, hue='pass_fail', palette='husl')
# y=1.02 places the title slightly above the top of the grid of axes.
plt.suptitle(" Pairplot of Selected Features", y=1.02)
plt.show()

# Optional: Encode categorical variables. drop_first=True drops one dummy
# level per encoded column.
# NOTE(review): 'G3' stays in df_encoded next to 'pass_fail', which is derived
# from it; drop 'G3' before modeling if 'pass_fail' is the prediction target.
df_encoded = pd.get_dummies(df, drop_first=True)

# Final dataset shape
print(f"\n Final dataset shape after encoding: {df_encoded.shape}")

ModuleNotFoundError: No module named 'pandas'