In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw students dataset (CSV).
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = r"C:/Users/DELL/Downloads/ethiopian_students_dataset.csv"

# Read the whole CSV into the working DataFrame
df = pd.read_csv(DATA_PATH)

# Preview the first 5 rows
print(df.head())

# List every column name in the dataset
print(df.columns)

In [None]:
# 1. Check the basic structure: row/column counts and in-memory size
print("="*60)
print("DATA STRUCTURE ANALYSIS")
print("="*60)

print(f"Dataset Shape: {df.shape}")
print(f"Number of students: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
# deep=True also counts the contents of object (string) columns
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. View column categories
print("\n" + "="*60)
print("COLUMN CATEGORIES")
print("="*60)

# Group columns by the keyword that appears in their name
test_score_cols = [col for col in df.columns if 'Test_Score' in col]
attendance_cols = [col for col in df.columns if 'Attendance' in col]
homework_cols = [col for col in df.columns if 'Homework' in col]
participation_cols = [col for col in df.columns if 'Participation' in col]
textbook_cols = [col for col in df.columns if 'Textbook' in col]

print(f"Test Score columns: {len(test_score_cols)}")
print(f"Attendance columns: {len(attendance_cols)}")
print(f"Homework columns: {len(homework_cols)}")
print(f"Participation columns: {len(participation_cols)}")
print(f"Textbook columns: {len(textbook_cols)}")

# 3. Check subject coverage by grade
print("\n" + "="*60)
print("SUBJECT COVERAGE BY GRADE")
print("="*60)

for grade in range(1, 13):
    # The trailing underscore keeps 'Grade_1_' from matching 'Grade_10_...' etc.
    grade_prefix = f'Grade_{grade}_'
    grade_test_cols = [col for col in test_score_cols if grade_prefix in col]
    if grade_test_cols:
        # Subject = text between the grade prefix and the '_Test' suffix.
        # Sorted so the printed list is identical on every run (plain set
        # iteration order for strings can differ between kernel sessions).
        subjects = sorted({
            col.split(grade_prefix)[1].split('_Test')[0]
            for col in grade_test_cols
        })
        print(f"Grade {grade}: {len(grade_test_cols)} subjects - {subjects}")

# Everything that did not fall into one of the five keyword categories above
all_categorized_cols = set(test_score_cols + attendance_cols + homework_cols + participation_cols + textbook_cols)
remaining_cols = [col for col in df.columns if col not in all_categorized_cols]

print("\n" + "="*60)
print("REMAINING COLUMNS")
print("="*60)
print(f"Number of remaining columns: {len(remaining_cols)}")
print("Remaining columns (first 20):\n", remaining_cols[:20])
print("Remaining columns (last 20):\n", remaining_cols[-20:])

In [None]:
# Boolean mask of missing cells, computed once and reused for both reports
missing_mask = df.isnull()

# Absolute number of missing values per column
print("==============================")
print("CHECKING FOR MISSING DATA")
print("==============================")
print(missing_mask.sum())

# Share of missing values per column, expressed as a percentage
print("================================")
print("Percentage of missing data:")
print("--------------------------------")
print(missing_mask.mean() * 100)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Count how many columns share each dtype, as a two-column frame
dtype_counts = (
    df.dtypes
    .value_counts()
    .rename_axis('Data Type')
    .reset_index(name='Count')
)

# 2. Bar chart of the counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))

# 'Data Type' is passed as hue (with the legend hidden) so that a palette
# can be applied per bar
sns.barplot(
    data=dtype_counts,
    x='Data Type',
    y='Count',
    hue='Data Type',
    palette='viridis',
    legend=False,
    ax=ax,
)

ax.set_title('Distribution of Data Types in Student Dataset', fontsize=14)
ax.set_ylabel('Number of Columns')
ax.set_xlabel('Data Type')

# 3. Write each count just above its bar (5 units of vertical padding)
for bar_position, bar_height in enumerate(dtype_counts['Count']):
    ax.text(bar_position, bar_height + 5, str(bar_height), ha='center', fontweight='bold')

plt.show()

In [None]:
# Drop the row identifier (never used as an ML feature);
# errors='ignore' keeps this safe when the column is already gone.
df = df.drop(columns=['Student_ID'], errors='ignore')

# Encode Field_Choice as 0 (Social) / 1 (Natural).
# Series.map turns every value that is not a key of the dict into NaN, so
# mapping a column that is already encoded (e.g. when this cell is re-run)
# would wipe it out. Only encode while the column still holds the raw labels.
field_choice_codes = {
    'Social': 0,
    'Natural': 1
}
if not pd.api.types.is_numeric_dtype(df['Field_Choice']):
    df['Field_Choice'] = df['Field_Choice'].map(field_choice_codes)

# HANDLE CAREER_INTEREST
# Missing values become their own "Unknown" category
df['Career_Interest'] = df['Career_Interest'].fillna('Unknown')

In [None]:
# AGGREGATE GRADES INTO EDUCATION STAGES
# Each stage is a list of 'Grade_<n>' column-name prefixes
lower_primary = [f'Grade_{n}' for n in range(1, 5)]     # Grades 1-4
upper_primary = [f'Grade_{n}' for n in range(5, 9)]     # Grades 5-8
secondary = [f'Grade_{n}' for n in range(9, 11)]        # Grades 9-10
preparatory = [f'Grade_{n}' for n in range(11, 13)]     # Grades 11-12

# Helper function to compute average test score
def stage_average(df, grades):
    """Return each row's mean test score over the given grades.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing columns named ``<grade prefix>_..._Test_Score``.
    grades : iterable of str
        Grade prefixes such as ``'Grade_1'`` (a trailing underscore is
        accepted but not required).

    Returns
    -------
    pandas.Series
        Row-wise mean of every matching test-score column, indexed like
        ``df``.
    """
    cols = []
    for g in grades:
        # Match on the prefix *including* its trailing underscore; a bare
        # 'Grade_1' would otherwise also select Grade_10/11/12 columns.
        prefix = g if g.endswith('_') else g + '_'
        cols += [
            c for c in df.columns
            if c.startswith(prefix) and c.endswith('_Test_Score')
        ]
    return df[cols].mean(axis=1)

# Create aggregated academic scores: one new column per education stage
stage_score_groups = {
    'Avg_Score_Lower_Primary': lower_primary,
    'Avg_Score_Upper_Primary': upper_primary,
    'Avg_Score_Secondary': secondary,
    'Avg_Score_Preparatory': preparatory,
}
for score_col, stage_grades in stage_score_groups.items():
    df[score_col] = stage_average(df, stage_grades)

# Names of the newly created columns, in creation order.
# (The variable keeps its existing name even though it holds score columns.)
textbook_summary_cols = list(stage_score_groups)

# Show the first 5 rows of the new columns only
print("Aggeregated average Scores(1-4,5-8,9&10 and 11&12) per Education Level (Head):")
print(df[textbook_summary_cols].head())

import seaborn as sns
import matplotlib.pyplot as plt

# Box plot comparing the distribution of the four stage averages
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[textbook_summary_cols], ax=ax)
ax.set_title('Aggeregated average Distribution by Education Level')
ax.set_ylabel('Aggeregated average score')
plt.xticks(rotation=15)
plt.show()

In [None]:
# CREATE ENGAGEMENT SCORES
# Helper function for engagement
def engagement_score(df, grades):
    """Return each row's mean over attendance, homework and participation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose engagement columns start with a grade prefix and contain
        'Attendance', 'Homework' or 'Participation' in their name.
    grades : iterable of str
        Grade prefixes such as ``'Grade_1'`` (a trailing underscore is
        accepted but not required).

    Returns
    -------
    pandas.Series
        Row-wise mean of all matching columns, indexed like ``df``.
    """
    # Match on the prefix *including* its trailing underscore; a bare
    # 'Grade_1' would otherwise also select Grade_10/11/12 columns.
    prefixes = [g if g.endswith('_') else g + '_' for g in grades]

    def columns_with(keyword):
        # Columns of the requested grades whose name contains `keyword`
        return [
            c for prefix in prefixes for c in df.columns
            if c.startswith(prefix) and keyword in c
        ]

    attendance = columns_with('Attendance')
    homework = columns_with('Homework')
    participation = columns_with('Participation')

    return df[attendance + homework + participation].mean(axis=1)

# Create engagement features: one new column per education stage
engagement_stage_groups = {
    'Engagement_1_4': lower_primary,
    'Engagement_5_8': upper_primary,
    'Engagement_9_10': secondary,
    'Engagement_11_12': preparatory,
}
for engagement_col, stage_grades in engagement_stage_groups.items():
    df[engagement_col] = engagement_score(df, stage_grades)

# Names of the four per-stage engagement columns, in creation order
engagement_cols = list(engagement_stage_groups)

# The string literal below is not executed: it holds a disabled variant that
# would combine the four columns into a single 'Engagement_All' column.
"""
# Create overall engagement column
df['Engagement_All'] = df[engagement_cols].mean(axis=1)

# Check the first 5 rows
print("Overall Engagement Scores (Head):")
print(df[['Engagement_All']].head())

# Optional: Visualize the overall engagement
plt.figure(figsize=(8, 5))
sns.histplot(df['Engagement_All'], kde=True, bins=20)
plt.title('Overall Engagement Score Distribution')
plt.xlabel('Engagement Score (0 to 1)')
plt.ylabel('Count')
plt.show()
"""
print("Engagement(attendance,homework,participation) Scores per Education Level (Head):")
print(df[engagement_cols].head())

# Box plot comparing the distribution of the four engagement columns
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[engagement_cols], ax=ax)
ax.set_title('Engagement(attendance,homework,participation) Distribution by Education Level')
ax.set_ylabel('Average of enggegment')
plt.xticks(rotation=15)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --------------------------------------------------
# 1. Pandas future-proof setting
# --------------------------------------------------
# Enable the 'future.no_silent_downcasting' option; the explicit
# infer_objects call below then decides the resulting dtype.
pd.set_option('future.no_silent_downcasting', True)

# --------------------------------------------------
# 2. Convert Yes/No -> 1/0 safely
# --------------------------------------------------
yes_no_codes = {'Yes': 1, 'No': 0}
textbook_cols = [name for name in df.columns if 'Textbook' in name]

for textbook_col in textbook_cols:
    # replace() leaves any value other than 'Yes'/'No' untouched
    recoded = df[textbook_col].replace(yes_no_codes)
    df[textbook_col] = recoded.infer_objects(copy=False)

# --------------------------------------------------
# 3. Textbook access function
# --------------------------------------------------
def textbook_access(df, grade_prefixes):
    """Return each row's mean textbook-access value over the given grades.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose textbook columns start with a grade prefix and contain
        'Textbook' in their name.
    grade_prefixes : iterable of str
        Grade prefixes such as ``'Grade_1'`` (a trailing underscore is
        accepted but not required).

    Returns
    -------
    pandas.Series
        Row-wise mean of the matching columns, or a Series of zeros indexed
        like ``df`` when no column matches.
    """
    cols = []
    for g in grade_prefixes:
        # Match on the prefix *including* its trailing underscore; a bare
        # 'Grade_1' would otherwise also select Grade_10/11/12 columns.
        prefix = g if g.endswith('_') else g + '_'
        cols.extend(c for c in df.columns if c.startswith(prefix) and 'Textbook' in c)
    if not cols:
        return pd.Series(0, index=df.index)
    return df[cols].mean(axis=1)

# --------------------------------------------------
# 4. Create summary columns (built together, attached with a single concat)
# --------------------------------------------------
textbook_stage_groups = {
    'Textbook_Access_1_4': lower_primary,
    'Textbook_Access_5_8': upper_primary,
    'Textbook_Access_9_10': secondary,
    'Textbook_Access_11_12': preparatory,
}
new_cols_df = pd.DataFrame({
    summary_name: textbook_access(df, stage_grades)
    for summary_name, stage_grades in textbook_stage_groups.items()
})

df = pd.concat([df, new_cols_df], axis=1)

# --------------------------------------------------
# 5. Remove duplicate columns: keep only the first occurrence of each name
# --------------------------------------------------
repeated_name = df.columns.duplicated()
df = df.loc[:, ~repeated_name]

# --------------------------------------------------
# 6. Keep the summary columns that exist and hold at least one non-null value
# --------------------------------------------------
textbook_summary_cols = []
for summary_name in new_cols_df.columns:
    if summary_name in df.columns and df[summary_name].notna().any():
        textbook_summary_cols.append(summary_name)

print(df[textbook_summary_cols].head().to_string(index=False))

# --------------------------------------------------
# 7. Visualization: box plot of the per-stage access scores
# --------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[textbook_summary_cols], ax=ax)
ax.set_title('Textbook Access Distribution by Education Level', fontsize=14, fontweight='bold')
ax.set_ylabel('Access Score (0 to 1)')
plt.xticks(rotation=15)
ax.grid(alpha=0.3)
plt.show()

In [None]:
# HANDLE TRACK-BASED NATIONAL EXAMS (CRITICAL PART)
# Subjects examined only in the Social Science track
social_subjects = [
    'National_Exam_History',
    'National_Exam_Geography',
    'National_Exam_Economics',
    'National_Exam_Math_Social'
]

# Subjects examined only in the Natural Science track
natural_subjects = [
    'National_Exam_Biology',
    'National_Exam_Chemistry',
    'National_Exam_Physics',
    'National_Exam_Math_Natural'
]

# Row-wise mean of each track's subject scores
df['Social_Track_Subject_Avg'] = df[social_subjects].mean(axis=1)
df['Natural_Track_Subject_Avg'] = df[natural_subjects].mean(axis=1)

# Pick the average that belongs to the student's own track
# (Field_Choice: 0 = Social, 1 = Natural). Rows whose Field_Choice is missing
# or holds any other value get NaN instead of silently falling back to the
# Natural-track average.
df['Track_Subject_Average'] = np.select(
    [df['Field_Choice'] == 0, df['Field_Choice'] == 1],
    [df['Social_Track_Subject_Avg'], df['Natural_Track_Subject_Avg']],
    default=np.nan
)

# Subjects used for the common average regardless of track
common_subjects = [
    'National_Exam_Aptitude',
    'National_Exam_English',
    'National_Exam_Civics_and_Ethical_Education'
]

df['Common_Exam_Average'] = df[common_subjects].mean(axis=1)

# Overall exam average: common part and track part weighted equally
df['Track_Exam_Average'] = (
    df['Common_Exam_Average'] + df['Track_Subject_Average']
) / 2

# --- VISUALIZATION ---

# Set style
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Visualization 1: Comparison of Common vs. Track Performance
sns.boxplot(data=df[['Common_Exam_Average', 'Track_Subject_Average', 'Track_Exam_Average']],
            ax=axes[0], palette="Set2")
axes[0].set_title('Distribution of Aggregate Exam Scores')
axes[0].set_ylabel('Score (0-100)')

# Visualization 2: Final Performance by Field Choice (Density)
for choice, label in [(0, 'Social Science'), (1, 'Natural Science')]:
    subset = df[df['Field_Choice'] == choice]
    sns.kdeplot(subset['Track_Exam_Average'], ax=axes[1], label=label, fill=True)

axes[1].set_title('Track Exam Average: Social vs. Natural')
axes[1].set_xlabel('Score')
axes[1].legend()

plt.tight_layout()
plt.show()

# 4. Show the head of the newly created exam features
exam_cols = [
    'Social_Track_Subject_Avg',
    'Natural_Track_Subject_Avg',
    'Track_Subject_Average',
    'Common_Exam_Average',
    'Track_Exam_Average'
]

print("New Aggregated National Exam Features:")
print(df[exam_cols].head())

In [None]:
# DROP ORIGINAL HIGH-DIMENSION COLUMNS
# Per-grade and per-subject raw columns, selected by name prefix
drop_cols = [c for c in df.columns if c.startswith('Grade_')]
drop_cols += [c for c in df.columns if c.startswith('National_Exam_')]

df = df.drop(columns=drop_cols)
# -------------------------------
# 0. Drop leaking exam average columns
# -------------------------------
# Note: 'Total_National_Exam_Score' does not start with 'National_Exam_', so
# the prefix-based drop above does not remove it; it has to be listed here by
# its exact name.
leak_cols = [
    'Social_Track_Subject_Avg',
    'Natural_Track_Subject_Avg',
    'Track_Exam_Average',
    'Track_Subject_Average',
    'Common_Exam_Average',
    'Avg_Score_Secondary',
    'Avg_Score_Preparatory',
    'Avg_Score_Lower_Primary',
    'Avg_Score_Upper_Primary',
    'School_ID', 'Total_Test_Score', 'Total_National_Exam_Score']

# Only drop the names that are actually present
df = df.drop(columns=[c for c in leak_cols if c in df.columns])

# Fill remaining nulls with an explicit category
df['Health_Issue'] = df['Health_Issue'].fillna('No Issue')
df['Father_Education'] = df['Father_Education'].fillna('Unknown')
df['Mother_Education'] = df['Mother_Education'].fillna('Unknown')

# FINAL CHECK
print(df.shape)
print(df.head())
print("all columns:",df.columns)

In [None]:
# Column every feature is correlated against
TARGET = 'Overall_Average'

# Select numeric columns only
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Pearson correlation of each numeric feature with the target,
# ordered by absolute strength (strongest first)
corr_numeric = (
    df[num_cols]
    .corr()[TARGET]
    .drop(TARGET)
    .sort_values(key=abs, ascending=False)
)

print("üìä Numeric Feature Correlation with Target:")
print(corr_numeric)

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))

# The feature name is passed as hue (with the legend hidden) so that a
# palette can be applied per bar
sns.barplot(
    x=corr_numeric.values,
    y=corr_numeric.index,
    hue=corr_numeric.index,
    palette='coolwarm',
    legend=False
)

# Vertical reference line separating negative from positive correlations
plt.axvline(0, color='black', linewidth=1)
# Title is derived from TARGET so it always names the column actually used
plt.title(f'Correlation with {TARGET}')
plt.xlabel('Pearson Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Object-typed (categorical) feature columns, excluding the target itself
cat_cols = df.select_dtypes(include='object').columns.drop(TARGET, errors='ignore')

# For each categorical column: replace every category by the mean target
# value of that category, then correlate the encoded column with the target.
# Results are ordered by absolute strength (strongest first).
cat_corr = pd.Series({
    feature: df[feature].map(df.groupby(feature)[TARGET].mean()).corr(df[TARGET])
    for feature in cat_cols
}).sort_values(key=abs, ascending=False)

print("üìä Categorical Feature Association with Target:")
print(cat_corr)