# Descriptive and Summary Statistics


Establish a **baseline statistical understanding** of Philippine public school teachers and students, including distributions, central tendencies, and data quality checks. This notebook serves as the foundation for all subsequent analyses.

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation defaults
pd.options.display.max_columns = None  # never truncate columns when rendering frames
sns.set(style="whitegrid")

In [None]:
# Source data (Kaggle):
# https://www.kaggle.com/datasets/franksebastiancayaco/philippine-public-school-teachers-and-students

DATA_PATH = "../data/raw/philippine_public_school_teachers_students.csv"

# Read the raw CSV into the working frame used by every cell below,
# then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

df.head(n=5)

In [None]:
# Structural overview of the loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Column labels as a plain Python list, for quick reference when selecting fields.
df.columns.to_list()

In [None]:
# Per-column null tally, plus the same tally as a percentage of all rows,
# ordered so the most incomplete columns come first.
null_counts = df.isnull().sum()

missing_summary = pd.DataFrame(
    {
        "Missing_Count": null_counts,
        "Missing_Percent": (null_counts / len(df)) * 100,
    }
).sort_values("Missing_Percent", ascending=False)

missing_summary

In [None]:
# Cell-level map of nulls: one row per record, one column per field.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Data Heatmap")
plt.show()

In [None]:
# Treat year-like columns as categorical labels rather than quantities.
# Missing entries are kept as NaN: a bare astype(str) would turn them into the
# literal string "nan" and hide them from every later null check.
for col in df.columns:
    if "year" in col.lower():
        df[col] = df[col].astype(str).where(df[col].notna())

# Coerce the count columns used by the cells below to numeric. If the CSV
# contains stray text in these fields, pandas reads them as object dtype;
# unparseable entries become NaN instead of breaking sums, means and plots.
for col in ("students", "teachers"):
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Names of all numeric columns after coercion, kept for later reference.
numeric_cols = df.select_dtypes(include="number").columns

df.dtypes

In [None]:
# Descriptive statistics, transposed so each variable is a row.
df.describe().transpose()

In [None]:
# Headline totals and per-record averages for the two count columns.
student_counts, teacher_counts = df["students"], df["teachers"]

summary_metrics = {
    "Total Students": student_counts.sum(),
    "Total Teachers": teacher_counts.sum(),
    "Average Students per Record": student_counts.mean(),
    "Average Teachers per Record": teacher_counts.mean(),
}

# One row per metric, with a single "Value" column.
pd.DataFrame.from_dict(summary_metrics, orient="index", columns=["Value"])


In [None]:
# Histogram (30 bins) with a KDE overlay for the student counts.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["students"], bins=30, kde=True, ax=ax)
ax.set(
    title="Distribution of Student Counts",
    xlabel="Number of Students",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Horizontal boxplot of student counts to expose spread and outliers.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df["students"], ax=ax)
ax.set_title("Student Count Boxplot")
plt.show()

In [None]:
# Histogram (30 bins) with a KDE overlay for the teacher counts.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["teachers"], bins=30, kde=True, ax=ax)
ax.set(
    title="Distribution of Teacher Counts",
    xlabel="Number of Teachers",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Horizontal boxplot of teacher counts to expose spread and outliers.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df["teachers"], ax=ax)
ax.set_title("Teacher Count Boxplot")
plt.show()

In [None]:
# Per-region statistics for both count columns, rounded to two decimals.
counts_by_region = df.groupby("region")[["students", "teachers"]]
regional_summary = counts_by_region.agg(
    ["mean", "median", "min", "max", "sum"]
).round(2)

regional_summary

In [None]:
# Per-school-category statistics for both count columns, rounded to two decimals.
counts_by_category = df.groupby("school_category")[["students", "teachers"]]
category_summary = counts_by_category.agg(["mean", "median", "sum"]).round(2)

category_summary

### Key Initial Observations

1. Student and teacher counts exhibit right-skewed distributions, indicating
   concentration in larger school divisions or regions.
2. Preliminary summaries suggest variability in staffing relative to enrollment,
   motivating teacher–student ratio analysis in subsequent notebooks.
3. Some regions and school categories show substantial variance, which warrants
   inequality and geographic analysis.
4. Missing values are either minimal or confined to specific fields (see the
   missing-value summary above) and will be addressed during preprocessing steps.

These descriptive findings provide the empirical baseline for trend, ratio,
and policy analyses that follow.