# Data Cleaning & Anonymisation for AI Assessment

This notebook simulates:
- Merging student metadata with assessment scores
- Dropping incomplete records
- Filtering invalid scores
- Anonymising names (`student_name` is overwritten with the placeholder `ANON`; `student_id` is kept)
- Exporting clean data for AI pipelines

In [1]:
from pathlib import Path

import pandas as pd

# Load the two raw inputs. Both files carry a `student_id` column, which
# is the key used to join them later in the notebook.
# Student metadata: student_id, student_name, school, year_group.
meta = pd.read_csv("../data/student_metadata.csv")
# Assessment results: student_id, exam_score, submission_date.
scores = pd.read_csv("../data/exam_scores.csv")

In [2]:
# Preview the first five rows of each input as a (scores, meta) tuple.
# NOTE(review): the metadata preview prints student names — clear this
# output before sharing if the data is not synthetic.
(scores.head(5), meta.head(5))

(  student_id  exam_score submission_date
 0      S0001        81.8      2024-06-23
 1      S0002        81.9      2025-03-13
 2      S0003        61.4      2025-03-11
 3      S0004        75.1      2024-07-10
 4      S0005        39.1      2024-04-29,
   student_id       student_name                     school year_group
 0      S0001  Alejandro Escobar              Jenkins-Ortiz    Year 10
 1      S0002        Shannon Orr              Nelson-Miller    Year 12
 2      S0003         Riley Reid  Delgado, Yang and Padilla    Year 11
 3      S0004    Christine Brown                 Newman LLC    Year 10
 4      S0005    Jennifer Mosley          Rodriguez-Mcclain    Year 10)

In [3]:
# Join scores to metadata on the shared key. An inner join keeps only the
# student_ids present in BOTH frames; unmatched rows are silently dropped.
merged = scores.merge(meta, how="inner", on="student_id")
print(f"Merged shape: {merged.shape}")

Merged shape: (50, 6)


In [4]:
# Build the cleaned frame in a single chain so that every step returns a
# new DataFrame. Assigning a column on a boolean-filtered frame (the
# previous approach) can raise SettingWithCopyWarning; `.assign` avoids
# that and keeps this cell idempotent with respect to `merged`.
cleaned = (
    merged
    # Drop any record with a missing value in any column.
    .dropna()
    # Keep only scores inside the closed interval [0, 100].
    .loc[lambda frame: frame["exam_score"].between(0, 100)]
    # Overwrite every name with a constant placeholder; the column keeps
    # its original position.
    .assign(student_name="ANON")
)
# NOTE(review): only `student_name` is masked. `student_id`, `school` and
# `year_group` are retained, so rows can still be linked back to the
# metadata file — confirm this is acceptable for the intended use.

In [5]:
# Write the cleaned data to disk without the DataFrame index.
# `to_csv` raises OSError if the target directory is missing, so create
# it first; this keeps the notebook runnable on a fresh checkout.
output_path = Path("../output/cleaned_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
cleaned.to_csv(output_path, index=False)
# Show the first rows as a final visual check of the exported content.
cleaned.head()

Unnamed: 0,student_id,exam_score,submission_date,student_name,school,year_group
0,S0001,81.8,2024-06-23,ANON,Jenkins-Ortiz,Year 10
1,S0002,81.9,2025-03-13,ANON,Nelson-Miller,Year 12
2,S0003,61.4,2025-03-11,ANON,"Delgado, Yang and Padilla",Year 11
3,S0004,75.1,2024-07-10,ANON,Newman LLC,Year 10
4,S0005,39.1,2024-04-29,ANON,Rodriguez-Mcclain,Year 10
