In [1]:
# Load Libraries
import pandas as pd
import numpy as np
import re
# Show every column when a DataFrame is displayed (None = no column limit)
pd.options.display.max_columns = None


### Student Teacher Grade

In [2]:
# Load data
# Columns whose dtype is set explicitly at read time instead of being inferred
dtype_fix = dict(
    GradeLevel='category',
    CourseNumber='str',
    DOECourseNumber='str',
    StartPeriodCode='str',
    EndPeriodCode='str',
    AttemptedCreditOverrideReason='str',
    EarnedCreditOverrideReason='str',
    mask_studentpersonkey='str',
)

# One export per school year, listed oldest to newest
sg_files = [
    'Student Teacher Grade 2022.csv',
    'Student Teacher Grade 2023.csv',
    'Student Teacher Grade 2024.csv',
    'Student Teacher Grade 2025.csv',
]
sg2022, sg2023, sg2024, sg2025 = [
    pd.read_csv(path, dtype=dtype_fix, low_memory=False) for path in sg_files
]

# Combine files into a single frame with a fresh 0..n-1 index
sg = pd.concat([sg2022, sg2023, sg2024, sg2025], ignore_index=True)

In [3]:
# Remove unused columns
colums_to_remove = [
    'RoomNumber',
    'SectionIdentifier',
    'SubjectAreaCreditCode',
    'SubjectAreaCreditDesc',
    'AttemptedCreditOverrideFlag',
    'AttemptedCreditOverrideReason',
    'EarnedCreditOverrideFlag',
    'EarnedCreditOverrideReason',
    'InstructionalSettingCode',
]
# errors='ignore' skips any listed column that is not present in the frame
sg = sg.drop(columns=colums_to_remove, errors='ignore')
sg.shape

(8502796, 23)

In [11]:
# Data cleaning

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    Values that are not strings (e.g. missing values) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Non-greedy match: each '<' is paired with the nearest following '>'
    # on the same line ('.' does not cross newlines).
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

def truncate_text(text, length=40):
    """Shorten ``text`` to at most ``length`` characters.

    Strings longer than ``length`` are cut and end with ``"..."`` so the
    result is exactly ``length`` characters long. When ``length`` is too
    small to hold the ellipsis (fewer than 3), the text is simply cut to
    ``length`` characters (an empty string for zero or negative values).

    Parameters
    ----------
    text : any
        Value to shorten. Non-string values are returned unchanged.
    length : int, default 40
        Maximum number of characters in the result.

    Returns
    -------
    The shortened string, or ``text`` itself when it is not a string or
    already fits within ``length``.
    """
    if not isinstance(text, str) or len(text) <= length:
        return text

    ellipsis = "..."
    if length < len(ellipsis):
        # A negative slice bound would count from the end of the string and
        # return more than `length` characters, so hard-cut instead.
        return text[:max(length, 0)]
    return text[:length - len(ellipsis)] + ellipsis

# Strip markup first, then shorten, so the length limit applies to the visible text
sg['CourseDesc'] = (
    sg['CourseDesc']
    .apply(remove_html_tags)
    .apply(truncate_text)
)

In [12]:
# Write the combined, cleaned grade records without the row index
sg_output_path = 'StudentTeacherGradeCombined.csv'
sg.to_csv(sg_output_path, index=False)

### Illuminate Education 

In [5]:
# One Illuminate export per school year, listed oldest to newest; each is read with the cp1252 codec
illuminate_files = [
    'IlluminateData2022.csv',
    'IlluminateData2023.csv',
    'IlluminateData2024.csv',
    'IlluminateData2025.csv',
]
illuminate2022, illuminate2023, illuminate2024, illuminate2025 = [
    pd.read_csv(path, encoding="cp1252", low_memory=False) for path in illuminate_files
]

# Stack the yearly frames into one with a fresh 0..n-1 index
illuminate = pd.concat(
    [illuminate2022, illuminate2023, illuminate2024, illuminate2025],
    ignore_index=True,
)

In [6]:
# Preview the first five rows of the combined Illuminate data
illuminate.head(n=5)

Unnamed: 0,schoolyearnumberspring,unitnumber,Mask_StudentPersonkey,title,AssessmentId,responsedatevalue,DateValue,Response_points,Response_points_possible,Response_percent_correct,Department,rn,CurrentSchoolName,standard_id,StandardStateNumber,Standard_Subject,Standard_points,Standard_points_possible,Standard_percent_correct,condition,pointsvarchar,categorytitle,stdgrouping,SchoolId,GradeLevelDuringUnitTest,ContinuousAchievementLevel,AssessmentGradeLevel,CurrentSchoolDetailFCSID
0,2022,1,763538,math,187897,2021-09-19 00:00:00,2021-09-19 00:00:00,5.0,15.0,33.33,math,1,A. Philip Randolph Elementary School,91332,MGSE3.NBT.1,Mathematics,1.0,7.0,14.29,Reteach,1/7,Grade 3,MGSE3.NBT.1 Use place value understanding to r...,29,3,On Level,3,492
1,2022,6,862425,language arts,232138,2022-05-11 00:00:00,2022-05-11 00:00:00,6.0,8.0,75.0,language arts,1,Bear Creek Middle,131492,ELAGSE6RI6_3.0_2,English Language Arts,1.0,1.0,100.0,Extension,1/1,Grades: 6-8,ELAGSE6RI6_3.0_2 ELAGSE6RI6_3.0_2 Determine an...,37,6,On Level,6,696
2,2022,4,582651,math,191184,2022-01-28 00:00:00,2022-01-28 00:00:00,16.0,17.0,94.12,math,1,Oakley Elementary,90369,4,Mathematics,2.0,2.0,100.0,Extension,2/2,Mathematical Practices,4 Model with mathematics.,108,3,On Level,3,615
3,2022,2,584570,language arts,195554,2021-10-29 00:00:00,2021-10-29 00:00:00,7.0,10.0,70.0,language arts,1,Conley Hills Elementary,131615,ELAGSE4RI2_3.0_3,English Language Arts,1.0,1.0,100.0,Extension,1/1,Grade 4,ELAGSE4RI2_3.0_3 ELAGSE4RI2_3.0_3 Summarize th...,17,4,On Level,4,120
4,2022,6,550130,math,227708,2022-02-23 00:00:00,2022-02-23 00:00:00,13.0,15.0,86.67,math,1,Crabapple Middle,91479,MGSE8.F.5,Mathematics,2.0,2.0,100.0,Extension,2/2,Grade 8,MGSE8.F.5 Describe qualitatively the functiona...,64,7,Accelerated,8,686


In [7]:
# Count missing values per column
illuminate.isna().sum()

schoolyearnumberspring              0
unitnumber                          0
Mask_StudentPersonkey               0
title                               0
AssessmentId                        0
responsedatevalue                   0
DateValue                           0
Response_points                     0
Response_points_possible            0
Response_percent_correct            0
Department                          0
rn                                  0
CurrentSchoolName                   0
standard_id                         0
StandardStateNumber                 0
Standard_Subject                    0
Standard_points                     0
Standard_points_possible            0
Standard_percent_correct            0
condition                           0
pointsvarchar                       0
categorytitle                       0
stdgrouping                         0
SchoolId                            0
GradeLevelDuringUnitTest          427
ContinuousAchievementLevel    3736413
AssessmentGr

In [8]:
def clean_title_column(df):
    """Standardize the subject labels in the ``title`` and ``Department`` columns.

    Each label is stripped of surrounding whitespace and then looked up
    case-insensitively in a table of canonical spellings. Labels with no
    entry in the table fall back to ``str.title()``. Missing or non-string
    values are left as they are instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``title`` and ``Department`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns cleaned.
    """
    # Canonical spellings keyed by the lowercase label, so one entry covers
    # every capitalization of a subject.
    title_mapping = {
        "language arts": "Language Arts",
        "math": "Math",
        "social studies": "Social Studies",
        "science": "Science",
        # Add more mappings as needed
    }

    def standardize(label):
        # Missing / non-text values have no .lower() or .title(); pass them through
        if not isinstance(label, str):
            return label
        return title_mapping.get(label.lower(), label.title())

    for column in ('title', 'Department'):
        # Strip whitespace first so padded labels still match the lookup table
        df[column] = df[column].str.strip().apply(standardize)
    return df
# Standardize the title / Department labels on the combined Illuminate frame
illuminate = illuminate.pipe(clean_title_column)


def clean_grade_levels(df):
    """Normalize the labels in ``GradeLevelDuringUnitTest`` and ``AssessmentGradeLevel``.

    Per value:
    * missing values are kept as missing (not converted to the text ``'nan'``);
    * ``'KK'``, ``'Kindergarten'`` and ``'K'`` (any case) become ``'K'``;
    * numeric values or numeric strings become zero-padded two-digit
      strings (``3``, ``3.0`` and ``'3'`` all become ``'03'``);
    * anything else is returned as its stripped string form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both grade-level columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns cleaned.
    """
    kindergarten_labels = {'KK', 'KINDERGARTEN', 'K'}

    def format_grade(x):
        # Keep missing values missing; str(nan) would otherwise turn them
        # into the literal text 'nan' and hide them from null checks.
        if pd.isna(x):
            return x

        x_str = str(x).strip()

        # Handle Kindergarten labels
        if x_str.upper() in kindergarten_labels:
            return 'K'

        try:
            # Go through float so 3.0 / '3.0' collapse to the integer 3
            return f"{int(float(x_str)):02d}"
        except (ValueError, OverflowError):
            # Not a finite number (non-numeric label, 'nan', 'inf'):
            # keep the stripped text
            return x_str

    for column in ('GradeLevelDuringUnitTest', 'AssessmentGradeLevel'):
        df[column] = df[column].apply(format_grade)
    return df
# Standardize both grade-level columns on the combined Illuminate frame
illuminate = illuminate.pipe(clean_grade_levels)

In [10]:
# Write the combined, cleaned Illuminate data without the row index
illuminate_output_path = 'IlluminateCombined.csv'
illuminate.to_csv(illuminate_output_path, index=False)