importing student data files.zip

In [1]:
import zipfile
import os

zip_path = "student_files.zip"
extract_folder = "student_files"
os.makedirs(extract_folder, exist_ok=True)

# Extract the zip
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f"All files extracted to: {extract_folder}")


All files extracted to: student_files


In [None]:
cleaning the student files

In [2]:
import pandas as pd
import os
import glob
import re

# Folders
input_folder = 'student_files'          # Folder with original files
output_folder = 'cleaned_student_files' # Folder to save cleaned files
os.makedirs(output_folder, exist_ok=True)

# Function to detect empty cells (robustly)
def is_empty(cell):
    if pd.isna(cell):
        return True
    cleaned = re.sub(r'\s+', '', str(cell))  # remove all whitespace and invisible chars
    return cleaned == ""

# Get all Excel files
excel_files = glob.glob(os.path.join(input_folder, '*.xlsx'))

for file in excel_files:
    df = pd.read_excel(file)

    # Strip column names
    df.columns = df.columns.str.strip()

    # Check that both Name and Username columns exist
    if 'Name' in df.columns and 'Username' in df.columns:
        # Remove rows where both Name and Username are empty
        mask = ~(df['Name'].apply(is_empty) & df['Username'].apply(is_empty))
        df_cleaned = df[mask].reset_index(drop=True)

        # Save cleaned file
        filename = os.path.basename(file)
        output_path = os.path.join(output_folder, filename)
        df_cleaned.to_excel(output_path, index=False)
        print(f"Processed and saved: {output_path}")
    else:
        print(f"Skipping {file}: missing Name or Username columns")

print("All files processed successfully!")



All files processed successfully!


In [None]:
converting the cleaned student files to anonymized output

In [3]:
import pandas as pd
import os
import glob
import random
import string
import re

# Folder containing your files
input_folder = 'cleaned_student_files'  # adjust to your folder
output_folder = 'anonymized_output'
os.makedirs(output_folder, exist_ok=True)

# Function to generate fake names
def fake_name(index):
    return f"Student {index}"

# Function to generate fake usernames
def fake_username(index):
    letters = ''.join(random.choices(string.ascii_lowercase, k=3))
    return f"user{index}_{letters}"

# Function to detect empty cells (robustly)
def is_empty(cell):
    if pd.isna(cell):
        return True
    # Remove all whitespace and invisible characters
    cleaned = re.sub(r'\s+', '', str(cell))
    return cleaned == ""

# Process all Excel files
excel_files = glob.glob(os.path.join(input_folder, '*.xlsx'))

for file in excel_files:
    df = pd.read_excel(file)

    # Strip whitespace from column names
    df.columns = df.columns.str.strip()

    # Only process if both required columns exist
    if 'Name' in df.columns and 'Username' in df.columns:
        # Remove rows where BOTH Name and Username are empty
        mask = ~(df['Name'].apply(is_empty) & df['Username'].apply(is_empty))
        df = df[mask].reset_index(drop=True)

        # Anonymize Name and Username
        df['Name'] = [fake_name(i+1) for i in range(len(df))]
        df['Username'] = [fake_username(i+1) for i in range(len(df))]

        # Save to output folder
        filename = os.path.basename(file)
        output_path = os.path.join(output_folder, filename)
        df.to_excel(output_path, index=False)
        print(f"Anonymized file written to: {output_path}")
    else:
        print(f"Skipping {file}: missing Name or Username columns.")

print("All files processed successfully!")


All files processed successfully!
