In [None]:
import os
import re
import glob

# Announce what this notebook does before any cleaning cell runs.
print("Preprocessing Notebook: Cleaning .txt and .m files...")

In [None]:
# 1) Clean .txt files into processed_txt_files

def clean_text(content):
    """Return ``content`` reduced to ASCII with normalized spacing.

    Each run of non-ASCII characters is replaced by a single space, then
    every run of whitespace is collapsed to one space, and finally leading
    and trailing whitespace is trimmed.
    """
    text = content
    # Order matters: non-ASCII runs become spaces first so that the
    # whitespace pass can merge them with any neighbouring whitespace.
    for pattern in (r'[^\x00-\x7F]+', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.strip()

# Source folder and destination folder for the .txt cleaning pass.
input_folder = "./files"
output_folder_txt = "./files/processed_txt_files"
os.makedirs(output_folder_txt, exist_ok=True)

# Keep only regular files: a directory whose name ends in '.txt' would make
# open() raise below. Sorting gives a deterministic processing/log order,
# because os.listdir returns entries in arbitrary order.
txt_files = sorted(
    name for name in os.listdir(input_folder)
    if name.endswith('.txt') and os.path.isfile(os.path.join(input_folder, name))
)

for txt_file in txt_files:
    txt_path = os.path.join(input_folder, txt_file)
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(txt_path, 'r', encoding='utf-8', errors='ignore') as f:
        raw = f.read()
    cleaned_content = clean_text(raw)
    # The cleaned copy keeps the original file name inside the output folder.
    out_path = os.path.join(output_folder_txt, txt_file)
    with open(out_path, 'w', encoding='utf-8') as out_f:
        out_f.write(cleaned_content)
    print(f"Processed TXT: {txt_file}")

print("All .txt files processed into 'processed_txt_files' folder!")

In [None]:
# 2) Clean .m files into processed_m_files

def _is_kept_matlab_line(stripped_line):
    """Return True if a stripped MATLAB source line should be kept.

    A line is kept when it starts with '%', starts with 'function', or
    opens with ``<word> =``. Note that the 'function' test is a plain prefix
    check, so any line beginning with those characters is kept.
    """
    return (
        stripped_line.startswith(('%', 'function'))
        or re.match(r'^\s*\w+\s*=\s*', stripped_line) is not None
    )


def process_matlab_files(folder_path):
    """Write filtered copies of the .m files found directly in ``folder_path``.

    For every regular file whose name ends in '.m', a file named
    ``cleaned_<name>`` is written to ``<folder_path>/processed_m_files``
    (created if missing). Each output contains only the stripped lines that
    are comments, start with 'function', or open with ``<word> =``, joined
    with newlines and without a trailing newline.

    Args:
        folder_path: Directory to scan; sub-directories are not searched.

    Returns:
        None. Progress is reported with one printed line per file.
    """
    out_folder = os.path.join(folder_path, "processed_m_files")
    os.makedirs(out_folder, exist_ok=True)

    # Only regular files: a directory named like 'pkg.m' would make open()
    # raise. Sorted because os.listdir returns entries in arbitrary order.
    matlab_files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith('.m') and os.path.isfile(os.path.join(folder_path, name))
    )

    for mf in matlab_files:
        in_path = os.path.join(folder_path, mf)
        out_path = os.path.join(out_folder, f"cleaned_{mf}")

        # errors='ignore' silently drops bytes that are not valid UTF-8.
        with open(in_path, 'r', encoding='utf-8', errors='ignore') as f:
            stripped_lines = [line.strip() for line in f]

        cleaned_lines = [ln for ln in stripped_lines if _is_kept_matlab_line(ln)]

        with open(out_path, 'w', encoding='utf-8') as out_f:
            out_f.write("\n".join(cleaned_lines))

        print(f"Processed M-file: {mf}")

# Run the .m cleaning pass over the data folder, then report completion.
process_matlab_files(folder_path="./files")
print("All .m files processed into 'processed_m_files' folder!")