**This is part 2/2 for the notebooks.**

This section contains the complete code used in the final product. For additional information, see `1_data_training.ipynb`.

In [None]:
import csv
import re
import os

# These texts in the CEFRTexts folder are supplemented with additional texts.
def extract_metadata(text):
    """Pull the CEFR rating and the learner's text out of one raw file.

    Args:
        text: Full contents of a single CEFR text file.

    Returns:
        A dict with the keys ``"Overall CEFR rating"`` and ``"Learner text"``.
        A field that cannot be found is returned as an empty string.
    """
    # Define patterns for each metadata field. This is altered after the original attempt.
    patterns = {
        # `.` does not cross newlines, so this captures the rest of the line.
        "Overall CEFR rating": r"Overall CEFR rating: (.+)",
        # Greedy capture of everything after the label. The previous lazy
        # `+?` followed by `$` under re.MULTILINE stopped at the end of the
        # first line, truncating multi-line learner texts.
        "Learner text": r"Learner text:\s+([\s\S]+)",
    }

    extracted_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        extracted_data[key] = match.group(1).strip() if match else ""
    return extracted_data

def process_folder(input_folder, output_file):
    """Extract metadata from every .txt file in a folder into a single CSV.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.txt`` files.
        output_file: Path of the CSV file to write; it is opened in ``'w'``
            mode, so an existing file is overwritten.

    Returns:
        None. When no ``.txt`` files are found, a notice is printed and no
        CSV is written.
    """
    # os.listdir() yields entries in arbitrary order; sort them so the CSV
    # rows are reproducible across runs and platforms. Directories whose
    # names happen to end in ".txt" are skipped because open() would fail.
    files = sorted(
        f for f in os.listdir(input_folder)
        if f.endswith('.txt') and os.path.isfile(os.path.join(input_folder, f))
    )
    if not files:
        print("No .txt files found in the specified folder.")
        return

    # Process each file and collect data
    all_data = []
    for file in files:
        file_path = os.path.join(input_folder, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        metadata = extract_metadata(text)
        metadata['Filename'] = file  # Add filename for reference
        all_data.append(metadata)

    # Write all data to a single CSV
    with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
        # Every row is built the same way, so the first row's keys describe
        # the columns of all rows.
        fieldnames = list(all_data[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

    print(f"Data from {len(files)} files extracted and written to {output_file}")

def main():
    """Run the folder-to-CSV extraction with this notebook's fixed paths."""
    source_dir = "CEFRTexts"  # folder holding the .txt files
    target_csv = "CEFRRaw.csv"  # CSV file that will be produced
    process_folder(source_dir, target_csv)

# Run the extraction only when this code is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Data from 1073 files extracted and written to CEFRRaw.csv
