# In [None]:
import os
import pandas as pd
import spacy
from glob import glob
from tqdm import tqdm
import lftk

# Input location: every "*.train" file directly inside this folder is processed
data_folder = "raw_data"
files = glob(f"{data_folder}/*.train")
files.sort()  # alphabetical order, so files are always handled in the same sequence

# spaCy pipeline used to parse each chunk of text before feature extraction
# (any trained spaCy pipeline can be substituted here)
nlp = spacy.load("en_core_web_sm")

# LFTK feature keys to extract, listed by the aggregate they feed downstream.
# Keep this order: downstream code may address these columns by position.
to_be_extracted = [
    # age-of-acquisition features
    "a_bry_pw", "a_kup_pw",
    # readability grade-level formulas
    "fkgl", "fogi", "cole", "auto",
    # reading-time estimates (fast / average / slow)
    "rt_fast", "rt_average", "rt_slow",
]

def _standardize(series: pd.Series) -> pd.Series:
    """Return the z-score of *series* using the sample standard deviation.

    When the standard deviation is undefined (a single row) or zero (all
    values identical) every value equals the mean, so zeros are returned
    instead of the NaNs a plain division would produce.
    """
    spread = series.std()
    if pd.isna(spread) or spread == 0:
        return pd.Series(0.0, index=series.index)
    return (series - series.mean()) / spread


# Feature columns feeding each aggregate. Selecting by name rather than by
# position keeps the means correct whatever key order the extractor returns.
aoa_columns = ["a_bry_pw", "a_kup_pw"]
grade_columns = ["fkgl", "fogi", "cole", "auto"]
read_time_columns = ["rt_fast", "rt_average", "rt_slow"]

# Number of consecutive input lines merged into one scored chunk
lines_per_group = 32

# Iterate over all the files
for single_file in files:
    print(f"Processing {single_file}")

    # Explicit encoding so decoding does not depend on the platform default
    with open(single_file, 'r', encoding="utf-8") as f:
        lines = f.readlines()

    # Group lines by 32 (the last group may be shorter)
    grouped_lines = [
        ' '.join(lines[i:i + lines_per_group])
        for i in range(0, len(lines), lines_per_group)
    ]

    # Nothing to score in an empty file; skip it rather than fail on the
    # missing feature columns below
    if not grouped_lines:
        print(f"Skipping {single_file}: file is empty")
        continue

    # Extract the requested features for each group (one dict per group)
    all_extracted_lines = []
    for chunk in tqdm(grouped_lines):
        doc = nlp(chunk)
        extractor = lftk.Extractor(docs=doc)
        extractor.customize(stop_words=True, punctuations=False, round_decimal=3)
        all_extracted_lines.append(extractor.extract(features=to_be_extracted))

    df = pd.DataFrame(all_extracted_lines)
    df['MergedLines'] = grouped_lines

    # Per-chunk mean of each feature family
    df['MeanAgeOfAcquisition'] = df[aoa_columns].mean(axis=1)
    df['MeanReadabilityGradeLevel'] = df[grade_columns].mean(axis=1)
    df['MeanReadTime'] = df[read_time_columns].mean(axis=1)

    # Standardize the means (z-scores are computed within this file only)
    df['StandardizedMeanAgeOfAcquisition'] = _standardize(df['MeanAgeOfAcquisition'])
    df['StandardizedMeanReadabilityGradeLevel'] = _standardize(df['MeanReadabilityGradeLevel'])
    df['StandardizedMeanReadTime'] = _standardize(df['MeanReadTime'])

    # Calculate the cumulative score as the unweighted sum of the z-scores
    df['CumulativeDifficultyScore'] = (
        df['StandardizedMeanAgeOfAcquisition']
        + df['StandardizedMeanReadabilityGradeLevel']
        + df['StandardizedMeanReadTime']
    )

    # Drop the raw feature columns; only the aggregates are exported
    df.drop(columns=aoa_columns + grade_columns + read_time_columns, inplace=True)

    # Generate output file name and path. basename/splitext handle any path
    # separator and only touch the real extension, not every ".train" in the name.
    stem, _ = os.path.splitext(os.path.basename(single_file))
    output_file = f"{stem}_preprocessed.xlsx"
    output_dir = "preprocessed_files/"
    output_path = os.path.join(output_dir, output_file)

    # to_excel does not create missing directories
    os.makedirs(output_dir, exist_ok=True)

    # Save to Excel (the row index is kept as the chunk number)
    df.to_excel(output_path, index=True)
    print(f"Saved to {output_path}")
