In [13]:
# Importing the Libraries
import pandas as pd

In [14]:
# Constants shared by the cells below

# Directory prefixes (relative paths, each ending with '/')
PREPROCESSED_FILES_PATH = 'preprocessed_files/'
CURRICULUM_FILES_PATH = 'curriculum_files/'
TRAIN_DATA_PATH = 'train_data/'

# Columns the rows are sorted by when level numbers are assigned
FEATURE_NAMES = [
    'StandardizedMeanAgeOfAcquisition',
    'StandardizedMeanReadabilityGradeLevel',
    'StandardizedMeanReadTime',
    'CumulativeDifficultyScore',
]

# Number of difficulty levels per curriculum variant, and the matching sub-directory names
LEVELS = [5, 10, 20]
LEVEL_PATHS = [str(level_count) + '-LEVEL/' for level_count in LEVELS]

# Input workbooks and the curriculum workbooks derived from them
preprocessed_file_names = [
    corpus + '_preprocessed.xlsx'
    for corpus in (
        'aochildes',
        'bnc_spoken',
        'cbt',
        'children_stories',
        'gutenberg',
        'open_subtitles',
        'qed',
        'simple_wikipedia',
        'switchboard',
        'wikipedia',
    )
]
curriculum_file_names = ['curriculum_of_' + workbook for workbook in preprocessed_file_names]

# Filled by create_all() with one DataFrame per preprocessed workbook
preprocessed_files = []

# Output directories: one per curriculum, then one per (curriculum, level count)
CURRICULUMS = [TRAIN_DATA_PATH + curriculum_dir for curriculum_dir in ('C1/', 'C2/', 'C3/', 'C4/')]
C_PATHS = [
    [curriculum_root + level_dir for level_dir in LEVEL_PATHS]
    for curriculum_root in CURRICULUMS
]
C1_PATHS, C2_PATHS, C3_PATHS, C4_PATHS = C_PATHS

In [15]:
def create_curriculum_for_file_all_levels(df_in, file_name_in):
    """Add one level column per (level count, feature) pair and save the result as an Excel file.

    For every level count in ``LEVELS`` and every column in ``FEATURE_NAMES``
    the rows are sorted ascending by that column and cut into equally sized
    consecutive chunks numbered ``1..level count``. The chunk number is stored
    in a new column named ``<feature>-<level count>-Level``. Rows left over
    after the integer division are assigned to the last level.

    Args:
        df_in: DataFrame containing the ``FEATURE_NAMES`` columns. It is copied,
            so the caller's frame is not modified.
        file_name_in: Name of the source workbook; the output is written to
            ``CURRICULUM_FILES_PATH + 'curriculum_of_' + file_name_in``.
    """
    frame = df_in.copy()
    for n_levels in LEVELS:
        for feature in FEATURE_NAMES:
            # Ascending order of the feature: lowest values land in level 1
            frame = frame.sort_values(by=[feature])

            level_column = f'{feature}-{n_levels}-Level'
            frame[level_column] = 0
            column_pos = frame.columns.get_loc(level_column)

            n_rows = len(frame)
            chunk = n_rows // n_levels

            # Consecutive chunks of `chunk` rows receive level numbers 1..n_levels
            for level_no in range(1, n_levels + 1):
                chunk_stop = level_no * chunk
                frame.iloc[chunk_stop - chunk:chunk_stop, column_pos] = level_no

            # Rows beyond the last full chunk (n_rows not divisible by n_levels)
            # are folded into the highest level
            first_leftover = n_levels * chunk
            if first_leftover < n_rows:
                frame.iloc[first_leftover:, column_pos] = n_levels

    # The saved workbook keeps the row order of the final sort
    frame.to_excel(CURRICULUM_FILES_PATH + 'curriculum_of_' + file_name_in, index=False)
def create_all():
    """Load every preprocessed workbook and write its curriculum workbook.

    Reads each file in ``preprocessed_file_names`` from
    ``PREPROCESSED_FILES_PATH`` into the module-level ``preprocessed_files``
    list, then calls ``create_curriculum_for_file_all_levels`` for each
    (frame, file name) pair.
    """
    # Start from an empty list: this function may be run more than once in the
    # same kernel, and appending to leftover entries would make the list longer
    # than preprocessed_file_names (IndexError when pairing frames with names).
    preprocessed_files.clear()
    for file_name in preprocessed_file_names:
        preprocessed_files.append(pd.read_excel(PREPROCESSED_FILES_PATH + file_name))

    for frame, file_name in zip(preprocessed_files, preprocessed_file_names):
        create_curriculum_for_file_all_levels(frame, file_name)

In [16]:
def merge_curriculums(level: int):
    """Write one ``.train`` file per (curriculum, level number) for the given level count.

    For each feature in ``FEATURE_NAMES`` (paired positionally with the
    directories in ``C_PATHS``) and each level number ``1..level``, the rows of
    every curriculum workbook whose ``<feature>-<level>-Level`` column equals
    that number are selected. Their ``MergedLines`` text is split on newlines,
    stripped, and empty lines are dropped. The lines from all workbooks are
    concatenated in ``curriculum_file_names`` order and saved as
    ``C<k>-<level>-Level-<n>.train`` in the directory for that curriculum and
    level count.

    Args:
        level: Number of difficulty levels; must be one of ``LEVELS``.

    Raises:
        ValueError: If ``level`` is not one of ``LEVELS``. (Previously an
            unsupported value printed 'Invalid level' and called ``exit()``,
            but only after the column lookup had already been attempted.)
    """
    # Validate before doing any work. The position of `level` in LEVELS selects
    # the matching entry of each C_PATHS row (LEVELS and LEVEL_PATHS are
    # declared in the same order).
    if level not in LEVELS:
        raise ValueError(f'Invalid level: {level!r} (expected one of {LEVELS})')
    level_index = LEVELS.index(level)

    level_columns = [f'{feature_name}-{level}-Level' for feature_name in FEATURE_NAMES]

    # Read every workbook once up front instead of once per (level number,
    # feature) pair; the frames are only read from below, never modified.
    curriculum_frames = [
        pd.read_excel(CURRICULUM_FILES_PATH + curriculum_file)
        for curriculum_file in curriculum_file_names
    ]

    for level_number in range(1, level + 1):
        for curriculum_number, (level_column, level_dirs) in enumerate(zip(level_columns, C_PATHS), start=1):
            parts = []
            for frame in curriculum_frames:
                # .loc + dropna() yields a new frame, so the column rewrites
                # below never touch (or warn about) a slice of `frame`.
                selected = frame.loc[frame[level_column] == level_number, ['MergedLines']].dropna()
                selected = selected.assign(MergedLines=selected['MergedLines'].str.split('\n'))
                selected = selected.explode('MergedLines', ignore_index=True)
                selected = selected.assign(MergedLines=selected['MergedLines'].str.strip())
                # Drop lines that are empty after stripping
                selected = selected[~(selected == "").all(axis=1)]
                parts.append(selected)
            merged = pd.concat(parts)

            # NOTE(review): to_csv applies CSV quoting, so a line containing a
            # comma or a double quote is written wrapped in quotes - confirm
            # that consumers of the .train files expect this.
            merged.to_csv(
                level_dirs[level_index] + f'C{curriculum_number}-{level}-Level-{level_number}.train',
                index=False,
                header=False,
            )

In [17]:
def main():
    """Run the full pipeline: build the curriculum workbooks, then the .train files.

    ``create_all()`` writes one curriculum workbook per preprocessed file;
    ``merge_curriculums()`` is then called once for every level count in
    ``LEVELS``.
    """
    create_all()
    # Iterate LEVELS rather than repeating its values here, so the level
    # columns created by create_all() and the merges requested below cannot
    # drift apart.
    for level in LEVELS:
        merge_curriculums(level)

In [18]:
# Run the whole pipeline: main() calls create_all() and then
# merge_curriculums() for each level count.
main()