## Install Datasets

**JFLEG Dataset**

```bash
git clone https://huggingface.co/datasets/jhu-clsp/jfleg
```

**Lang-8 Dataset**

The Lang-8 Learner Corpora are distributed on request: fill in the form linked below to obtain the download link.
[Request the Lang-8 corpora](https://docs.google.com/forms/d/17gZZsC_rnaACMXmPiab3kjqBEtRHPMz0UG9Dk-x_F0k)

**CoLA Dataset**

[Download the CoLA dataset](https://nyu-mll.github.io/CoLA/cola_public_1.1.zip)


## Preprocess Datasets

In [8]:
import os
import re

import pandas as pd
from tqdm import tqdm

def read_dataset(base_path, file_name, columns, usecols=None, explode_col=None, rename_cols=None, sep=','):
    """Load one raw dataset file into a DataFrame and optionally reshape it.

    Args:
        base_path: Directory that ``file_name`` is resolved against.
        file_name: Relative file name; its suffix selects the reader
            (``.csv``/``.tsv`` -> ``pd.read_csv``, ``.parquet`` -> ``pd.read_parquet``).
        columns: Column names assigned to CSV/TSV data, which is read with
            ``header=None`` (the first row is treated as data). Not used for
            parquet files.
        usecols: Optional list of columns to keep, applied after loading.
        explode_col: Optional column whose list-like cells are expanded to
            one row per element; the index is renumbered afterwards.
        rename_cols: Optional ``{old: new}`` mapping applied last.
        sep: Field delimiter for CSV/TSV files.

    Returns:
        The loaded (and optionally filtered / exploded / renamed) DataFrame.

    Raises:
        ValueError: If ``file_name`` has none of the supported suffixes.
    """
    source = os.path.join(base_path, file_name)

    # Pick the reader from the file suffix.
    if file_name.endswith(('.csv', '.tsv')):
        frame = pd.read_csv(source, sep=sep, names=columns, header=None)
    elif file_name.endswith('.parquet'):
        frame = pd.read_parquet(source)
    else:
        raise ValueError(f"Unsupported file format for {file_name}")

    # Optional post-processing, always in this order: select, explode, rename.
    if usecols:
        frame = frame[usecols]
    if explode_col:
        exploded = frame.explode(explode_col)
        frame = exploded.reset_index(drop=True)
    if rename_cols:
        frame = frame.rename(columns=rename_cols)

    return frame

def preprocess_df(df, target_prefix='', input_prefix=''):
    replacements = [
        (" \.", "."), 
        (" ,", ","),
        (" '", "'"),
        (" \?", "?"),
        (" !", "!"),
        (" :", ":"),
        (" ;", ";"),
        (" n't", " not"),
        (" v", " have"),
        ("2 0 0 6", "2006"),
        ("5 5", "55"),
        ("4 0 0", "400"),
        ("1 7-5 0", "1750"),
        ("2 0 %", "20%"),
        ("5 0", "50"),
        ("1 2", "12"),
        ("1 0", "10"),
        (" 'll", " will"),
        (" 're", " are"),
        (" 's", " is"),
        (" 've", " have"),
        (" 'd", " would"),
        (" 'm", " am"),
        (" 'em", " them"),
        (" 'clock", "'clock"),
        (" 't", " not"),
        (" 'cause", "'cause"),
        (" 'til", " until"),
        (" 'bout", " about"),
        (" 'round", " around"),
        ("\"", ""),
    ]
    df['input'] = df['input'].fillna('')
    df['target'] = df['target'].fillna(df['input'])
    
    for rep in replacements:
        df['input'] = df['input'].str.replace(rep[0], rep[1], regex=True)
        df['target'] = df['target'].str.replace(rep[0], rep[1], regex=True)

    df = df.map(lambda x: x.replace('"', '') if isinstance(x, str) else x)
    if target_prefix:
        df['target'] = target_prefix + df['target']
    if input_prefix:
        df['input'] = input_prefix + df['input']
    
    return df.sample(frac=1).reset_index(drop=True)

# Locations of the downloaded corpora and of the derived files.
base_raw_path = '../data/raw'
base_processed_path = '../data/processed'

# --- CoLA: merge the in-domain dev and train splits and save them as one file ---
cola_columns = ['sentence_source', 'target', 'target_notes', 'sentence']
cola_read_options = dict(usecols=['sentence', 'target'], sep='\t')
cola_test_df = read_dataset(base_raw_path, 'cola_public/in_domain_dev.tsv', cola_columns, **cola_read_options)
cola_train_df = read_dataset(base_raw_path, 'cola_public/in_domain_train.tsv', cola_columns, **cola_read_options)
cola_df = pd.concat([cola_test_df, cola_train_df], ignore_index=True)
cola_output_path = os.path.join(base_processed_path, 'test/cola.csv')
cola_df.to_csv(cola_output_path, index=False)

# --- JFLEG: one row per (sentence, correction) pair; the validation split is used for training ---
jfleg_read_options = dict(
    explode_col='corrections',
    rename_cols={'corrections': 'target', 'sentence': 'input'},
)
jfleg_train_df = read_dataset(base_raw_path, 'jfleg/validation-00000-of-00001.parquet', None, **jfleg_read_options)
jfleg_test_df = read_dataset(base_raw_path, 'jfleg/test-00000-of-00001.parquet', None, **jfleg_read_options)

# --- Lang-8: keep the source sentence and only the first correction column ---
lang8_columns = [
    'num_corrections',
    'serial_number',
    'url',
    'sentence_number',
    'sentence',
] + [f'corrections_{k}' for k in range(8)]
lang8_read_options = dict(
    usecols=['sentence', 'corrections_0'],
    rename_cols={'corrections_0': 'target', 'sentence': 'input'},
)
lang8_train_df = read_dataset(base_raw_path, 'lang-8-en-1.0/train.csv', lang8_columns, **lang8_read_options)
lang8_test_df = read_dataset(base_raw_path, 'lang-8-en-1.0/test.csv', lang8_columns, **lang8_read_options)

# --- Stack JFLEG on top of Lang-8, then clean and prefix both splits ---
data_train_df = pd.concat([jfleg_train_df, lang8_train_df], ignore_index=True)
data_test_df = pd.concat([jfleg_test_df, lang8_test_df], ignore_index=True)

prefix_options = dict(target_prefix='correct: ', input_prefix='grammar: ')
train_df = preprocess_df(data_train_df, **prefix_options)
test_df = preprocess_df(data_test_df, **prefix_options)

def extract_first_sentence(text):
    """Return ``text`` cut off after its first ``.``, ``!`` or ``?``.

    The match is anchored at the start of the string and never crosses a
    newline, so only a terminator on the first line counts.

    Args:
        text: The cell value to shorten.

    Returns:
        The stripped prefix of ``text`` up to and including the first
        sentence terminator. If no terminator is found, ``text`` is returned
        unchanged (not stripped). Non-string values (e.g. NaN or None from a
        DataFrame column) are also returned unchanged instead of raising.
    """
    # ``re.search`` raises TypeError on non-strings; pass missing values through.
    if not isinstance(text, str):
        return text

    # Lazily consume characters from the start up to the first terminator.
    match = re.search(r'^.*?[.!?]', text)
    if match is None:
        return text

    return match.group(0).strip()

# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Keep only the first sentence of each training example (input first, then target).
for text_column in ('input', 'target'):
    train_df[text_column] = train_df[text_column].progress_apply(extract_first_sentence)

# Persist both splits.
for split_frame, split_path in ((train_df, '../data/raw/train.csv'), (test_df, '../data/raw/test.csv')):
    split_frame.to_csv(split_path, index=False)

# `train_df` and `test_df` are now ready for further use; show the training frame.
train_df

  df = pd.read_csv(file_path, header=None, names=columns, sep=sep)
100%|██████████| 1003020/1003020 [00:00<00:00, 1053080.37it/s]
100%|██████████| 1003020/1003020 [00:00<00:00, 1017686.58it/s]


Unnamed: 0,input,target
0,grammar: But I found out that her parents got ...,correct: But I found out that her parents got ...
1,grammar: And you?,correct: And you?
2,grammar:,correct:
3,grammar: A haveiolence by the Yokoduna is a fi...,correct: This haveiolence by the Yokozuna is a...
4,grammar: My university started last monday and...,"correct: My university, as well as some clubs,..."
...,...,...
1003015,grammar: I will never fotgot that moment.,correct: I will never forget that moment.
1003016,grammar: Work so well in the learning of Japan...,correct: Work so well in the learning of Japan...
1003017,grammar: My favorite foreign drama program,correct: My favorite foreign drama program
1003018,grammar: Hi!,correct: Hi!


In [1]:
import pandas as pd
import numpy as np
train_df = pd.read_csv('../data/processed/train/tokenized_train.csv')

# Number of shard files to write.
n = 40

# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning. The chunk sizes are the same
# (the first len % n shards get one extra row).
row_chunks = np.array_split(np.arange(len(train_df)), n)
split_dfs = [train_df.iloc[rows] for rows in row_chunks]

# Write each shard to its own CSV, numbered from 0.
for i, df in enumerate(split_dfs):
    df.to_csv(f'../data/processed/train_{i}.csv', index=False)

  return bound(*args, **kwds)


In [2]:
# Show the training frame currently bound to `train_df` (pandas renders a truncated preview).
train_df

Unnamed: 0,input,target
0,grammar: But I found out that her parents got ...,correct: But I found out that her parents got ...
1,grammar: And you?,correct: And you?
2,grammar:,correct:
3,grammar: A haveiolence by the Yokoduna is a fi...,correct: This haveiolence by the Yokozuna is a...
4,grammar: My university started last monday and...,"correct: My university, as well as some clubs,..."
...,...,...
1003015,grammar: I will never fotgot that moment.,correct: I will never forget that moment.
1003016,grammar: Work so well in the learning of Japan...,correct: Work so well in the learning of Japan...
1003017,grammar: My favorite foreign drama program,correct: My favorite foreign drama program
1003018,grammar: Hi!,correct: Hi!
