## 1. Data extraction

Extract the text contained in the JSON files and store it in a DataFrame for further pre-processing.

In [67]:
#%%capture
from tqdm.notebook import tqdm_notebook, tqdm
tqdm_notebook.pandas()
import numpy as np
import pandas as pd
#import cudf
import json

In [68]:
# Sanity check: list the distinct keys found in each JSON file and how many
# documents it holds.
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # Accumulate keys straight into a set: memory stays proportional to the
    # number of *distinct* keys instead of (documents x keys per document).
    keys = set()
    for doc in data:
        keys.update(doc.keys())
    n_docs = len(data)

    # Release the parsed file before the next one is loaded; otherwise two
    # complete files are held in memory at the same time during json.load().
    del data

    print("File {} - Keys: {} - Documents: {}".format(id_file, keys, n_docs))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

MemoryError: 

In [None]:
# Collect the first 'text' entry of every document, across all JSON files.
rows = []
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    print("Loading file {}...".format(id_file))
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)  # data is a list of dict of the form: {'text':['...'], 'uri':['...']}

    print("Extracting text from {} documents in file {}...".format(len(data), id_file))
    for doc in data:
        text = doc.get('text')  # Get the text of the current doc
        # Skip documents whose 'text' is missing, None or empty: indexing an
        # empty value with [0] would raise IndexError.
        if text:
            rows.append({'Text': text[0]})

    # Release the parsed file before the next one is loaded; `rows` keeps
    # references only to the extracted strings.
    del data

# Create dataframe
print("Creating dataframe...")
# Naming the column explicitly guarantees that 'Text' exists even if no row
# was extracted, so the following cells fail less obscurely.
df = pd.DataFrame(rows, columns=['Text'])
print("Done !")

## 2. Data Processing

In order to use "create_pretraining_data.py" from the BERT repository, the input must be a plain text file, with one sentence per line and one blank line between documents:

  * One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task).
  * Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
  
The BERT authors advise performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

### 2.1. Corpus Cleaning

In [None]:
# Normalise the raw text before sentence segmentation.
df["Text"] = (
    df["Text"]
    # Collapse every run of whitespace into one space. The pattern is a raw
    # string: '\s' in a normal string literal is an invalid escape sequence.
    .replace(r"\s+", " ", regex=True)
    # Drop every non-ASCII character (e.g. \uf0a7) by encoding with errors ignored.
    .str.encode("ascii", "ignore").str.decode("utf-8")
    # Dropping a character that stood between two spaces leaves a double
    # space behind, so collapse the spaces once more.
    .replace(r" {2,}", " ", regex=True)
    # Lower case all strings (I have noticed a better segmentation by doing that)
    .str.lower()
)

### 2.2. Sentence segmentation

In [62]:
import spacy, en_core_web_sm
# Load the en_core_web_sm pipeline once; sent_segmentation() below reuses this `nlp` object for every document.
nlp = en_core_web_sm.load()

def sent_segmentation(doc_text):
    """
    Split a document string into sentences.

    The text is processed by the module-level `nlp` pipeline and the text of
    each sentence it yields is returned, in order, as a list of strings.
    """
    parsed = nlp(doc_text)
    sentence_texts = []
    for span in parsed.sents:
        sentence_texts.append(span.text)
    return sentence_texts

### 2.3. Sentence cleaning

In [63]:
# Characters treated as "special" by sent_cleaning (built once, not on every call).
_SPEC_CHARS = frozenset(',?;.:/=+%`¨^*$€-_())°!§\'\"&@#~®†ºπ‡¬≈©◊~∞µ…÷≠<>')


def sent_cleaning(list_sent, max_char_repeats=15, min_words=3):
    """
    Clean a list of sentences and discard the ones that look like noise.

    Each sentence goes through the following steps, in order:
      1. If its first whitespace-delimited token consists only of digits,
         that token is removed.
      2. If the sentence then starts with a special character, its first
         whitespace-delimited token is removed.
      3. The sentence is dropped if any single special character occurs
         `max_char_repeats` times or more.
      4. The sentence is dropped if it has fewer than `min_words` words.

    Steps 1 and 2 are applied only when some text remains after the removed
    token. Empty or whitespace-only sentences are discarded (step 4) instead
    of raising IndexError.

    Parameters
    ----------
    list_sent : list of str
        Sentences of one document.
    max_char_repeats : int, default 15
        Number of occurrences of one special character from which a sentence
        is rejected.
    min_words : int, default 3
        Minimum number of whitespace-delimited words a sentence must have.

    Returns
    -------
    list of str
        The cleaned sentences that passed both filters, in their original order.
    """
    cleaned = []
    for sent in list_sent:
        # Split once; `parts` is [] for an empty or whitespace-only sentence,
        # so its length is checked before any element is accessed.
        parts = sent.split(maxsplit=1)

        # Step 1: drop a leading all-digit token.
        if len(parts) > 1 and parts[0].isdigit():
            sent = parts[1]
            parts = sent.split(maxsplit=1)

        # Step 2: drop the leading token when the sentence starts with a special char.
        # NOTE(review): this removes the whole first token (e.g. "-foo"), not only
        # the special character itself - confirm that this is the intent.
        if len(parts) > 1 and sent[0] in _SPEC_CHARS:
            sent = parts[1]

        # Step 3: reject sentences in which one special character is repeated too often.
        # NOTE(review): this is a per-character maximum, not the total number of
        # special characters in the sentence.
        if max(sent.count(char) for char in _SPEC_CHARS) >= max_char_repeats:
            continue

        # Step 4: reject sentences that are too short.
        if len(sent.split()) < min_words:
            continue

        cleaned.append(sent)

    return cleaned

### 2.4. Plain text conversion

In [64]:
def sent_convert(list_sent):
    """
    Merge a list of sentence strings into a single string, placing one
    newline character between consecutive sentences.
    """
    line_break = "\n"
    return str.join(line_break, list_sent)

### 2.5. Apply all cleaning functions 

In [65]:
# Run the three processing stages one after the other on the 'Text' column,
# announcing each stage and showing a progress bar while it runs.
pipeline_steps = [
    ("Segmenting sentences...", sent_segmentation),
    ("Cleaning sentences...", sent_cleaning),
    ("Concatenating all sentences...", sent_convert),
]
for message, step in pipeline_steps:
    print(message)
    df['Text'] = df['Text'].progress_apply(step)

Segmenting sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Cleaning sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Concatenating all sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




### 2.6. Concatenate all documents

In [66]:
# Join the documents with exactly one blank line between them. Documents left
# empty by the cleaning step are skipped: an empty entry would otherwise
# produce several consecutive blank lines in the output file.
final_text = "\n\n".join(doc_text for doc_text in df["Text"] if doc_text)

# Plain write mode is enough (the file is never read back here); the encoding
# is set explicitly so the output does not depend on the platform default.
with open("../Data/output.txt", "w", encoding="utf-8") as f:
    f.write(final_text)

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------