## 1. Data extraction

Extract the text contained in the JSON files and store it in a pandas DataFrame for further pre-processing.

In [5]:
from tqdm.notebook import tqdm, tqdm_notebook
tqdm_notebook.pandas()
import spacy, en_core_web_sm
import numpy as np
import pandas as pd
import json

In [8]:
# Inspect every input file: report which keys its documents use and how many
# documents it holds.
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)

    # Gather the distinct keys seen across all documents of this file
    myset = set()
    for doc in data:
        myset.update(doc.keys())

    print("File {} - Keys: {} - Documents: {}".format(id_file, myset, len(data)))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

<_io.TextIOWrapper name='./../Data/Original_data/1.json' mode='r' encoding='utf-8'>
File 1 - Keys: {'text', 'uri'} - Documents: 11699
<_io.TextIOWrapper name='./../Data/Original_data/2.json' mode='r' encoding='utf-8'>
File 2 - Keys: {'text', 'uri'} - Documents: 32072
<_io.TextIOWrapper name='./../Data/Original_data/3.json' mode='r' encoding='utf-8'>
File 3 - Keys: {'text', 'uri'} - Documents: 8225
<_io.TextIOWrapper name='./../Data/Original_data/4.json' mode='r' encoding='utf-8'>
File 4 - Keys: {'text', 'uri'} - Documents: 77258
<_io.TextIOWrapper name='./../Data/Original_data/5.json' mode='r' encoding='utf-8'>
File 5 - Keys: {'text', 'uri'} - Documents: 46079
<_io.TextIOWrapper name='./../Data/Original_data/6.json' mode='r' encoding='utf-8'>
File 6 - Keys: {'text', 'uri'} - Documents: 28106
<_io.TextIOWrapper name='./../Data/Original_data/7.json' mode='r' encoding='utf-8'>
File 7 - Keys: {'text', 'uri'} - Documents: 27391
<_io.TextIOWrapper name='./../Data/Original_data/8.json' mode='

In [10]:
# Collect one record per document: its text and its length in characters.
rows = []
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    with open(file_path, encoding='utf-8') as f:
        print("Loading file {}...".format(id_file))
        data = json.load(f)  # data is a list of dict of the form: {'text':['...'], 'uri':['...']}

    # The message now has a placeholder for the file id (it was passed to
    # format() but silently dropped before).
    print("Extracting text from {} documents in file {}...".format(len(data), id_file))
    for doc in data:
        text = doc.get('text')  # Get the text of the current doc
        # Skip documents with no 'text' key as well as those whose 'text'
        # entry is empty, for which text[0] would raise an IndexError.
        if not text:
            continue
        rows.append({'Text': text[0], 'Length': len(text[0])})

print("Creating dataframe...")
# Naming the columns keeps 'Text' and 'Length' available even if no row was collected.
df = pd.DataFrame(rows, columns=['Text', 'Length'])
print("Done !")

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

Loading file 1...
Extracting text from 11699 documents in file...
Loading file 2...
Extracting text from 32072 documents in file...
Loading file 3...
Extracting text from 8225 documents in file...
Loading file 4...
Extracting text from 77258 documents in file...
Loading file 5...
Extracting text from 46079 documents in file...
Loading file 6...
Extracting text from 28106 documents in file...
Loading file 7...
Extracting text from 27391 documents in file...
Loading file 8...
Extracting text from 24143 documents in file...
Loading file 9...
Extracting text from 22223 documents in file...
Loading file 10...
Extracting text from 20979 documents in file...
Loading file 11...
Extracting text from 57160 documents in file...
Loading file 12...
Extracting text from 85900 documents in file...
Loading file 13...
Extracting text from 793 documents in file...

Creating dataframe...
Done !


In [11]:
# Quick sanity figures on the extracted corpus
longest_doc_chars = df["Length"].max()
doc_count = df.shape[0]
print("Max number of characters in a doc: {}".format(longest_doc_chars))
print("Total number of docs: {}".format(doc_count))

Max number of characters in a doc: 2621440
Total number of docs: 442019


## 2. Data Processing

To use "create_pretraining_data.py" from the BERT repository, the input must be a plain text file with one sentence per line and one blank line between documents:

  * One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task).
  * Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
  
The BERT authors advise performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

### 2.1. Corpus Cleaning

In [12]:
print("Cleaning corpus of text...")
# Collapse every run of whitespace (spaces, tabs, newlines, ...) into a single space.
# The pattern is a raw string so that "\s" is not treated as an (invalid) string escape.
df['Text'] = df['Text'].replace(r'\s+', ' ', regex=True)
# Drop every non-ASCII character (e.g. \uf0a7) by encoding to ASCII and ignoring errors.
df['Text'] = df['Text'].str.encode('ascii', 'ignore').str.decode('ascii')
# Removing a character that stood between two spaces leaves a double space behind,
# so collapse repeated spaces once more.
df['Text'] = df['Text'].replace(r' {2,}', ' ', regex=True)
# Lower case all strings (a better segmentation was observed by doing that)
df['Text'] = df['Text'].str.lower()
print("Done !")

Cleaning corpus of text...
Done !


### 2.2. Sentence segmentation

In [13]:
# spaCy pipeline shared by all calls to sent_segmentation; created on first use.
_NLP = None

# Initial limit on the number of characters per document
# (the largest document observed in the corpus has 2621440 characters).
_NLP_BASE_MAX_LENGTH = 2621500


def _get_nlp():
    """Return the shared spaCy pipeline, loading the model only on the first call."""
    global _NLP
    if _NLP is None:
        _NLP = en_core_web_sm.load()
        _NLP.max_length = _NLP_BASE_MAX_LENGTH
    return _NLP


def sent_segmentation(doc_text):
    """
    Given a string, segment it by sentences.

    The spaCy model is loaded once and reused across calls instead of being
    reloaded for every document, which matters when this function is applied
    to each row of a large dataframe.

    Args:
        doc_text (str): the text of one document.

    Returns:
        list of str: the text of each sentence found by spaCy, in order.
    """
    nlp = _get_nlp()
    # Raise the length limit when a document is longer than the current one,
    # rather than failing on it.
    if len(doc_text) > nlp.max_length:
        nlp.max_length = len(doc_text)
    doc = nlp(doc_text)
    return [sent.text for sent in doc.sents]

### 2.3. Sentence cleaning

In [14]:
def sent_cleaning(list_sent):
    """
    Clean a list of sentences and drop the ones that look like noise.

    Each sentence goes through the following steps, in order:
      1. If its first whitespace-separated token is a number and more text
         follows, that token is removed.
      2. If it starts with a special character and has more than one token,
         its whole first token is removed.
      3. It is discarded if any single special character occurs 15 times or more.
      4. It is discarded unless it has more than 2 words.

    Empty or whitespace-only sentences are discarded (step 4) instead of
    raising an IndexError.

    Args:
        list_sent (list of str): sentences of one document.

    Returns:
        list of str: the cleaned sentences that were kept, in their original order.
    """
    spec_char = set(',?;.:/=+%`¨^*$€-_())°!§\'\"&@#~®†ºπ‡¬≈©◊~∞µ…÷≠<>')

    cleaned = []
    for sent in list_sent:
        # If line begins with a number, remove the number.
        # The length is checked first so an empty split result is never indexed.
        parts = sent.split(maxsplit=1)
        if len(parts) > 1 and parts[0].isdigit():
            sent = parts[1]

        # If line begins with a special char, remove the token it belongs to
        parts = sent.split(maxsplit=1)
        if len(parts) > 1 and sent[0] in spec_char:
            sent = parts[1]

        # Skip sentences in which one special character appears 15 times or more
        if max(sent.count(c) for c in spec_char) >= 15:
            continue

        # Keep only sentences with more than 2 words
        if len(sent.split()) <= 2:
            continue

        cleaned.append(sent)
    return cleaned

### 2.4. Plain text conversion

In [15]:
def sent_convert(list_sent):
    """
    Merge a list of sentences into a single string.

    Args:
        list_sent (list of str): the sentences to merge.

    Returns:
        str: all sentences in their original order, one per line
        (a newline between consecutive sentences, none at the end).
    """
    return str.join("\n", list_sent)

### 2.5. Apply all cleaning functions 

In [None]:
# Each stage replaces the 'Text' column with the result of applying its function
# to every row, announcing itself first.
processing_stages = (
    ("Segmenting sentences...", sent_segmentation),
    ("Cleaning sentences...", sent_cleaning),
    ("Concatenating all sentences...", sent_convert),
)
for stage_message, stage_function in processing_stages:
    print(stage_message)
    df['Text'] = df['Text'].progress_apply(stage_function)

Segmenting sentences...


HBox(children=(FloatProgress(value=0.0, max=442019.0), HTML(value='')))

### 2.6. Concatenate all documents

In [None]:
# Documents left empty by the cleaning step are skipped: joining them would only
# add runs of extra blank lines between the real documents.
documents = [doc_text for doc_text in df["Text"] if doc_text]
# One blank line between documents
final_text = "\n\n".join(documents)
# Explicit encoding so the output does not depend on the platform default;
# plain write mode is enough since the file is never read back here.
with open("../Data/output.txt", "w", encoding="utf-8") as f:
    f.write(final_text)
print("DONE !")