## 1. Data extraction

Extract the text contained in the JSON files and save it in a pandas DataFrame for further pre-processing.

In [12]:
import os
import json
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

import spacy, en_core_web_sm
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

  from pandas import Panel
[nltk_data] Downloading package punkt to /home/antoloui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Check the different keys in the json file
for id_file in tqdm(range(1,14)):
    file_path = "../../Data/Original/" + str(id_file) + ".json"

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # Collect the distinct keys used by the documents of this file.
    # This runs after the file is closed: the handle is only needed for parsing.
    myset = set()
    for doc in data:
        myset.update(doc.keys())

    print("File {} - Keys: {} - Documents: {}".format(id_file, myset, len(data)))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

File 1 - Keys: {'uri', 'text'} - Documents: 11699
File 2 - Keys: {'uri', 'text'} - Documents: 32072
File 3 - Keys: {'uri', 'text'} - Documents: 8225
File 4 - Keys: {'uri', 'text'} - Documents: 77258
File 5 - Keys: {'uri', 'text'} - Documents: 46079
File 6 - Keys: {'uri', 'text'} - Documents: 28106
File 7 - Keys: {'uri', 'text'} - Documents: 27391
File 8 - Keys: {'uri', 'text'} - Documents: 24143
File 9 - Keys: {'uri', 'text'} - Documents: 22223
File 10 - Keys: {'uri', 'text'} - Documents: 20979
File 11 - Keys: {'uri', 'text'} - Documents: 57160
File 12 - Keys: {'uri', 'text'} - Documents: 85900
File 13 - Keys: {'uri', 'text'} - Documents: 793



In [2]:
id_file = 1

rows = []
file_path = "../../Data/Original/" + str(id_file) + ".json"

# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)  # data is a list of dict of the form: {'text':['...'], 'uri':['...']}

print("Extracting text from {} documents in file '{}.json'...".format(len(data), id_file))
for doc in data:
    text = doc.get('text')  # Get the text of the current doc
    # Skip documents whose 'text' is missing *or empty*: indexing an empty
    # value with text[0] would raise IndexError.
    if text:
        row_dict = {'Text': text[0], 'Length': len(text[0])}
        rows.append(row_dict)

print("Creating dataframe...")
# Naming the columns explicitly keeps 'Text' and 'Length' available even
# when no row was extracted.
df = pd.DataFrame(rows, columns=['Text', 'Length'])
print("Done !")

Extracting text from 11699 documents in file '1.json'...
Creating dataframe...
Done !


In [3]:
# Summary figures for the extracted corpus.
longest_doc = df['Length'].max()
doc_count = df.shape[0]
print("Max number of characters in a doc: {}".format(longest_doc))
print("Total number of docs: {}".format(doc_count))

Max number of characters in a doc: 2621440
Total number of docs: 11692


## 2. Data Processing

In order to use "create_pretraining_data.py" from the BERT repository, the input must be a plain text file, with one sentence per line and one blank line between documents:

  * One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task).
  * Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
  
The BERT authors advise performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

In [4]:
# Keep only the first 5 rows (the default of DataFrame.head); every later
# cell that reads `df` therefore works on this small sample.
df = df.head(n=5)

### 2.1. Corpus Cleaning

In [5]:
print("Cleaning corpus of text...")
# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
# The pattern is a raw string: '\s' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
df['Text'] = df['Text'].replace(r'\s+', ' ', regex=True)
# Drop every non-ASCII character (such as \uf0a7) by encoding to ASCII with
# errors ignored, then decoding the bytes back to str.
df['Text'] = df['Text'].str.encode('ascii', 'ignore').str.decode('utf-8')
#df.Text = df.Text.str.lower()  # Lower case all strings
print("Done !")

Cleaning corpus of text...
Done !


### 2.2. Sentence segmentation

In [6]:
_SPACY_PIPELINE = None  # spaCy model, loaded on first use and then reused


def spacy_segmentation(doc_text):
    """
    Given a string, segment it by sentences (performed by Spacy).

    The spaCy model is loaded once and shared by all calls instead of being
    reloaded for every document.

    Args:
        doc_text (str): text of one document.

    Returns:
        list of str: the text of each sentence found by spaCy, in order.
    """
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = en_core_web_sm.load()
    nlp = _SPACY_PIPELINE

    # spaCy rejects texts longer than `max_length`. Keep the original floor
    # of 2621500 characters (the largest document seen has 2621440), and
    # raise it further when a longer document is passed in.
    nlp.max_length = max(2621500, len(doc_text))

    return [sent.text for sent in nlp(doc_text).sents]


def nltk_segmentation(doc_text):
    """
    Split a document into sentences with nltk's `sent_tokenize`.

    Args:
        doc_text (str): text of one document.

    Returns:
        The value returned by `sent_tokenize(doc_text)`.
    """
    sentences = sent_tokenize(doc_text)
    return sentences

### 2.3. Sentence cleaning

In [7]:
def sent_cleaning(list_sent):
    """
    Clean a list of sentences and drop the ones that are too short or too long.

    Each sentence is split on whitespace and then:
      1. tokens longer than 2 characters made only of special characters
         are removed;
      2. a leading all-digit token is removed (if other tokens follow);
      3. a leading single special character is removed (if other tokens
         follow);
      4. the sentence is kept only if it has more than 2 and fewer than
         200 tokens.

    Args:
        list_sent (list of str): sentences of one document.

    Returns:
        list of str: the kept sentences, tokens re-joined by single spaces.
    """
    special_chars = set(',?;.:/=+%`¨*$€-_())°!§\'\"&@#~®†ºπ‡¬≈©◊~∞µ…÷≠<>^')

    kept = []
    for sentence in list_sent:
        # 1. Remove sequences of special characters (short tokens are always kept).
        tokens = [
            tok for tok in sentence.split()
            if len(tok) <= 2 or any(ch not in special_chars for ch in tok)
        ]

        # 2. If the sentence begins with a number, remove the number.
        if len(tokens) > 1 and tokens[0].isdigit():
            tokens = tokens[1:]

        # 3. If the sentence begins with a single special char, remove that char.
        if len(tokens) > 1 and len(tokens[0]) == 1 and tokens[0] in special_chars:
            tokens = tokens[1:]

        # 4. Keep only sentences with more than 2 words and less than 200 words.
        if 2 < len(tokens) < 200:
            kept.append(' '.join(tokens))

    return kept

### 2.4. Plain text conversion

In [8]:
def sent_convert(list_sent):
    """
    Merge a list of sentence strings into a single string, putting each
    sentence on its own line (sentences are joined with a newline).
    """
    line_break = "\n"
    return line_break.join(list_sent)

### 2.5. Apply all cleaning functions 

In [9]:
# Each step replaces the 'Text' column with the result of applying one
# function to every document, in this order.
pipeline_steps = [
    ("Segmenting sentences...", nltk_segmentation),
    ("Cleaning sentences...", sent_cleaning),
    ("Concatenating all sentences...", sent_convert),
]
for message, step in pipeline_steps:
    print(message)
    df['Text'] = df['Text'].progress_apply(step)

Segmenting sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Cleaning sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Concatenating all sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




### 2.6. Concatenate all documents

In [38]:
# Skip documents left empty by the cleaning step, so that exactly one blank
# line separates two consecutive documents in the output.
documents = [doc for doc in df["Text"] if doc]
sentences = "\n\n".join(documents)
print("Saving sentences to output file (total of {} words)...".format(len(sentences.split())))
output_file = "../../Data/Preprocessed/output.txt"
# Write-only mode is sufficient ("w+" also opened the file for reading), and
# the encoding is stated explicitly instead of using the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    f.write(sentences)
print("Done !")

Saving sentences to output file (total of 102836 words)...
Done !


### 2.7. Check size of output files

In [39]:
directory = '../../Data/Preprocessed/'
# Print the size of every .txt file of the directory, in alphabetical order.
# (The former `files = os.listdir(directory).sort()` was dropped: list.sort()
# sorts in place and returns None, and the variable was never used.)
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        size_gb = os.path.getsize(file_path) / 1024**3  # bytes -> 1024^3-byte units
        print("Size of '{}': {:.2f} GB".format(filename, size_gb))

Size of 'all_text.txt': 16.16 GB
Size of 'output.txt': 0.00 GB
Size of 'text_1.txt': 1.84 GB
Size of 'text_10.txt': 0.92 GB
Size of 'text_11.txt': 2.14 GB
Size of 'text_12.txt': 3.05 GB
Size of 'text_13.txt': 0.00 GB
Size of 'text_2.txt': 0.13 GB
Size of 'text_3.txt': 0.36 GB
Size of 'text_4.txt': 2.95 GB
Size of 'text_5.txt': 1.13 GB
Size of 'text_6.txt': 0.96 GB
Size of 'text_7.txt': 0.87 GB
Size of 'text_8.txt': 1.09 GB
Size of 'text_9.txt': 0.73 GB
