## 1. Data extraction

Extract the text contained in the JSON files and store it in a DataFrame for further pre-processing.

In [6]:
from tqdm.notebook import tqdm
tqdm.pandas()
import spacy, en_core_web_sm
import numpy as np
import pandas as pd
import json

In [4]:
# Inspect the schema of the raw data: for each JSON file, report the distinct
# keys used by its documents and the number of documents it contains.
for id_file in tqdm(range(1,14)):
    file_path = "../../Data/Original_data/" + str(id_file) + ".json"

    # JSON is UTF-8 by specification; do not rely on the platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # The file is fully parsed at this point, so it is closed before the analysis.
    keys = set()
    for doc in data:
        keys.update(doc)  # iterating over a dict yields its keys

    print("File {} - Keys: {} - Documents: {}".format(id_file, keys, len(data)))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

File 1 - Keys: {'uri', 'text'} - Documents: 11699
File 2 - Keys: {'uri', 'text'} - Documents: 32072
File 3 - Keys: {'uri', 'text'} - Documents: 8225
File 4 - Keys: {'uri', 'text'} - Documents: 77258
File 5 - Keys: {'uri', 'text'} - Documents: 46079
File 6 - Keys: {'uri', 'text'} - Documents: 28106
File 7 - Keys: {'uri', 'text'} - Documents: 27391
File 8 - Keys: {'uri', 'text'} - Documents: 24143
File 9 - Keys: {'uri', 'text'} - Documents: 22223
File 10 - Keys: {'uri', 'text'} - Documents: 20979
File 11 - Keys: {'uri', 'text'} - Documents: 57160
File 12 - Keys: {'uri', 'text'} - Documents: 85900
File 13 - Keys: {'uri', 'text'} - Documents: 793



In [17]:
# Collect one record per document (its text and its length in characters)
# and build the dataframe once at the end.
rows = []
for id_file in tqdm(range(1, 14)):
    file_path = "../../Data/Original_data/" + str(id_file) + ".json"

    # JSON is UTF-8 by specification; do not rely on the platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)  # data is a list of dict of the form: {'text':['...'], 'uri':['...']}

    print("Extracting text from {} documents in file '{}.json'...".format(len(data), id_file))
    for doc in data:
        text = doc.get('text')  # Get the text of the current doc
        # Skip documents whose 'text' field is missing or empty:
        # indexing text[0] on an empty list would raise IndexError.
        if not text:
            continue
        content = text[0]
        rows.append({'Text': content, 'Length': len(content)})

print("Creating dataframe...")
# Explicit columns keep 'Text' and 'Length' available even if no document was extracted.
df = pd.DataFrame(rows, columns=['Text', 'Length'])
print("Done !")

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

Extracting text from 11699 documents in file '1.json'...
Extracting text from 32072 documents in file '2.json'...
Extracting text from 8225 documents in file '3.json'...
Extracting text from 77258 documents in file '4.json'...
Extracting text from 46079 documents in file '5.json'...
Extracting text from 28106 documents in file '6.json'...
Extracting text from 27391 documents in file '7.json'...
Extracting text from 24143 documents in file '8.json'...
Extracting text from 22223 documents in file '9.json'...
Extracting text from 20979 documents in file '10.json'...
Extracting text from 57160 documents in file '11.json'...
Extracting text from 85900 documents in file '12.json'...
Extracting text from 793 documents in file '13.json'...

Creating dataframe...
Done !


In [18]:
# Quick sanity figures on the extracted corpus.
longest_doc = df["Length"].max()
n_docs = df.shape[0]
print("Max number of characters in a doc: {}".format(longest_doc))
print("Total number of docs: {}".format(n_docs))

Max number of characters in a doc: 2621440
Total number of docs: 442019


## 2. Data Processing

To use `create_pretraining_data.py` from the BERT repository, the input must be a plain-text file with one sentence per line and one blank line between documents:

  * One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task).
  * Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
  
The BERT authors advise performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

### 2.1. Corpus Cleaning

In [19]:
# Work on a small sample while developing the pipeline.
# NOTE: every later step only sees these rows, so the generated output file
# contains N_DOCS documents. Set N_DOCS to None to process the full corpus.
N_DOCS = 5
if N_DOCS is not None:
    # .copy() yields an independent frame, so later column assignments do not
    # write into a slice of the full frame (SettingWithCopyWarning).
    df = df.head(N_DOCS).copy()
df

Unnamed: 0,Text,Length
0,"Americas Headquarters: Cisco Systems, Inc., 17...",165130
1,For the latest version of the Cisco Small Busi...,206
2,"WebEx Meeting Center User Guide For Hosts, Pr...",536673
3,78-4019959-01 Rev D Prisma II 1550 nm SuperQA...,21188
4,Release Notes for Cisco RV130/RV130W Firmware ...,2861


In [20]:
print("Cleaning corpus of text...")
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
whitespace = r'\s+'
df['Text'] = (
    df['Text']
    # Collapse every whitespace run (including Unicode spaces) into one space
    .str.replace(whitespace, ' ', regex=True)
    # Drop non-ASCII characters such as \uf0a7; the bytes are pure ASCII, so decode them as ASCII
    .str.encode('ascii', 'ignore').str.decode('ascii')
    # Dropping a character that stood between two spaces leaves a double space: collapse again
    .str.replace(whitespace, ' ', regex=True)
    # Lower case all strings (I have noticed a better segmentation by doing that)
    .str.lower()
)
print("Done !")

Cleaning corpus of text...
Done !


### 2.2. Sentence segmentation

In [21]:
# Cache of loaded spaCy pipelines, so the model is loaded once per kernel
# instead of once per document.
_SPACY_PIPELINES = {}


def sent_segmentation(doc_text):
    """
    Given a string, segment it by sentences.

    Parameters
    ----------
    doc_text : str
        Full text of one document.

    Returns
    -------
    list of str
        The text of each sentence found by the spaCy pipeline, in document order.
    """
    # Loading the model is expensive: reuse the same pipeline across calls.
    nlp = _SPACY_PIPELINES.get('en_core_web_sm')
    if nlp is None:
        nlp = en_core_web_sm.load()
        _SPACY_PIPELINES['en_core_web_sm'] = nlp

    # spaCy rejects texts longer than nlp.max_length. Keep the original limit
    # (the largest known document has 2621440 characters) but raise it when a
    # longer document shows up instead of failing.
    nlp.max_length = max(2621500, len(doc_text) + 1)

    doc = nlp(doc_text)
    return [sent.text for sent in doc.sents]

### 2.3. Sentence cleaning

In [22]:
def sent_cleaning(list_sent):
    """
    Clean a list of sentences and drop those unlikely to be real sentences.

    For each sentence, in order:
      1. if its first whitespace-delimited token is a number, drop that token;
      2. if it then starts with a special character, drop its first token;
      3. discard it if any single special character occurs 15 times or more;
      4. discard it unless it has more than 2 words.

    Empty or whitespace-only sentences are discarded (they have no words).

    Parameters
    ----------
    list_sent : list of str
        Sentences of one document.

    Returns
    -------
    list of str
        The cleaned sentences that passed all filters, in their original order.
    """
    spec_char = set(',?;.:/=+%`¨*$€-_())°!§\'\"&@#~®†ºπ‡¬≈©◊~∞µ…÷≠<>^')
    max_char_repeat = 15  # a sentence is rejected when one special char reaches this count
    min_words = 3         # "more than 2 words"

    cleaned = []
    for sent in list_sent:
        # 1. Leading number (e.g. a list or page number): remove it.
        #    The length check comes first so an empty sentence cannot raise IndexError.
        tokens = sent.split(maxsplit=1)
        if len(tokens) > 1 and tokens[0].isdigit():
            sent = tokens[1]

        # 2. Leading special character (e.g. a bullet "-"): remove the first token.
        #    NOTE(review): the whole first token is dropped, not only the character,
        #    so "(optional) foo" becomes "foo" -- confirm this is intended.
        tokens = sent.split(maxsplit=1)
        if len(tokens) > 1 and sent[0] in spec_char:
            sent = tokens[1]

        # 3. The limit applies to each special character separately, not to their total.
        if any(sent.count(char) >= max_char_repeat for char in spec_char):
            continue

        # 4. Keep only sentences with more than 2 words.
        if len(sent.split()) < min_words:
            continue

        cleaned.append(sent)
    return cleaned

### 2.4. Plain text conversion

In [23]:
def sent_convert(list_sent):
    """
    Merge the sentences of one document into a single block of text.

    Parameters
    ----------
    list_sent : list of str
        Sentences to merge.

    Returns
    -------
    str
        All sentences in their original order, one per line (no trailing newline).
    """
    line_break = "\n"
    merged = line_break.join(list_sent)
    return merged

### 2.5. Apply all cleaning functions 

In [24]:
# Run the three processing stages in order; each stage replaces the 'Text'
# column with its own output (str -> list of sentences -> cleaned list -> str).
processing_steps = (
    ("Segmenting sentences...", sent_segmentation),
    ("Cleaning sentences...", sent_cleaning),
    ("Concatenating all sentences...", sent_convert),
)
for message, step in processing_steps:
    print(message)
    df['Text'] = df['Text'].progress_apply(step)

Segmenting sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Cleaning sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Concatenating all sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




### 2.6. Concatenate all documents

In [25]:
# Documents are separated by one blank line, as expected by BERT's
# create_pretraining_data.py. Documents left empty by the sentence filters are
# skipped so they do not add redundant blank separators.
final_text = "\n\n".join(doc for doc in df["Text"] if doc)
# "w" is enough (the file is never read back here); the encoding is explicit so
# the result does not depend on the platform default.
with open("../../Data/output.txt", "w", encoding="utf-8") as f:
    f.write(final_text)
print("DONE !")

DONE !
