## 1. Data extraction

Extract the text contained in the JSON files and store it in a DataFrame for further pre-processing.

In [3]:
%%capture
from tqdm.notebook import tqdm
import json
import cudf
import numpy as np

In [4]:
# Inventory the field names used by the documents of each JSON file
for id_file in tqdm(range(1, 14)):
    json_path = "./../Data/Original_data/" + str(id_file) + ".json"

    with open(json_path) as handle:
        data = json.load(handle)

        # Union of the keys found across all documents of this file
        seen_keys = {field for record in data for field in record.keys()}
        print("File {} - Keys: {} - Documents: {}".format(id_file, seen_keys, len(data)))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

File 1 - Keys: {'text', 'uri'} - Documents: 11699
File 2 - Keys: {'text', 'uri'} - Documents: 32072
File 3 - Keys: {'text', 'uri'} - Documents: 8225
File 4 - Keys: {'text', 'uri'} - Documents: 77258
File 5 - Keys: {'text', 'uri'} - Documents: 46079
File 6 - Keys: {'text', 'uri'} - Documents: 28106
File 7 - Keys: {'text', 'uri'} - Documents: 27391
File 8 - Keys: {'text', 'uri'} - Documents: 24143
File 9 - Keys: {'text', 'uri'} - Documents: 22223
File 10 - Keys: {'text', 'uri'} - Documents: 20979
File 11 - Keys: {'text', 'uri'} - Documents: 57160
File 12 - Keys: {'text', 'uri'} - Documents: 85900
File 13 - Keys: {'text', 'uri'} - Documents: 793



In [7]:
# Collect the first 'text' entry of every document across the JSON files.
rows = []
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    print("Loading file {}...".format(id_file))
    with open(file_path) as f:
        data = json.load(f)  # data is a list of dict of the form: {'text':['...'], 'uri':['...']}
    # The file handle is closed here; only the parsed content is needed below.

    print("Extracting text from {} documents in file {}...".format(len(data), id_file))
    for doc in data:
        text = doc.get('text')  # Get the text of the current doc
        # Skip documents whose 'text' is missing or empty (text[0] would raise IndexError)
        if text:
            rows.append({'Text': text[0]})

    # Release the parsed file before the next json.load so that two parsed files
    # are never alive at the same time (this cell previously hit a MemoryError).
    del data

# Create dataframe
print("Creating cudf dataframe...")
df = cudf.DataFrame(rows)  #df = pd.DataFrame(rows)
df.head()

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

Loading file 1...


MemoryError: 

## 2. Data Processing

To use `create_pretraining_data.py` from the BERT repository, the input must be a plain text file with one sentence per line and one blank line between documents:

  * One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task).
  * Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
  
The BERT authors advise performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

In [69]:
# Sample df for testing
# Take an independent copy: the cleaning steps assign into the sample, and writing
# into a slice of `df` is ambiguous (pandas raises SettingWithCopyWarning).
sample_df = df.head().copy()
# Select the column by name; position 0 is only the text when 'Text' is the first column
sample_df['Text'].iloc[0]

Unnamed: 0,Text
0,"Americas Headquarters: Cisco Systems, Inc., 17..."
1,For the latest version of the Cisco Small Busi...
2,"WebEx Meeting Center User Guide For Hosts, Pr..."
3,78-4019959-01 Rev D Prisma II 1550 nm SuperQA...
4,Release Notes for Cisco RV130/RV130W Firmware ...


### 2.1. Corpus Cleaning

In [None]:
# Collapse every run of whitespace into a single space.
# Series.replace without regex=True only swaps cells whose whole value equals the
# pattern text, so the regex-aware .str.replace is required here.
sample_df['Text'] = sample_df['Text'].str.replace(r'\s+', ' ', regex=True)
# Round-trip through ASCII with errors='ignore' to drop non-ASCII characters such as U+F0A7
sample_df['Text'] = sample_df['Text'].str.encode('ascii', 'ignore').str.decode('utf-8')

### 2.2. Sentence segmentation

In [63]:
# Load spaCy's small English pipeline once; `nlp` is used by sent_segmentation
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

def sent_segmentation(doc_text):
    """
    Split a document into sentences using the module-level spaCy pipeline `nlp`.

    Args:
        doc_text: the document text, as a single string.

    Returns:
        A list holding the text of each sentence, in the order spaCy yields them.
    """
    return [span.text for span in nlp(doc_text).sents]

### 2.3. Sentence cleaning

In [66]:
def sent_cleaning(list_sent):
    """
    Clean and filter the sentences of one document.

    Each sentence goes through three steps, in order:
      1. if its first whitespace-separated token is all digits, that token is removed;
      2. it is dropped if any single special character occurs 15 times or more;
      3. it is dropped if it has 2 words or fewer.

    Args:
        list_sent: list of sentence strings.

    Returns:
        A new list with the cleaned sentences that passed both filters,
        in their original order.
    """
    spec_char = set(',?;.:/=+%`¨^*$€-_())°!§\'\"&@#~®†ºπ‡¬≈©◊~∞µ…÷≠')

    cleaned = []
    for sent in list_sent:
        # If line begins with a number, remove the number.
        # Empty/whitespace-only sentences yield no token, and a number-only sentence
        # has no remainder: handle both instead of raising IndexError.
        tokens = sent.split(maxsplit=1)
        if tokens and tokens[0].isdigit():
            sent = tokens[1] if len(tokens) > 1 else ''

        # Drop the line when one special character is repeated 15 times or more
        # (the count is per character, not the total over all special characters)
        if max(sent.count(c) for c in spec_char) >= 15:
            continue

        # Drop lines with 2 words or fewer
        if len(sent.split()) <= 2:
            continue

        cleaned.append(sent)

    return cleaned

### 2.4. Plain text conversion

In [60]:
def sent_convert(list_sent):
    """
    Collapse the sentences of a document into one newline-delimited string.

    Args:
        list_sent: list of sentence strings.

    Returns:
        A single string with one sentence per line and no trailing newline.
    """
    newline = "\n"
    return newline.join(list_sent)

### 2.5. Apply functions 

In [None]:
# `applymap` is a DataFrame method in pandas, and cudf UDFs cannot call spaCy or other
# arbitrary Python string code, so run the helpers element-wise on a host (pandas)
# Series with `.apply`.
on_gpu = hasattr(df['Text'], 'to_pandas')  # cudf Series expose to_pandas(); pandas Series do not
text_host = df['Text'].to_pandas() if on_gpu else df['Text']

text_host = text_host.apply(sent_segmentation)  # str -> list of sentences
text_host = text_host.apply(sent_cleaning)      # list -> filtered list
text_host = text_host.apply(sent_convert)       # list -> newline-joined str

# Move the result back to the GPU only when `df` is a cudf DataFrame
df['Text'] = cudf.from_pandas(text_host) if on_gpu else text_host
df.head()