## 1. Data extraction

Extract text contained in json files and save it in a dataframe for further pre-processing.

In [1]:
import os
import json
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

#import spacy, en_core_web_sm
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/antoloui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Check the different keys in the json file
# For each of the 13 source files, print the set of keys used by its documents
# and the number of documents it contains.
for id_file in range(1, 14):
    file_path = "/raid/antoloui/Master-thesis/Data/Original/" + str(id_file) + ".json"

    with open(file_path) as f:
        data = json.load(f)

    # Union of the keys of every document in this file
    myset = {key for doc in data for key in doc}
    # The format string has exactly three placeholders; the former fourth argument
    # (length/len(data)) was never printed, was always 0 and crashed on an empty file.
    print("File {} - Keys: {} - Documents: {}".format(id_file, myset, len(data)))

File 1 - Keys: {'uri', 'text'} - Documents: 11699 - Length: 0.0
File 2 - Keys: {'uri', 'text'} - Documents: 32072 - Length: 0.0
File 3 - Keys: {'uri', 'text'} - Documents: 8225 - Length: 0.0
File 4 - Keys: {'uri', 'text'} - Documents: 77258 - Length: 0.0
File 5 - Keys: {'uri', 'text'} - Documents: 46079 - Length: 0.0
File 6 - Keys: {'uri', 'text'} - Documents: 28106 - Length: 0.0
File 7 - Keys: {'uri', 'text'} - Documents: 27391 - Length: 0.0
File 8 - Keys: {'uri', 'text'} - Documents: 24143 - Length: 0.0
File 9 - Keys: {'uri', 'text'} - Documents: 22223 - Length: 0.0
File 10 - Keys: {'uri', 'text'} - Documents: 20979 - Length: 0.0
File 11 - Keys: {'uri', 'text'} - Documents: 57160 - Length: 0.0
File 12 - Keys: {'uri', 'text'} - Documents: 85900 - Length: 0.0
File 13 - Keys: {'uri', 'text'} - Documents: 793 - Length: 0.0


In [4]:
# Build one record per document of a single source file, then turn the records into a dataframe.
id_file = 1
file_path = "/raid/antoloui/Master-thesis/Data/Original/" + str(id_file) + ".json"
rows = []

with open(file_path) as f:
    # data is a list of dicts of the form: {'text': ['...'], 'uri': ['...']}
    data = json.load(f)

    print("Extracting text from {} documents in file '{}.json'...".format(len(data), id_file))
    for doc in data:
        fragments = doc.get('text')
        if fragments is None:
            continue  # Documents without a 'text' entry are skipped
        text = ' '.join(fragments)  # Flatten the list of strings into one string
        rows.append({'Text': text, 'Length': len(text)})

print("Creating dataframe...")
df = pd.DataFrame(rows)
print("Done !")

Extracting text from 11699 documents in file '1.json'...
Creating dataframe...
Done !


In [3]:
# Summary of the extracted documents: longest document (in characters) and document count
max_chars = df["Length"].max()
n_docs = len(df)
print("Max number of characters in a doc: {}".format(max_chars))
print("Total number of docs: {}".format(n_docs))

Max number of characters in a doc: 2621440
Total number of docs: 11692


## 2. Data Processing

In order to use "create_pretraining_data.py" from the BERT repository, the input must be a plain text file with one sentence per line and one blank line between documents:

  * One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task).
  * Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
  
The BERT repository advises performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

In [4]:
# Debug subsample: keep only the first five documents so the processing cells below run quickly.
# .copy() makes the subsample an independent frame, so the column assignments that follow
# never operate on a slice of the full dataframe.
df = df.head().copy()

### 2.1. Corpus Cleaning

In [5]:
print("Cleaning corpus of text...")
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# The pattern is a raw string: a backslash followed by "s" is an invalid escape
# sequence in a regular string literal.
df.Text = df.Text.replace(r'\s+', ' ', regex=True)
# Drop every non-ASCII character: encode to ASCII ignoring unencodable characters, then decode back.
df.Text = df.Text.str.encode('ascii', 'ignore').str.decode('utf-8')
#df.Text = df.Text.str.lower()  # Lower case all strings
print("Done !")

Cleaning corpus of text...
Done !


### 2.2. Sentence segmentation

In [6]:
def spacy_segmentation(doc_text):
    """
    Given a string, segment it by sentences (performed by Spacy).

    doc_text: the document as a single string.
    Returns a list with the text of each sentence found by the spaCy pipeline.

    The spaCy model is loaded on the first call only and then reused, since
    loading it again for every document is needlessly expensive.
    """
    # Imported here because the notebook does not import en_core_web_sm at the top level.
    import en_core_web_sm

    nlp = getattr(spacy_segmentation, "_nlp", None)
    if nlp is None:
        nlp = en_core_web_sm.load()
        nlp.max_length = 2621500  # because the largest document has a size of 2621440 chars
        spacy_segmentation._nlp = nlp  # Cache the loaded pipeline on the function itself

    doc = nlp(doc_text)
    return [sent.text for sent in doc.sents]


def nltk_segmentation(doc_text):
    """
    Segment a document into sentences with NLTK.

    doc_text: the document as a single string.
    Returns the sentences produced by `sent_tokenize` for that string.
    """
    sentences = sent_tokenize(doc_text)
    return sentences

### 2.3. Sentence cleaning

In [7]:
# Punctuation and symbol characters considered "special" by sent_cleaning.
_SPEC_CHARS = frozenset(',?;.:/=+%`¨*$€-_())°!§\'\"&@#~®†ºπ‡¬≈©◊~∞µ…÷≠<>^')


def _clean_words(sent):
    """Split `sent` on whitespace and return the words kept by the word-level cleaning rules."""
    # Drop words of 3+ characters made only of special characters (e.g. '-----', '.....')
    words = [w for w in sent.split()
             if len(w) <= 2 or not all(c in _SPEC_CHARS for c in w)]
    # If the sentence begins with a number followed by other words, remove the number
    if len(words) > 1 and words[0].isdigit():
        words = words[1:]
    # If the sentence then begins with a single special character, remove it
    # (only single characters can be members of _SPEC_CHARS)
    if len(words) > 1 and words[0] in _SPEC_CHARS:
        words = words[1:]
    return words


def sent_cleaning(list_sent, min_words=2, max_words=200):
    """
    Clean a list of sentences and drop the ones that are too short or too long.

    For each sentence:
      * words of more than 2 characters made only of special characters are removed;
      * a leading number is removed (when other words follow it);
      * a leading single special character is removed (when other words follow it);
      * remaining words are re-joined with single spaces.

    list_sent: iterable of sentence strings.
    min_words: exclusive lower bound on the word count of a kept sentence (default 2).
    max_words: exclusive upper bound on the word count of a kept sentence (default 200).

    Returns the list of cleaned sentences whose word count n satisfies
    min_words < n < max_words, in their original order.
    """
    cleaned = []
    for sent in list_sent:
        words = _clean_words(sent)  # Each sentence is split only once
        if min_words < len(words) < max_words:
            cleaned.append(' '.join(words))
    return cleaned

### 2.4. Plain text conversion

In [8]:
def sent_convert(list_sent):
    """
    Merge a list of sentence strings into a single string,
    putting each sentence on its own line.
    """
    line_break = "\n"
    merged = line_break.join(list_sent)
    return merged

### 2.5. Apply all cleaning functions 

In [9]:
# Processing steps applied in order to the 'Text' column: (progress message, function)
pipeline = [
    ("Segmenting sentences...", nltk_segmentation),
    ("Cleaning sentences...", sent_cleaning),
    ("Concatenating all sentences...", sent_convert),
]
for message, step in pipeline:
    print(message)
    df['Text'] = df['Text'].progress_apply(step)

Segmenting sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Cleaning sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Concatenating all sentences...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




### 2.6. Concatenate all documents

In [38]:
# Documents are separated by one blank line, as required by BERT's create_pretraining_data.py
sentences = "\n\n".join(df["Text"])
print("Saving sentences to output file (total of {} words)...".format(len(sentences.split())))
output_file = "../../Data/Preprocessed/output.txt"
# Write-only mode ("w+" also opened the file for reading, which is never used), with an
# explicit encoding so the written bytes do not depend on the platform's default locale.
with open(output_file, "w", encoding="utf-8") as f:
    f.write(sentences)
print("Done !")

Saving sentences to output file (total of 102836 words)...
Done !


### 2.7. Check size of output files

In [3]:
# Print the size of every .txt file of the preprocessed data directory, in alphabetical order.
directory = '../../Data/Preprocessed/'
# sorted() returns a new sorted list; list.sort() sorts in place and returns None,
# which previously left `files` set to None.
files = sorted(os.listdir(directory))
for filename in files:
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        # Size in bytes divided by 1024^3
        print("Size of '{}': {:.4f} GB".format(filename, os.path.getsize(file_path)/(1024*1024*1024)))

Size of 'all_text.txt': 16.1553 GB
Size of 'output.txt': 0.0006 GB
Size of 'text_1.txt': 1.8405 GB
Size of 'text_10.txt': 0.9190 GB
Size of 'text_11.txt': 2.1406 GB
Size of 'text_12.txt': 3.0498 GB
Size of 'text_13.txt': 0.0011 GB
Size of 'text_2.txt': 0.1328 GB
Size of 'text_3.txt': 0.3557 GB
Size of 'text_4.txt': 2.9462 GB
Size of 'text_5.txt': 1.1260 GB
Size of 'text_6.txt': 0.9585 GB
Size of 'text_7.txt': 0.8667 GB
Size of 'text_8.txt': 1.0881 GB
Size of 'text_9.txt': 0.7303 GB


In [27]:
# Print the 'text' field of every document of a cleaned file that stores one JSON object per line.
# Note: despite its name, data_dir is the path of a file, not of a directory.
data_dir = '/raid/antoloui/Master-thesis/Data/Cleaned/New_cleaning/split_cleaned_13.json'
with open(data_dir) as ifile:
    # Iterate over the file lazily instead of loading every line in memory with readlines()
    for doc in ifile:
        if not doc.strip():
            continue  # json.loads would raise on a blank line
        parsed = json.loads(doc)
        print(parsed['text'])
    

Home Skip to content Skip to footer Worldwide [change] Welcome, Account Log Out My Cisco Cisco.com Worldwide Home Products & Services (menu) Support (menu) How to Buy (menu) Training & Events (menu) Partners (menu) Employees (menu) Guest Search Support Technology Support LAN Emulation (LANE) LAN Emulation (LANE) is a technology that allows an ATM network to function as a LAN backbone.
The ATM network must provide multicast and broadcast support, address mapping (MAC-to-ATM), SVC management, and a usable packet format.
LANE also defines Ethernet and Token Ring ELANs.
Technology Information Technologies Digests (1) Technology Briefs (1) Design Design Guides (1) Design TechNotes (1) Configure Configuration Examples and TechNotes (9) Configuration Guides (3) Troubleshoot and Alerts Troubleshooting Guides (4) Troubleshooting TechNotes (10) Choose another technology Status Orderable How to Buy End-of-Sale Date None Announced End-of-Support Date None Announced Let Us Help Technical Support Op