In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
from pprint import pprint
from concurrent.futures import ProcessPoolExecutor
from itertools import chain
import nltk
import re
import os
from pathlib import Path


## 1. Set up file path and Read Files

In [3]:
GLOBAL_PATH="../training"

In [4]:
metadata_path = '{GLOBAL_PATH}/metadata.csv'.format(GLOBAL_PATH=GLOBAL_PATH)

meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})

In [4]:
meta_df.head(3)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...
2,wzj2glte,00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,PMC125543,11447125,no-cc,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...


## 2. Read BioXiv paper for testing training generator functions 

In [72]:
biorxiv = '/biorxiv_medrxiv'
bioxiv_json = glob.glob(f'{GLOBAL_PATH+biorxiv}/**/*.json', recursive=True)

In [26]:
bioxiv_json[0]

'../data/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/f6b29be971089bfe0916c64ab9fbddcec38a7436.json'

In [29]:
print("There are: ",len(bioxiv_json)," pdf Json in BioXiv")
print("===================================================")
all_files = []
for filename in bioxiv_json:
    #filename = biorxiv_dir + filename
    file = json.load(open(filename, 'rb'))
    all_files.append(file)

file = all_files[0]
print("Dictionary keys:", file.keys())

There are:  2278  pdf Json in BioXiv
Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [35]:
pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'Aims: To determine analytical capabilities of a commonly used '
          'faecal immunochemical test (FIT) to detect haemoglobin (Hb) in the '
          'context of NICE guidance DG30, and the likely use of FIT to '
          'reprioritise patients delayed by the COVID-19 pandemic.'},
 {'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'Methods: Data obtained from independent verification studies and '
          'clinical testing of the HM-JACKarc FIT method in routine primary '
          'care practice were analysed to derive analytical performance '
          'characteristics.'},
 {'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'Results: Detection capabilities for the FIT method were 0.5 µg/g '
          '(limit of blank), 1.1 (limit of detection) and 15.0 µg/g (limit of '
          'quantification). 31 of 33 (94%) non-homogenised specimens analysed '
          '

In [34]:
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [{...}, {...}, {...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'Colorectal cancer is globally the third most incident malignancy '
          '(1) . It is surgically treatable with improved long-term outcomes '
          'if diagnosis is at an early stage (2) . Most developed countries, '
          'including the UK, operate colorectal screening programmes using '
          'faecal occult blood testing. Screen detected cancers benefit from '
          'early diagnosis and treatment, with associated improved survival '
          '(2) . Faecal immunochemical tests (FIT) have largely replaced the '
          'traditional guaiac based faecal occult blood tests due to the '
          'increased specificity of FIT.'},
 {'cite_spans': [{...}, {...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'To complement the UK Bowel Cancer Screening Programme (UKBCSP), the '
          '2017 DG30 NICE guidance (3) recommended the use of FIT

In [40]:
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {di['section']: "" for di in file['body_text']}
for section, text in texts:
    texts_di[section] += text

pprint(list(texts_di.keys()))

['Introduction',
 'Setting and analytical method',
 'Method detection capability estimates and immunoassay reproducibility',
 'Effects of sample homogenisation',
 'Stability of specimens within the collection device',
 'Within patient serial sampling',
 'Statistical analysis',
 'Detection capabilities and immunoassay reproducibility',
 'Discussion',
 'Implications for research and practice.',
 'Concluding remarks.']


In [43]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])

Introduction

Colorectal cancer is globally the third most incident malignancy (1) . It is surgically treatable with improved long-term outcomes if diagnosis is at an early stage (2) . Most developed countries, including the UK, operate colorectal screening programmes using faecal occult blood testing. Screen detected cancers benefit from early diagnosis and treatment, with associated improved survival (2) . Faecal immunochemical tests (FIT) have largely replaced the traditional guaiac based faecal occult blood tests due to the increased specificity of FIT.To complement the UK Bowel Cancer Screening Programme (UKBCSP), the 2017 DG30 NICE guidance (3) recommended the use of FIT for faecal haemoglobin (Hb) detection in patients presenting to primary care with low risk abdominal symptoms. The adoption of FIT in primary care has been slow, with notable variation in uptake and implementation across the UK (4). The Oxford University Hospitals Trust (OUH) adopted FIT prior to the DG30 guidanc

## 3. Flatten the Json file to create the training set 

In [71]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

#first_row = FileReader(bioxiv_json[0])
#print(first_row)

In [49]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'title': []}
for idx, entry in enumerate(bioxiv_json):
    if idx % (len(bioxiv_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(bioxiv_json)}')
    
    try:
        content = FileReader(entry)
    except Exception as e:
        continue  # invalid paper format, skip
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    
    # add the title information, add breaks when needed
    dict_['title'].append(meta_data['title'])

    
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'title'])
df_covid.head()

Processing index: 0 of 2278
Processing index: 227 of 2278
Processing index: 454 of 2278
Processing index: 681 of 2278
Processing index: 908 of 2278
Processing index: 1135 of 2278
Processing index: 1362 of 2278
Processing index: 1589 of 2278
Processing index: 1816 of 2278
Processing index: 2043 of 2278
Processing index: 2270 of 2278


Unnamed: 0,paper_id,abstract,body_text,title
0,f6b29be971089bfe0916c64ab9fbddcec38a7436,Aims: To determine analytical capabilities of ...,Colorectal cancer is globally the third most i...,37232 Faecal immunochemical testing (FIT): ...
1,005d189d5bd7ac01aee65e934fd3d5186a3f7b27,The rapid outbreak of the new Coronavirus pand...,The outbreak of infectious diseases has always...,36952 Relationship between Average Daily Te...
2,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,New anti-AIDS treatments must be continually d...,"In the absence of a curative treatment, the hi...",37478 Multimerization of HIV-1 integrase hi...
3,607e0074d8ff40c272b958c2fe48793fedfc785e,,"the author/funder, who has granted medRxiv a l...",36290 Virus shedding patterns in nasopharyn...
4,72e25b728c6c62fb3a4e2c59c8ee48de4b5ee452,"Recently classified as a pandemic by WHO, nove...",The pandemic of COVID-19 is taking a troll in ...,37135 Phylogenetic Analysis of the Novel Co...


In [60]:
len(df_covid["body_text"])

2278

## 4. Cleaning the Text 

In [55]:
import nltk
from nltk.tokenize import sent_tokenize
import sentencepiece as spm

In [52]:
regex_tokenizer = nltk.RegexpTokenizer("\w+")

In [63]:
def normalize_text(text):
    # lowercase text
    text = str(text).lower()
    # remove non-UTF
    text = text.encode("utf-8", "ignore").decode()
    # remove punktuation symbols
    text = " ".join(regex_tokenizer.tokenize(text))
    return text

def sentencise_text(text):
    text = sent_tokenize(text)
    return text 

def count_lines(filename):
    count = 0
    with open(filename) as fi:
        for line in fi:
            count += 1
    return count

In [1]:
#normalize_text(df_covid["body_text"][0])

In [61]:
PRC_DATA_FPATH = "../data/pretrain/proc_dataset.txt" #@param {type: "string"}
with open(PRC_DATA_FPATH, "w",encoding="utf-8") as fo:
    for l in df_covid["body_text"]:
        fo.write(normalize_text(l)+"\n")

### saving sentence per line 

In [90]:
PRC_DATA_FPATH = "../data/pretrain/proc_sentence_dataset.txt" #@param {type: "string"}
sentencised =[]

with open(PRC_DATA_FPATH, "w",encoding="utf-8") as fo:
    for l in df_covid["body_text"][:10]:
        sentencised.append(sentencise_text(normalize_text(I))

        fo.write(sentencise_text(normalize_text(l))+"\n")
        fo.writelines("%s\n" % place for place in places_list)


SyntaxError: invalid syntax (<ipython-input-90-e4c15c597b3b>, line 8)

## 4. Run all dataset 

In [5]:
load_json = glob.glob('{GLOBAL_PATH}/**/*.json'.format(GLOBAL_PATH=GLOBAL_PATH), recursive=True)

In [6]:
len(load_json)

68204

In [7]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'


In [9]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'title': []}
for idx, entry in enumerate(load_json):
    if idx % (len(load_json) // 10) == 0:
        print('Processing index: {idx} of '.format(idx=idx),{len(load_json)})
    
    try:
        content = FileReader(entry)
    except Exception as e:
        continue  # invalid paper format, skip
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    
    # add the title information, add breaks when needed
    dict_['title'].append(meta_data['title'])

    
    
df_covid_all = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'title'])
df_covid_all.head()

Processing index: 0 of  {68204}
Processing index: 6820 of  {68204}
Processing index: 13640 of  {68204}
Processing index: 20460 of  {68204}
Processing index: 27280 of  {68204}
Processing index: 34100 of  {68204}
Processing index: 40920 of  {68204}
Processing index: 47740 of  {68204}
Processing index: 54560 of  {68204}
Processing index: 61380 of  {68204}
Processing index: 68200 of  {68204}


Unnamed: 0,paper_id,abstract,body_text,title
0,2cd9b37d110968368ce6f837e002788ac6158af8,Background: Tuberculosis is a leading cause of...,Tuberculosis (TB) is one of the top ten causes...,12096 Under-reporting of TB cases and assoc...
1,298d325e27d8cafcd150d56a199a5ce099ee2b07,The emergence of severe acute respiratory synd...,The Chinese National Influenza Center of the C...,24962 A ten-year China-US laboratory collab...
2,6e2b81ea145f20bad9129013f611181c0fe4ad8a,Newcastle disease virus (NDV) infection causes...,Newcastle disease (ND) is a highly contagious ...,16626 Newcastle disease virus RNA-induced I...
3,aefd921eef67855fd84f460502e9e7277aeb92a1,Globalization has been accompanied by the rapi...,Working conditions for health workers are unde...,2071 Collaboration between infection contro...
4,7b0510f8258fc50bcff6f0e1a66c54c04093f9d1,,"With a 2000-year medicinal history, Radix Bupl...",9870 A systematic review of the active saik...


In [10]:
df_covid_all.shape

(41117, 4)

### 4.1 Remove duplicates and NaN

In [11]:
#df_covid_all.drop_duplicates(['title'], inplace=True)
#df_covid_all.dropna(subset=['body_text'], inplace=True)
df_covid_new = df_covid_all[~df_covid_all['title'].apply(tuple).duplicated()]


In [12]:
df_covid_new.shape

(40694, 4)

### 4.2 Apply preprocessig 

In [13]:
def preprocessing(text):
    # remove single characters repeated at least 3 times for spacing error (e.g. s u m m a r y)
    text = re.sub(r'(\w\s){3,}', ' ', text)
    # replace tags (e.g. [NUM1]) with whitespace
    text = re.sub(r'\[[\d\,\s]+\]\s', ' ',text)
    # correctly spacing the tokens
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\.{2,}', '.', text)
    # return lowercase text
    return text.lower()

In [14]:
df_covid_new['body_text'] = df_covid_new['body_text'].apply(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### 4.3 Search for covid related paper only (optional)

In [15]:
covid_terms =['covid', 'coronavirus disease 19', 'sars cov 2', '2019 ncov', '2019ncov', '2019 n cov', '2019n cov',
              'ncov 2019', 'n cov 2019', 'coronavirus 2019', 'wuhan pneumonia', 'wuhan virus', 'wuhan coronavirus',
              'coronavirus 2', 'covid-19', 'SARS-CoV-2', '2019-nCov']
covid_terms = [preprocessing(elem) for elem in covid_terms]
covid_terms = re.compile('|'.join(covid_terms))


In [16]:
covid_terms

re.compile(r'covid|coronavirus disease 19|sars cov 2|2019 ncov|2019ncov|2019 n cov|2019n cov|ncov 2019|n cov 2019|coronavirus 2019|wuhan pneumonia|wuhan virus|wuhan coronavirus|coronavirus 2|covid-19|sars-cov-2|2019-ncov',
re.UNICODE)

In [17]:
def checkCovid(text, covid_terms):
    return bool(covid_terms.search(text))

df_covid_new['is_covid'] = df_covid_new['body_text'].apply(checkCovid, covid_terms=covid_terms)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [18]:
df_covid_only = df_covid_new[df_covid_new['is_covid']==True]
len(df_covid_only)

5384

In [19]:
df_covid_only.shape

(5384, 5)

## 5. Saving Training set 

### 5.1 use nltk to sentencize the corpus 

In [20]:
nltk.download('punkt')

extra_abbreviations = ['ps',  'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj',  'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]

position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]

quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.

actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]

place_abbrev = ['Śl', 'płd', 'geogr']

lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']

military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']

extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev

sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)

sent_tokenize = sentence_tokenizer.tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /home/xcs224u_student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
print(df_covid_only['body_text'][:1])

4    with a 2000-year medicinal history, radix bupl...
Name: body_text, dtype: object


In [26]:
def flatten(iterable):
    return chain.from_iterable(iterable)

def process_line(line):
    try:
        #doc = json.loads(line)
        txt = re.sub("\s+", " ", line)
        sentences = [s for s in sent_tokenize(txt)]
        windowed_sentences = []
        for snt in range(len(sentences)):
            windowed_sentences.append(" ".join(sentences[snt: snt + 4]))
        return windowed_sentences
    except:
        # print(f"Could not parse line \n{line}\n")
        return []

"""
def process_file(file_path):
    print(f"Processing {file_path}")
    lines = Path(file_path).read_text("utf-8").split("\n")
    with ProcessPoolExecutor(10) as pool:
        return list(flatten(pool.map(process_line, lines)))

"""


'\ndef process_file(file_path):\n    print(f"Processing {file_path}")\n    lines = Path(file_path).read_text("utf-8").split("\n")\n    with ProcessPoolExecutor(10) as pool:\n        return list(flatten(pool.map(process_line, lines)))\n\n'

In [None]:
PRC_DATA_FPATH = "pretrain/allpdf.train.sliding4-v1.txt" #@param {type: "string"}
with open(PRC_DATA_FPATH, "w",encoding="utf-8") as fo:
    _ = list(flatten(df_covid_new["body_text"].apply(process_line)))
    for line in _:
        fo.write("\n"+line)

In [27]:
temp = []
temp = df_covid_only["body_text"][:2].apply(process_line)

In [2]:
"""
for line in list(flatten(temp)):
    print(line)
    print("+==================")
"""




In [70]:
#use when processing txt documents 
"""
buffer, BUFFER_SIZE = [], 100000
with open("biorxiv_medrxiv.train.sliding4-v2.txt", "wt") as file:
    for sentence in enumerate(flatten(process_file(f) for f in files)):
        if len(buffer) >= BUFFER_SIZE:
            file.write("\n".join(buffer))
            buffer.clear()
            print(i, end="\r")
        buffer.append(sentence)
    if len(buffer) > 0:
        file.write("\n".join(buffer))
        buffer.clear()
"""


'\nbuffer, BUFFER_SIZE = [], 100000\nwith open("biorxiv_medrxiv.train.sliding4-v2.txt", "wt") as file:\n    for sentence in enumerate(flatten(process_file(f) for f in files)):\n        if len(buffer) >= BUFFER_SIZE:\n            file.write("\n".join(buffer))\n            buffer.clear()\n            print(i, end="\r")\n        buffer.append(sentence)\n    if len(buffer) > 0:\n        file.write("\n".join(buffer))\n        buffer.clear()\n'

In [112]:
%ls -la pretrain

total 722480
drwxrwxr-x 3 xcs224u_student xcs224u_student      4096 May  1 07:55 [0m[01;34m.[0m/
drwxrwxr-x 4 xcs224u_student xcs224u_student      4096 May  1 07:56 [01;34m..[0m/
-rw-rw-r-- 1 xcs224u_student xcs224u_student       378 Apr 29 08:43 config.json
-rw-rw-r-- 1 xcs224u_student xcs224u_student 696485376 May  1 07:55 covidonly.train.sliding4-v1.txt
-rw-rw-r-- 1 xcs224u_student xcs224u_student  43303482 Apr 29 08:43 proc_dataset.txt
-rw-rw-r-- 1 xcs224u_student xcs224u_student        16 Apr 29 08:43 tokenizer_config.json
drwxrwxr-x 2 xcs224u_student xcs224u_student      4096 Apr 29 08:43 [01;34mtokenziner[0m/


In [33]:
file= 'pretrain/covidonly.train.sliding4-v1.txt'

In [34]:
f = open(file, "r")
print(f.read(100))


with a 2000-year medicinal history, radix bupleuri (chai hu in chinese) is believed to be one of th


In [5]:
print(lines[100:])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

