In [1]:
# Load the lab_black IPython extension for this kernel session.
%load_ext lab_black

In [4]:
import sys

sys.path.insert(0, "../scripts/")

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from text import preprocess, split_to_sentences
from multiprocessing import Pool
import json

from sklearn.preprocessing import normalize
from collections import Counter
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from sentence_transformers import SentenceTransformer

# Register tqdm's pandas integration; later cells call `.progress_apply`.
tqdm.pandas()
# Base directory that the CSV / JSON / NPY paths in later cells are built from.
DATA_PATH = "../data"

In [5]:
# Download the WordNet corpus (the next cell uses WordNetLemmatizer).
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andre\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [13]:
# Quick check: lemmatize "gases" with the verb part-of-speech tag.
WordNetLemmatizer().lemmatize("gases", pos="v")

'gas'

In [3]:
# Sentence-embedding model; the first vectorize() definition calls `model.encode`.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [7]:
# Load the article table; later cells read its "text" and "title" columns.
articles = pd.read_csv(DATA_PATH + "/health_cng_v2.csv")

In [9]:
# Run the project's preprocess() over every article body.
# NOTE: this overwrites the "text" column in place, so re-running the cell
# preprocesses text that has already been preprocessed.
articles["text"] = articles["text"].progress_apply(preprocess)

  0%|          | 0/1249 [00:00<?, ?it/s]

In [10]:
# Write the preprocessed table to CSV without the index column.
articles.to_csv(DATA_PATH + "/health_cng_preprocessed_v2.csv", index=False)

In [4]:
def read_texts():
    """Return the values of the "text" column of the global `articles` table."""
    return articles["text"].values


def convert_to_sentences(texts):
    """Lazily yield, for each item of `texts`, the result of `sent_tokenize` on it."""
    yield from (sent_tokenize(document) for document in texts)


def vectorize(texts):
    """Encode `texts` with the global `model` and return the result as plain lists.

    The encoder is called with ``batch_size=16`` and its return value is
    converted with ``.tolist()``.
    """
    embeddings = model.encode(texts, batch_size=16)
    return embeddings.tolist()


def parse_text(text):
    """Split a text into sentences and embed the ones that pass the length filter.

    Every sentence from ``split_to_sentences`` is lower-cased and run through
    ``preprocess``. Only sentences with more than 4 and fewer than 100
    whitespace-separated tokens are kept. The kept sentences are embedded with
    a single batched ``vectorize`` call rather than one call per sentence, so
    the encoder's batching is actually used. This relies on ``vectorize``
    accepting a list of sentences and returning one vector per sentence, in
    order.

    Args:
        text: Text to split and embed.

    Returns:
        list of ``{"text": <cleaned sentence>, "vector": <embedding>}`` dicts
        in sentence order; an empty list when no sentence passes the filter.
    """
    cleaned = (preprocess(sent.lower()) for sent in split_to_sentences(text))
    # The token-count filter already rejects "" (zero tokens), so no separate
    # emptiness check is needed.
    kept = [sent for sent in cleaned if 4 < len(sent.split()) < 100]
    if not kept:
        # Nothing to embed; skip the model call entirely.
        return []

    vectors = vectorize(kept)
    return [{"text": sent, "vector": vector} for sent, vector in zip(kept, vectors)]

In [5]:
# Pull the article bodies out of `articles` for the sentence-level pass below.
texts = read_texts()

In [6]:
# Flatten the per-text sentence records from parse_text() into one list.
sentences = []
for text in tqdm(texts):
    sentences.extend(parse_text(text))

  0%|          | 0/1249 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
# Inspect one sentence record (a dict with "text" and "vector" keys).
sentences[10]

{'text': 'another type emissions hazardous human health soot particulate formation',
 'vector': [-0.0977109968662262,
  0.4155186116695404,
  0.42033642530441284,
  0.424037903547287,
  0.677020788192749,
  -0.002427336061373353,
  0.9192664623260498,
  0.5353001356124878,
  -0.07925142347812653,
  0.19932562112808228,
  -0.21950355172157288,
  -0.31974923610687256,
  -0.2526654005050659,
  -0.020827729254961014,
  0.16279976069927216,
  0.0676431730389595,
  -0.2834438681602478,
  -0.9480994343757629,
  0.10946166515350342,
  -0.169411763548851,
  0.147206649184227,
  0.2340112030506134,
  -0.19507628679275513,
  -0.07079533487558365,
  -0.06507627665996552,
  0.012404382228851318,
  -0.05919226258993149,
  0.326892226934433,
  0.3156225383281708,
  0.06353709101676941,
  0.24108751118183136,
  0.2441314160823822,
  -0.8218749761581421,
  0.5836692452430725,
  0.09377607703208923,
  -0.826277494430542,
  0.48843225836753845,
  0.2756808400154114,
  0.03934604674577713,
  -0.1824220865

In [4]:
# articles["original_text"] = articles["text"]
# articles["text"] = articles["text"].str.lower()

# articles.index = articles["pii"]
# articles.drop("pii", axis=1, inplace=True)
# articles.head()

Unnamed: 0_level_0,title,text,original_text
pii,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S0016236121003306,"Spark ignition engine performance, standard em...",air/fuel ratio after top dead center before to...,air/fuel ratio after top dead center before to...
S0016236120330714,Characterization of the emission of particles ...,"in recent years, legislation introduced partic...","In recent years, legislation introduced partic..."
S2468227619307732,Technical and economic evaluation of the use o...,“gasoline and diesel are complicated mixtures ...,“Gasoline and diesel are complicated mixtures ...
S2352484721000202,Experimental optimization of engine performanc...,all data required for reproducing this researc...,All data required for reproducing this researc...
S1110016815001623,Integrated modeling for optimized regional tra...,although electric vehicles and plug-in hybrid ...,Although electric vehicles and plug-in hybrid ...


In [5]:
# NOTE(review): same statement as the earlier preprocessing cell; running both
# applies preprocess() twice to the "text" column.
articles["text"] = articles["text"].progress_apply(preprocess)

  0%|          | 0/1249 [00:00<?, ?it/s]

In [6]:
# NOTE(review): same statement as the earlier save cell; it rewrites the same CSV file.
articles.to_csv(DATA_PATH + "/health_cng_preprocessed_v2.csv", index=False)

# VECTORIZING

In [7]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from sklearn.preprocessing import normalize
from collections import Counter

%%time
# Load the "en_core_web_lg" spaCy pipeline and append the "spacytextblob" component.
# parse() reads `._.assessments` and `._.polarity` from parsed documents —
# presumably those extensions come from this component (confirm).
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("spacytextblob")

In [18]:
# Parse two articles (index labels 0 and 1000) for a similarity spot check.
doc1 = nlp(articles["text"][0])
doc2 = nlp(articles["text"][1000])

In [19]:
# Similarity score between the two parsed documents.
doc1.similarity(doc2)

0.7522519162119523

In [9]:
def parse(article):
    """Run the global spaCy pipeline `nlp` on one article row and collect its features.

    Only the "text" and "title" fields of the row are read, so rows without
    "pii" or "original_text" are handled as well.

    Args:
        article: Mapping-like row (for example a DataFrame row) providing the
            "text" and "title" keys.

    Returns:
        dict with the keys ``text``, ``title``, ``vector`` (the document
        vector converted with ``.tolist()``), ``tokens`` (unique lemmas, in no
        particular order), ``details`` (one ``{"word", "polarity",
        "subjectivity"}`` dict per entry of ``parsed._.assessments``) and
        ``polarity``; or ``None`` when the pipeline raises on the text.
    """
    text = article["text"]
    title = article["title"]

    try:
        parsed = nlp(text)
    except Exception:
        # Best effort: skip texts the pipeline rejects. Catching Exception
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return None

    details = []
    for assessment in parsed._.assessments:
        # Each assessment starts with (words, polarity, subjectivity); any
        # further fields are ignored.
        words, word_polarity, word_subjectivity = assessment[:3]
        details.append(
            {
                "word": " ".join(words),
                "polarity": word_polarity,
                "subjectivity": word_subjectivity,
            }
        )

    return dict(
        text=text,
        title=title,
        vector=parsed.vector.tolist(),
        # A set removes duplicate lemmas; the resulting order is unspecified.
        tokens=list({token.lemma_ for token in parsed}),
        details=details,
        polarity=parsed._.polarity,
    )

In [10]:
# Apply parse() to every row (axis=1) and collect the results in a list;
# rows for which parse() returned None remain in the list as None.
descr = articles.progress_apply(parse, axis=1).values.tolist()

  0%|          | 0/1249 [00:00<?, ?it/s]

In [1]:
# Inspect the first parsed record; requires the cell that defines `descr` to have run in this kernel.
descr[0]

NameError: name 'descr' is not defined

In [47]:
# Serialize the parsed records in `descr` to a JSON file under DATA_PATH.
parsed_path = DATA_PATH + "/health_cng_parsed_v2.json"
with open(parsed_path, "w") as out_file:
    json.dump(descr, out_file)

In [24]:
# Collect vectorize() results from a pool of 4 worker processes;
# imap_unordered yields results as they complete, not in input order.
# NOTE(review): each task is a single (text, title) tuple, while the
# vectorize() definitions visible in this notebook take one text argument —
# confirm which vectorize() this cell was run against.
articles_arr = []

with Pool(4) as pool:
    for article in tqdm(pool.imap_unordered(vectorize, zip(articles['text'].values, articles['title'].values)), total=len(articles)):
        articles_arr.append(article)

  0%|          | 0/1249 [00:00<?, ?it/s]

ValueError: [E088] Text of length 2092316 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [49]:
# Drop the None entries, then pull the `.vector` and `.title` attributes of
# the remaining entries into an array and a list respectively.
articles_arr = [entry for entry in articles_arr if entry is not None]
vectors = np.array([entry.vector for entry in articles_arr])
titles = [entry.title for entry in articles_arr]

In [58]:
# Wrap the vectors in a Dataset with dimensions ("sample", "features") and a "title" coordinate.
# NOTE(review): `xr` is not imported in any visible cell — presumably
# `import xarray as xr`; confirm and add it to the import cell.
ds = xr.Dataset({"vector": (("sample", "features"), vectors)},
          coords={'title': titles})

In [73]:
# Display the collected records as a table for inspection.
pd.DataFrame(articles_arr)

Unnamed: 0,text,vector,tokens,assessments,polarity,title
0,gasoline diesel complicated mixtures hydrocarb...,"[-0.047150087, 0.18867119, 0.07359996, -0.0674...","[gasoline, diesel, complicate, mixture, hydroc...","[([complicated], -0.5, 1.0, None), ([typically...",0.048875,Technical and economic evaluation of the use o...
1,recent years legislation introduced particle n...,"[-0.074192345, 0.21192631, -0.074391104, -0.01...","[recent, year, legislation, introduce, particl...","[([recent], 0.0, 0.25, None), ([light], 0.4, 0...",0.037692,Characterization of the emission of particles ...
2,data required reproducing research contained m...,"[-0.13159458, 0.27833435, -0.009150035, 0.0049...","[datum, require, reproduce, research, contain,...","[([past], -0.25, 0.25, None), ([available], 0....",0.089568,Experimental optimization of engine performanc...
3,polycyclic aromatic hydrocarbons pahs persiste...,"[-0.117442355, 0.17586903, -0.012104892, -0.00...","[polycyclic, aromatic, hydrocarbon, pahs, pers...","[([serious], -0.3333333333333333, 0.6666666666...",0.067681,Impact of CNG on emissions of PAHs and PCDDs/F...
4,background previously performed multistep geno...,"[-0.103354685, 0.06259379, -0.08855352, 0.0264...","[background, previously, perform, multistep, g...","[([previously], -0.16666666666666666, 0.166666...",0.098611,1879P: Potential role of RICTOR copy number ga...
...,...,...,...,...,...,...
1219,central serous chorioretinopathy cscr csc post...,"[-0.068067685, 0.08795447, -0.10777809, 0.0157...","[central, serous, chorioretinopathy, cscr, csc...","[([central], 0.0, 0.25, None), ([limited], -0....",0.074753,Central serous chorioretinopathy: Recent findi...
1220,recent years biomaterials gained increasing in...,"[-0.1293862, 0.15736759, -0.12811007, 0.015293...","[recent, year, biomaterial, gain, increase, in...","[([recent], 0.0, 0.25, None), ([various], 0.0,...",0.113958,Sol–gel based materials for biomedical applica...
1221,robert bourge facc annetine gelijns phd bartle...,"[-0.08598141, 0.16487788, -0.05573877, 0.01285...","[robert, bourge, facc, annetine, gelijns, phd,...","[([rose], 0.6, 0.95, None), ([young], 0.1, 0.4...",0.066831,Mechanical cardiac support 2000: current appli...
1222,understanding transport processes axisymmetric...,"[-0.013112347, 0.11722535, -0.13934952, 0.0116...","[understand, transport, process, axisymmetric,...","[([challenging], 0.5, 1.0, None), ([special], ...",0.084459,Anomalous transport


In [61]:
# Print only the first item produced by iterating over `ds`, then stop.
for i in ds:
    print(i)
    break

Vector


In [25]:
# Scratch example: a Dataset of random 4x5 data with "x", "y" and "z" coordinates.
# NOTE(review): this rebinds `ds`, replacing the Dataset built from the article
# vectors, and `xr` is not imported in any visible cell.
ds = xr.Dataset(
    {"foo": (("x", "y"), np.random.rand(4, 5))},
    coords={
        "x": [10, 20, 30, 40],
        "y": pd.date_range("2000-01-01", periods=5),
        "z": ("x", list("abcd")),
    },
)

KeyboardInterrupt: 

In [None]:
# Vectorize each article in this process and keep its title next to the result.
for text, title in tqdm(
    zip(articles["text"].values, articles["title"].values),
    total=articles.shape[0],
):
    # list.append accepts exactly one argument, so the (vector, title) pair is
    # stored as a single tuple instead of being passed as two arguments.
    articles_arr.append((vectorize(text), title))

  0%|          | 0/1249 [00:00<?, ?it/s]

In [52]:
# Look at the row at position 1242 (the value the argmax cell reports as the longest text).
articles.iloc[1242]

pii                                      S1556086415336261
title                                    INVITED ABSTRACTS
text     PRESIDENTIAL SYMPOSIUM INCLUDING TOP RATED ABS...
Name: 1242, dtype: object

In [51]:
# Position of the article whose text has the most characters.
np.argmax([len(t) for t in articles.text])

1242

In [19]:
def vectorize(text):
    """Return the `.vector` of `text` as processed by the global `nlp` pipeline.

    The pipeline is called with the 'parser' and 'ner' components disabled.
    """
    doc = nlp(text, disable=['parser', 'ner'])
    return doc.vector

In [20]:
# Compute one vector per article text with vectorize(), showing a progress bar.
vectors = articles['text'].progress_apply(vectorize)

  0%|          | 0/1249 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Stack the per-article vectors into one array and pass it through sklearn's normalize().
# `vectors` is rebound from the pandas object to the resulting array.
vectors = normalize(np.stack(vectors.values))

In [None]:
# Persist `vectors` with np.save to a file under DATA_PATH.
vectors_path = DATA_PATH + '/vectors.npy'
with open(vectors_path, 'wb') as out_file:
    np.save(out_file, vectors)