## NLP Processing
#### O seguinte arquivo usará NLP (Processamento de Linguagem Natural) por meio da biblioteca 'spaCy' para entender melhor os padrões dos abstracts dos artigos

In [2]:
# Importação de Bibliotecas 
import pandas as pd 
import spacy
from spacy import displacy
from nltk.tokenize import TreebankWordTokenizer

In [3]:
# Constants and shared objects used by the cells below
palavras_chave = [
    "alcohol",
    "cancer",
    "consumption",
    "ethanol",
]  # keywords searched for in each abstract
nlp = spacy.load('en_core_web_sm')  # spaCy pipeline applied to the abstracts
# NOTE(review): `t` is not referenced by any cell visible in this notebook
# section (comprehensions below reuse the name `t` only as a local loop variable).
t = TreebankWordTokenizer()

#### Importar os artigos

In [5]:
# Load the article table from the Excel workbook into a DataFrame
df = pd.read_excel("df_completo.xlsx")

### Filtro

In [6]:
# Work on the first 1000 rows only
artigos = df.head(1000)
# Keep the abstracts that are present, dropping missing values
abstracts = artigos["Abstract"].dropna()
# Row labels of the retained abstracts
index = abstracts.index

In [7]:
# Collect the row labels of the abstracts that contain every keyword in
# `palavras_chave` (case-insensitive, whole-token match).
list_found_all = []

for num, texto in abstracts.items():
    # Only the tokens are needed here, so build the Doc with the tokenizer
    # alone instead of running the whole pipeline on every abstract.
    # Lowercased tokens go into a set for constant-time membership checks.
    lowered_tokens = {token.lower_ for token in nlp.make_doc(texto)}

    # An abstract containing all keywords is a high-interest article
    if all(word in lowered_tokens for word in palavras_chave):
        list_found_all.append(num)

# Print the articles that contain all keywords
print(f"Articles that contain all keywords (index): {list_found_all}")



Articles that contain all keywords (index): [433, 468, 489, 507, 529, 541, 558, 580, 583, 628, 717, 770, 782, 800, 806, 807, 825, 849, 882, 946, 959, 961, 967, 971, 987]


### Tokenization
#### Teste com um abstract

In [6]:
# Take the abstract stored at row label 1 as the sample text for the
# demonstrations below. A single .loc lookup replaces the chained
# df[column][row] indexing.
s = df.loc[1, 'Abstract']
print(s)

Plasma carotenoid responses were determined in 36 healthy men and women before and after being fed controlled diets with a moderate amount of fat (26% of total energy) and a high carotenoid content (approximate to 16 mg/d) for two 15-d periods. in addition, broccoli (205 g/d) was provided either during the first or the second 15-d residency period in a crossover design. Plasma was digested with lipase and cholesterol esterase, and carotenoids were extracted and measured by using HPLC. Three oxygenated carotenoids (lutein, zeaxanthin, and cryptoxanthin), three hydrocarbon carbon carotenoids (alpha-carotene, all-trans-beta-carotene, and 13-cis-beta-carotene), and four geometric isomers of lycopene (15-cis-, 13-cis-, 9-cis-, and all-trans-lycopene) were separated by using a C30 carotenoid column. A small unidentified peak coeluted with standard 9-cis-beta-carotene and was identified as zeta-carotene (lambda(max) = 400 nm). The concentrations of plasma lutein, cryptoxanthin, alpha-carotene

In [7]:
# Run the spaCy pipeline on the sample abstract; `doc` is reused by later cells
doc = nlp(s)
# Show the sentences spaCy segmented the abstract into
print(list(doc.sents))

[Plasma carotenoid responses were determined in 36 healthy men and women before and after being fed controlled diets with a moderate amount of fat (26% of total energy) and a high carotenoid content (approximate to 16 mg/d) for two 15-d periods., in addition, broccoli (205 g/d) was provided either during the first or the second 15-d residency period in a crossover design., Plasma was digested with lipase and cholesterol esterase, and carotenoids were extracted and measured by using HPLC., Three oxygenated carotenoids (lutein, zeaxanthin, and cryptoxanthin), three hydrocarbon carbon carotenoids (alpha-carotene, all-trans-beta-carotene, and 13-cis-beta-carotene), and four geometric isomers of lycopene (15-cis-, 13-cis-, 9-cis-, and all-trans-lycopene) were separated by using a C30 carotenoid column., A small unidentified peak coeluted with standard 9-cis-beta-carotene and was identified as zeta-carotene (lambda(max) = 400 nm)., The concentrations of plasma lutein, cryptoxanthin, alpha-ca

### Pré-Processamento

#### Case-folding

In [8]:
# Lowercase every token except those flagged as the start of a sentence.
# NOTE(review): sentence-initial entries stay Token objects while the others
# become str, so the printed list mixes both types.
print([tok if tok.is_sent_start else tok.lower_ for tok in doc])

[Plasma, 'carotenoid', 'responses', 'were', 'determined', 'in', '36', 'healthy', 'men', 'and', 'women', 'before', 'and', 'after', 'being', 'fed', 'controlled', 'diets', 'with', 'a', 'moderate', 'amount', 'of', 'fat', '(', '26', '%', 'of', 'total', 'energy', ')', 'and', 'a', 'high', 'carotenoid', 'content', '(', 'approximate', 'to', '16', 'mg', '/', 'd', ')', 'for', 'two', '15', '-', 'd', 'periods', '.', in, 'addition', ',', 'broccoli', '(', '205', 'g', '/', 'd', ')', 'was', 'provided', 'either', 'during', 'the', 'first', 'or', 'the', 'second', '15', '-', 'd', 'residency', 'period', 'in', 'a', 'crossover', 'design', '.', Plasma, 'was', 'digested', 'with', 'lipase', 'and', 'cholesterol', 'esterase', ',', 'and', 'carotenoids', 'were', 'extracted', 'and', 'measured', 'by', 'using', 'hplc', '.', Three, 'oxygenated', 'carotenoids', '(', 'lutein', ',', 'zeaxanthin', ',', 'and', 'cryptoxanthin', ')', ',', 'three', 'hydrocarbon', 'carbon', 'carotenoids', '(', 'alpha', '-', 'carotene', ',', 'all

#### Stop Word Removal

##### Utilizando o repertório de Stop Words do Spacy:

In [9]:
# Keep only the tokens that spaCy does not flag as stop words
print([token for token in doc if not token.is_stop])

[Plasma, carotenoid, responses, determined, 36, healthy, men, women, fed, controlled, diets, moderate, fat, (, 26, %, total, energy, ), high, carotenoid, content, (, approximate, 16, mg, /, d, ), 15, -, d, periods, ., addition, ,, broccoli, (, 205, g, /, d, ), provided, second, 15, -, d, residency, period, crossover, design, ., Plasma, digested, lipase, cholesterol, esterase, ,, carotenoids, extracted, measured, HPLC, ., oxygenated, carotenoids, (, lutein, ,, zeaxanthin, ,, cryptoxanthin, ), ,, hydrocarbon, carbon, carotenoids, (, alpha, -, carotene, ,, -, trans, -, beta, -, carotene, ,, 13, -, cis, -, beta, -, carotene, ), ,, geometric, isomers, lycopene, (, 15, -, cis-, ,, 13, -, cis-, ,, 9, -, cis-, ,, -, trans, -, lycopene, ), separated, C30, carotenoid, column, ., small, unidentified, peak, coeluted, standard, 9, -, cis, -, beta, -, carotene, identified, zeta, -, carotene, (, lambda(max, ), =, 400, nm, ), ., concentrations, plasma, lutein, ,, cryptoxanthin, ,, alpha, -, carotene, 

#### Lemmatization

##### Aplicando _Lemmatization_ no abstract:

In [10]:
# Pair each token's surface text with its lemma
[(token.text, token.lemma_) for token in doc]

[('Plasma', 'plasma'),
 ('carotenoid', 'carotenoid'),
 ('responses', 'response'),
 ('were', 'be'),
 ('determined', 'determine'),
 ('in', 'in'),
 ('36', '36'),
 ('healthy', 'healthy'),
 ('men', 'man'),
 ('and', 'and'),
 ('women', 'woman'),
 ('before', 'before'),
 ('and', 'and'),
 ('after', 'after'),
 ('being', 'be'),
 ('fed', 'feed'),
 ('controlled', 'control'),
 ('diets', 'diet'),
 ('with', 'with'),
 ('a', 'a'),
 ('moderate', 'moderate'),
 ('amount', 'amount'),
 ('of', 'of'),
 ('fat', 'fat'),
 ('(', '('),
 ('26', '26'),
 ('%', '%'),
 ('of', 'of'),
 ('total', 'total'),
 ('energy', 'energy'),
 (')', ')'),
 ('and', 'and'),
 ('a', 'a'),
 ('high', 'high'),
 ('carotenoid', 'carotenoid'),
 ('content', 'content'),
 ('(', '('),
 ('approximate', 'approximate'),
 ('to', 'to'),
 ('16', '16'),
 ('mg', 'mg'),
 ('/', '/'),
 ('d', 'd'),
 (')', ')'),
 ('for', 'for'),
 ('two', 'two'),
 ('15', '15'),
 ('-', '-'),
 ('d', 'd'),
 ('periods', 'period'),
 ('.', '.'),
 ('in', 'in'),
 ('addition', 'addition'),


### Part-of-Speech Tagging, Named Entity Recognition e Parsing

#### Part-of-Speech Tagging (POS)

##### Aplicando POS com "tag" por ser melhor explicado

In [11]:
# Pair each token's surface text with its `tag_` part-of-speech label
[(token.text, token.tag_) for token in doc]

[('Plasma', 'NN'),
 ('carotenoid', 'NN'),
 ('responses', 'NNS'),
 ('were', 'VBD'),
 ('determined', 'VBN'),
 ('in', 'IN'),
 ('36', 'CD'),
 ('healthy', 'JJ'),
 ('men', 'NNS'),
 ('and', 'CC'),
 ('women', 'NNS'),
 ('before', 'RB'),
 ('and', 'CC'),
 ('after', 'IN'),
 ('being', 'VBG'),
 ('fed', 'VBN'),
 ('controlled', 'VBN'),
 ('diets', 'NNS'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('moderate', 'JJ'),
 ('amount', 'NN'),
 ('of', 'IN'),
 ('fat', 'NN'),
 ('(', '-LRB-'),
 ('26', 'CD'),
 ('%', 'NN'),
 ('of', 'IN'),
 ('total', 'JJ'),
 ('energy', 'NN'),
 (')', '-RRB-'),
 ('and', 'CC'),
 ('a', 'DT'),
 ('high', 'JJ'),
 ('carotenoid', 'NN'),
 ('content', 'NN'),
 ('(', '-LRB-'),
 ('approximate', 'JJ'),
 ('to', 'IN'),
 ('16', 'CD'),
 ('mg', 'NNP'),
 ('/', 'SYM'),
 ('d', 'CD'),
 (')', '-RRB-'),
 ('for', 'IN'),
 ('two', 'CD'),
 ('15', 'CD'),
 ('-', 'HYPH'),
 ('d', 'NN'),
 ('periods', 'NNS'),
 ('.', '.'),
 ('in', 'IN'),
 ('addition', 'NN'),
 (',', ','),
 ('broccoli', 'NNS'),
 ('(', '-LRB-'),
 ('205', 'CD'),
 ('g

In [12]:
# Look up spaCy's description of the 'NNS' tag seen in the output above
spacy.explain('NNS')

'noun, plural'

#### Named Entity Recognition (NER)

##### Aplicando NER com restrição de entidade:

In [13]:
# List (text, entity type) for tokens that belong to an entity;
# a token with no entity has ent_type == 0 and is skipped
print([(tok.text, tok.ent_type_) for tok in doc if tok.ent_type])

[('Plasma', 'PERSON'), ('carotenoid', 'PERSON'), ('36', 'CARDINAL'), ('fed', 'ORG'), ('26', 'PERCENT'), ('%', 'PERCENT'), ('16', 'QUANTITY'), ('mg', 'QUANTITY'), ('/', 'QUANTITY'), ('two', 'CARDINAL'), ('15', 'CARDINAL'), ('205', 'CARDINAL'), ('first', 'ORDINAL'), ('second', 'ORDINAL'), ('15', 'CARDINAL'), ('Plasma', 'PERSON'), ('HPLC', 'ORG'), ('Three', 'CARDINAL'), ('zeaxanthin', 'PERSON'), ('cryptoxanthin', 'PERSON'), ('three', 'CARDINAL'), ('13', 'QUANTITY'), ('-', 'QUANTITY'), ('cis', 'QUANTITY'), ('four', 'CARDINAL'), ('15', 'CARDINAL'), ('13', 'CARDINAL'), ('9', 'CARDINAL'), ('C30', 'PRODUCT'), ('9', 'QUANTITY'), ('-', 'QUANTITY'), ('cis', 'QUANTITY'), ('lambda(max', 'PERSON'), ('400', 'CARDINAL'), ('plasma', 'PERSON'), ('lutein', 'PERSON'), ('cryptoxanthin', 'PERSON'), ('13', 'QUANTITY'), ('-', 'QUANTITY'), ('cis', 'QUANTITY'), ('days', 'DATE'), ('6', 'DATE'), ('-', 'DATE'), ('16', 'DATE'), ('5', 'DATE'), ('d', 'DATE'), ('bas', 'PERSON')]


In [14]:
# Look up spaCy's description of an entity label
spacy.explain('PRODUCT') # investigate each entity type

'Objects, vehicles, foods, etc. (not services)'

##### Para melhor visualização utilizando NER:

In [15]:
# List each entity span in `doc.ents` with its label
print([(span.text, span.label_) for span in doc.ents])

[('Plasma carotenoid', 'PERSON'), ('36', 'CARDINAL'), ('fed', 'ORG'), ('26%', 'PERCENT'), ('16 mg/', 'QUANTITY'), ('two', 'CARDINAL'), ('15', 'CARDINAL'), ('205', 'CARDINAL'), ('first', 'ORDINAL'), ('second', 'ORDINAL'), ('15', 'CARDINAL'), ('Plasma', 'PERSON'), ('HPLC', 'ORG'), ('Three', 'CARDINAL'), ('zeaxanthin', 'PERSON'), ('cryptoxanthin', 'PERSON'), ('three', 'CARDINAL'), ('13-cis', 'QUANTITY'), ('four', 'CARDINAL'), ('15', 'CARDINAL'), ('13', 'CARDINAL'), ('9', 'CARDINAL'), ('C30', 'PRODUCT'), ('9-cis', 'QUANTITY'), ('lambda(max', 'PERSON'), ('400', 'CARDINAL'), ('plasma lutein', 'PERSON'), ('cryptoxanthin', 'PERSON'), ('13-cis', 'QUANTITY'), ('days 6-16', 'DATE'), ('5 d', 'DATE'), ('bas', 'PERSON')]


In [16]:
# Same entity listing, extended with each span's start/end character offsets
# so its position can be inspected
print([
    (span.text, span.label_, span.start_char, span.end_char)
    for span in doc.ents
])

[('Plasma carotenoid', 'PERSON', 0, 17), ('36', 'CARDINAL', 47, 49), ('fed', 'ORG', 95, 98), ('26%', 'PERCENT', 147, 150), ('16 mg/', 'QUANTITY', 214, 220), ('two', 'CARDINAL', 227, 230), ('15', 'CARDINAL', 231, 233), ('205', 'CARDINAL', 268, 271), ('first', 'ORDINAL', 308, 313), ('second', 'ORDINAL', 321, 327), ('15', 'CARDINAL', 328, 330), ('Plasma', 'PERSON', 373, 379), ('HPLC', 'ORG', 484, 488), ('Three', 'CARDINAL', 490, 495), ('zeaxanthin', 'PERSON', 528, 538), ('cryptoxanthin', 'PERSON', 544, 557), ('three', 'CARDINAL', 560, 565), ('13-cis', 'QUANTITY', 643, 649), ('four', 'CARDINAL', 670, 674), ('15', 'CARDINAL', 706, 708), ('13', 'CARDINAL', 715, 717), ('9', 'CARDINAL', 724, 725), ('C30', 'PRODUCT', 782, 785), ('9-cis', 'QUANTITY', 854, 859), ('lambda(max', 'PERSON', 911, 921), ('400', 'CARDINAL', 925, 928), ('plasma lutein', 'PERSON', 956, 969), ('cryptoxanthin', 'PERSON', 971, 984), ('13-cis', 'QUANTITY', 1002, 1008), ('days 6-16', 'DATE', 1125, 1134), ('5 d', 'DATE', 1216, 

#### Visualização NER e PARSING:

##### Utilizando "render" para visualizar o NER:

In [17]:
# Render the document with displaCy's entity style inside the notebook
displacy.render(doc, style='ent', jupyter=True)

#### Parsing

##### Utilizando o Parsing para melhor visualizar as relações entre as palavras:

In [18]:
# Render the document with displaCy's dependency style inside the notebook
displacy.render(doc, style='dep', jupyter=True)

In [19]:
# Text view of the relations: (token text, dependency label, head token text)
[(token.text, token.dep_, token.head.text) for token in doc]

[('Plasma', 'compound', 'responses'),
 ('carotenoid', 'compound', 'responses'),
 ('responses', 'nsubjpass', 'determined'),
 ('were', 'auxpass', 'determined'),
 ('determined', 'ROOT', 'determined'),
 ('in', 'prep', 'determined'),
 ('36', 'nummod', 'men'),
 ('healthy', 'amod', 'men'),
 ('men', 'pobj', 'in'),
 ('and', 'cc', 'men'),
 ('women', 'conj', 'men'),
 ('before', 'advmod', 'determined'),
 ('and', 'cc', 'before'),
 ('after', 'conj', 'before'),
 ('being', 'auxpass', 'fed'),
 ('fed', 'pcomp', 'after'),
 ('controlled', 'amod', 'diets'),
 ('diets', 'dobj', 'fed'),
 ('with', 'prep', 'fed'),
 ('a', 'det', 'amount'),
 ('moderate', 'amod', 'amount'),
 ('amount', 'pobj', 'with'),
 ('of', 'prep', 'amount'),
 ('fat', 'pobj', 'of'),
 ('(', 'punct', '%'),
 ('26', 'nummod', '%'),
 ('%', 'appos', 'amount'),
 ('of', 'prep', '%'),
 ('total', 'amod', 'energy'),
 ('energy', 'pobj', 'of'),
 (')', 'punct', '%'),
 ('and', 'cc', '%'),
 ('a', 'det', 'content'),
 ('high', 'amod', 'content'),
 ('carotenoid',