# Práticas Computacionais Avançadas - NLP

## Extração de dados para análise do uso de compostos químicos na síntese de perovskitas

In [1]:
# ---------------------------------------------------------------------------- #
#                                     SETUP                                    #
# ---------------------------------------------------------------------------- #

# -------------------------------- Importações ------------------------------- #
import spacy
import pandas as pd

# -------------------------- Loading collected data -------------------------- #
data = pd.read_excel("prvsk_data/1_to_1000.xls")
# TODO: load the remaining data files as well; only
# "prvsk_data/1_to_1000.xls" is read for now.
# Rows without an abstract come back from pandas as NaN (a float), which the
# spaCy pipeline cannot process, so drop them and make sure every remaining
# entry is a plain `str` before building the list of texts.
corpora = data["Abstract"].dropna().astype(str).tolist()

# ------------------- Loading a statistical model (spaCy) -------------------- #
nlp = spacy.load('en_core_web_sm')

In [2]:
# ---------------------------------------------------------------------------- #
#                            PRE-PROCESSING (spaCy)                            #
# ---------------------------------------------------------------------------- #

# ----------------- Tokenizing the collected corpora (spaCy) ----------------- #
# `nlp.pipe` feeds the texts through the pipeline in batches, which is the
# recommended way to process many texts; it yields one Doc per input text, in
# order. The unfiltered tokens get their own name so that `tokens` is bound
# exactly once (to the filtered list) and the cell stays easy to re-run.
all_tokens = []

for doc in nlp.pipe(corpora):
    # A Doc is directly iterable over its tokens.
    all_tokens.extend(doc)

# --------------- Removing stop words and punctuation (spaCy) ---------------- #
print(f'Stop words: {nlp.Defaults.stop_words}')
print()

# Keep only tokens that are neither stop words nor punctuation.
# NOTE(review): whitespace-only tokens (token.is_space) are not filtered here;
# confirm whether they should be removed as well.
tokens = [token for token in all_tokens if not (token.is_stop or token.is_punct)]

print(tokens[:20])

Stop words: {'with', 'also', 'should', 'i', 'namely', 'formerly', 'beforehand', 'against', 'using', 'hereby', 'whither', 'how', 'herein', 'about', 'see', 'show', 'eight', 'moreover', 'latter', 'through', 'whereupon', 'ca', 'well', 'done', 'yet', 'due', 'latterly', 'both', 'eleven', 'can', 'his', '‘m', 'of', 'same', 'might', 'mostly', 'everywhere', 'everything', 'could', 'say', 'often', 'whether', 'fifteen', 'has', 'although', 'toward', 'us', 'after', 'became', 'noone', 'nor', 'very', 'back', 'four', 'what', 'alone', 'several', 'else', 'always', 'just', 'an', 'quite', '‘ve', 'nobody', 'most', 'put', 'not', 'top', 'becoming', 'together', 'since', 'around', 'across', 'anyone', 'have', 'her', 'another', 'such', 'become', 'elsewhere', 'it', 'himself', 'whatever', 'themselves', 'your', 'many', 'before', 'may', 'each', 'various', 'being', 'cannot', '’ve', 'one', 'hundred', 'more', '‘s', 'then', 'twelve', '‘re', 'full', 'somehow', 'less', '‘d', 'some', 'serious', 'they', 'used', 'n’t', 'any', 

In [3]:
# ------------------ EXTRA: Part-of-speech tagging (spaCy) ------------------- #
# Process only the first abstract and use its fourth sentence (index 3) as the
# demo sentence for this cell.
extra = corpora[0]
doc = nlp(extra)
sentences = list(doc.sents)
sent = sentences[3]

# Pair each token's text with its part-of-speech tag (`token.pos_`).
pos_tags = []
for token in sent:
    pos_tags.append((token.text, token.pos_))

# Print one (text, tag) pair per line.
for tag in pos_tags:
    print(tag)

('This', 'DET')
('review', 'NOUN')
('systematically', 'ADV')
('summarizes', 'VERB')
('additive', 'ADJ')
('engineering', 'NOUN')
(',', 'PUNCT')
('solvent', 'ADJ')
('engineering', 'NOUN')
(',', 'PUNCT')
('and', 'CCONJ')
('interface', 'NOUN')
('engineering', 'NOUN')
('methods', 'NOUN')
('to', 'PART')
('promote', 'VERB')
('the', 'DET')
('thin', 'ADJ')
('film', 'NOUN')
('property', 'NOUN')
('for', 'ADP')
('a', 'DET')
('high', 'ADJ')
('PCE', 'NOUN')
('in', 'ADP')
('recent', 'ADJ')
('years', 'NOUN')
('.', 'PUNCT')


In [4]:
# -------------- EXTRA: NER - Named entity recognition (spaCy) --------------- #
# Print every entity found in the demo sentence as a (text, label) pair.
for entity in sent.ents:
    entity_info = (entity.text, entity.label_)
    print(entity_info)

# ------------------------ + Graphical visualization ------------------------- #
# Render the entity highlights for the same sentence inline in the notebook.
spacy.displacy.render(sent, style='ent', jupyter=True)

('PCE', 'ORG')
('recent years', 'DATE')


In [5]:
 # -------------------------- EXTRA: Parsing (spaCy) -------------------------- #
# Print one (token text, dependency label, head token text) triple per token
# of the demo sentence.
for token in sent:
    dep_triple = (token.text, token.dep_, token.head.text)
    print(dep_triple)

# ----------------------------- + Visualization ------------------------------ #
# Render the dependency view of the same sentence inline in the notebook.
spacy.displacy.render(sent, style='dep', jupyter=True)

('This', 'det', 'review')
('review', 'nsubj', 'summarizes')
('systematically', 'advmod', 'summarizes')
('summarizes', 'ROOT', 'summarizes')
('additive', 'amod', 'engineering')
('engineering', 'nmod', 'methods')
(',', 'punct', 'engineering')
('solvent', 'amod', 'engineering')
('engineering', 'conj', 'engineering')
(',', 'punct', 'engineering')
('and', 'cc', 'engineering')
('interface', 'compound', 'engineering')
('engineering', 'compound', 'methods')
('methods', 'dobj', 'summarizes')
('to', 'aux', 'promote')
('promote', 'xcomp', 'summarizes')
('the', 'det', 'property')
('thin', 'amod', 'property')
('film', 'compound', 'property')
('property', 'dobj', 'promote')
('for', 'prep', 'promote')
('a', 'det', 'PCE')
('high', 'amod', 'PCE')
('PCE', 'pobj', 'for')
('in', 'prep', 'promote')
('recent', 'amod', 'years')
('years', 'pobj', 'in')
('.', 'punct', 'summarizes')
