# Práticas Computacionais Avançadas - NLP

## Extração de dados para análise do uso de compostos químicos na síntese de perovskitas

In [3]:
# ---------------------------------------------------------------------------- #
#                                     SETUP                                    #
# ---------------------------------------------------------------------------- #

# -------------------------------- Importações ------------------------------- #
import nltk
import pandas as pd

# --------------------------- Load collected data ---------------------------- #
data = pd.read_excel("prvsk_data/1_to_1000.xls")
# TODO: load the remaining data files later; only 1_to_1000.xls is read for now.

# One entry per record, taken from the "Abstract" column. A record with no
# abstract is read by pandas as NaN (a float), which the NLTK tokenizers cannot
# process, so missing values are dropped and every entry is forced to str.
corpora = data["Abstract"].dropna().astype(str).tolist()

In [4]:
# ---------------------------------------------------------------------------- #
#                           PRÉ-PROCESSAMENTO (NLTK)                           #
# ---------------------------------------------------------------------------- #

# ----------------- Tokenize the collected abstracts (NLTK) ------------------ #
# All abstracts are pooled into one flat list of tokens.
tokens = []

for corpus in corpora:
    doc = nltk.tokenize.word_tokenize(corpus)
    tokens.extend(doc)

# ----------------- Remove stop words and punctuation (NLTK) ----------------- #
stop_words = set(nltk.corpus.stopwords.words('english'))
# Printed in sorted order so the cell output is identical on every run
# (iteration order of a set of strings changes between kernel sessions).
print(f'Stop words: {sorted(stop_words)}')
print()

# The stop-word entries are lowercase, so tokens are compared in lowercase;
# otherwise capitalized stop words ("The", "However", ...) would be kept.
# The surviving tokens keep their original casing.
# NOTE(review): isalnum() removes punctuation but also any token containing a
# hyphen or dot (e.g. 'all-inorganic') - confirm this is intended.
tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words and token.isalnum()
]

print(tokens[:20])

Stop words: {'and', 'haven', 'their', 'a', "wouldn't", 'most', 'herself', 'them', 'shan', 'been', "shan't", 'through', 'yourself', 'ain', 'theirs', 'had', 'weren', 'which', "should've", 'ours', 'should', 're', 've', 'be', 't', 'at', 'then', 'but', 'on', 'or', 'any', 'just', 'hers', 'they', 'didn', 'will', 'wasn', 'having', 'because', 'doesn', 'd', 'of', 'while', 'the', 'if', 'between', "mustn't", 'he', 'by', 'against', 'from', 'until', 'when', 'such', 'o', 'these', 'hasn', 'after', 'further', "mightn't", 'we', 'more', 'wouldn', 'his', 'ourselves', 'themselves', "haven't", 'during', 'isn', 'how', 'up', "you've", 'it', "wasn't", 'no', 'm', "didn't", 'couldn', 'under', 'did', "hasn't", 'needn', 'its', 'she', 'all', 'to', 'above', 'can', 'other', "she's", 'nor', "isn't", 'once', 'as', 'those', 'does', 'are', "hadn't", 'below', 'with', 'shouldn', 'him', 'our', 'her', 'has', 'who', 'so', "aren't", 'doing', 'only', 'now', 'won', 'you', 'own', 'myself', 'what', 'for', 'this', 'my', 'that', 'of

In [5]:
# ------------------- EXTRA: Part-of-speech tagging (NLTK) ------------------- #
# Work on a single example: the third sentence of the first abstract.
extra = corpora[0]
doc = nltk.sent_tokenize(extra)
sent = nltk.word_tokenize(doc[2])

# Tag each token and show one (token, tag) pair per line.
pos_tags = nltk.pos_tag(sent)
for tag in pos_tags:
    print(tag)

('However', 'RB')
(',', ',')
('there', 'EX')
('are', 'VBP')
('still', 'RB')
('many', 'JJ')
('defects', 'NNS')
('in', 'IN')
('the', 'DT')
('all-inorganic', 'JJ')
('perovskite', 'NN')
('thin', 'JJ')
('films', 'NNS')
(',', ',')
('and', 'CC')
('it', 'PRP')
('is', 'VBZ')
('difficult', 'JJ')
('to', 'TO')
('obtain', 'VB')
('high', 'JJ')
('power', 'NN')
('conversion', 'NN')
('efficiency', 'NN')
('(', '(')
('PCE', 'NNP')
(')', ')')
('.', '.')


In [6]:
# --------------- EXTRA: NER - named-entity recognition (NLTK) --------------- #
# Chunk the POS-tagged sentence; recognized entities come back as subtrees.
tree = nltk.ne_chunk(pos_tags)

# ---------------------- Print the recognized entities ----------------------- #
for subtree in tree:
    # Elements that are not subtrees are plain tagged tokens, i.e. not entities.
    if not isinstance(subtree, nltk.Tree):
        continue
    entity = ' '.join(word for word, _tag in subtree.leaves())
    label = subtree.label()
    print((entity, label))

('PCE', 'ORGANIZATION')
