# Práticas Computacionais Avançadas - NLP

## Extração de dados para análise do uso de compostos químicos na síntese de perovskitas

In [1]:
# ---------------------------------------------------------------------------- #
#                                     SETUP                                    #
# ---------------------------------------------------------------------------- #

# -------------------------------- Importações ------------------------------- #
import nltk
import pandas as pd

# ---------------------------- Load collected data ---------------------------- #
data = pd.read_excel("prvsk_data/1_to_1000.xls")
# TODO: add the rest of the data files later.

# Empty spreadsheet cells are read by pandas as NaN (a float), which the NLTK
# tokenizers reject with a TypeError. Rows without an abstract are therefore
# dropped, and every remaining entry is guaranteed to be a plain string.
corpora = data["Abstract"].dropna().astype(str).tolist()



In [2]:
# ---------------------------------------------------------------------------- #
#                           PRÉ-PROCESSAMENTO (NLTK)                           #
# ---------------------------------------------------------------------------- #

# ------------------ Tokenizing the collected corpora (NLTK) ----------------- #
# Flatten all abstracts into a single list of word-level tokens.
tokens = []

for corpus in corpora:
    doc = nltk.tokenize.word_tokenize(corpus)
    tokens.extend(doc)

# ---------------- Removing stop words and punctuation (NLTK) ---------------- #
stop_words = set(nltk.corpus.stopwords.words('english'))
# Printed sorted: iteration order of a set of strings is arbitrary, so the raw
# set would be displayed in a different order from one kernel run to the next.
print(f'Stop words: {sorted(stop_words)}')
print()

# Keep a token only if it is not a stop word and is purely alphanumeric.
# NOTE(review): the membership test is case-sensitive and the stop-word list is
# lowercase, so capitalised words such as "However" or "The" are kept.
# Lowercasing before the comparison would also discard tokens like "In", "As"
# or "I", which may be element symbols in this corpus - decide deliberately.
# NOTE(review): str.isalnum() also rejects hyphenated tokens such as
# "all-inorganic", not only punctuation.
tokens = [token for token in tokens if token not in stop_words and token.isalnum()]

# Show the total and a bounded preview instead of dumping every token of the
# whole corpus into the notebook output.
PREVIEW_SIZE = 50
print(f'{len(tokens)} tokens kept; first {PREVIEW_SIZE}: {tokens[:PREVIEW_SIZE]}')

Stop words: {'above', 'myself', 'that', 'just', 'an', 'he', 'over', 'hers', 'her', 'out', 'about', 'couldn', 'or', 'to', 'been', 're', 'what', 'will', "you're", 'are', 'nor', "won't", 'so', 'should', "it's", "hadn't", 'not', "didn't", 'be', 'then', 'some', 'have', 'on', 'after', 'up', 'because', 'the', 'himself', 'll', 'which', 'it', 'ourselves', 'they', 've', 'y', 'theirs', 'few', "aren't", 'too', 'through', "weren't", 'had', 'where', 'if', 'ain', 'd', 'those', 'am', 'isn', 'i', 'during', 'is', 'me', 'no', "doesn't", 'than', "you'd", 'same', 'them', 'yourself', 'of', 'once', 'both', 'doing', 'needn', "that'll", 'again', 'o', 'and', 'when', 'whom', 'being', "couldn't", 'for', 'itself', 'with', 'any', 'has', 'do', 'won', 'its', 'this', 'between', 'further', "you've", 'in', "wasn't", 'very', 'such', 'each', 'until', 'hadn', 'shan', "shan't", 'herself', 'ours', "don't", "should've", 'how', 'into', 'him', 'now', 'aren', "mustn't", 'off', 'under', 'only', "shouldn't", 'your', 'why', 'we', '

In [3]:
# ------------------- EXTRA: Part-of-speech tagging (NLTK) ------------------- #
# Take the first abstract, split it into sentences and tokenize the third one.
extra = corpora[0]
doc = nltk.sent_tokenize(extra)
sent = nltk.word_tokenize(doc[2])

# Tag every token of that sentence and print one (token, tag) pair per line.
pos_tags = nltk.pos_tag(sent)
for tagged_token in pos_tags:
    print(tagged_token)

('However', 'RB')
(',', ',')
('there', 'EX')
('are', 'VBP')
('still', 'RB')
('many', 'JJ')
('defects', 'NNS')
('in', 'IN')
('the', 'DT')
('all-inorganic', 'JJ')
('perovskite', 'NN')
('thin', 'JJ')
('films', 'NNS')
(',', ',')
('and', 'CC')
('it', 'PRP')
('is', 'VBZ')
('difficult', 'JJ')
('to', 'TO')
('obtain', 'VB')
('high', 'JJ')
('power', 'NN')
('conversion', 'NN')
('efficiency', 'NN')
('(', '(')
('PCE', 'NNP')
(')', ')')
('.', '.')


In [5]:
# --------------- EXTRA: NER - Named-entity recognition (NLTK) --------------- #
# Chunk the POS-tagged sentence. Items of the result that are nltk.Tree
# instances are the recognised entities; everything else is skipped below.
tree = nltk.ne_chunk(pos_tags)

# ------------------------ + Print the entities found ------------------------ #
# Each entity is printed as an (entity text, entity label) tuple, where the
# text is the entity's words joined by single spaces.
entity_chunks = (node for node in tree if isinstance(node, nltk.Tree))
for chunk in entity_chunks:
    words = [word for word, _tag in chunk.leaves()]
    print((' '.join(words), chunk.label()))

('PCE', 'ORGANIZATION')
