# Sample data prep

# New Section

In [0]:
# For demonstration purposes, we take the abstract of a PubMed article
# (PMID: 30443819) and apply text processing to it.

document = "The curation of neuroscience entities is crucial to ongoing efforts in neuroinformatics and computational neuroscience, such as those being deployed in the context of continuing large-scale brain modelling projects. However, manually sifting through thousands of articles for new information about modelled entities is a painstaking and low-reward task. Text mining can be used to help a curator extract relevant information from this literature in a systematic way. We propose the application of text mining methods for the neuroscience literature. Specifically, two computational neuroscientists annotated a corpus of entities pertinent to neuroscience using active learning techniques to enable swift, targeted annotation. We then trained machine learning models to recognise the entities that have been identified. The entities covered are Neuron Types, Brain Regions, Experimental Values, Units, Ion Currents, Channels, and Conductances and Model organisms. We tested a traditional rule-based approach, a conditional random field and a model using deep learning named entity recognition, finding that the deep learning model was superior. Our final results show that we can detect a range of named entities of interest to the neuroscientist with a macro average precision, recall and F1 score of 0.866, 0.817 and 0.837 respectively. The contributions of this work are as follows: 1) We provide a set of Named Entity Recognition (NER) tools that are capable of detecting neuroscience entities with performance above or similar to prior work. 2) We propose a methodology for training NER tools for neuroscience that requires very little training data to get strong performance. This can be adapted for any sub-domain within neuroscience. 3) We provide a small corpus with annotations for multiple entity types, as well as annotation guidelines to help others reproduce our experiments."

In [0]:
# Size of the sample abstract, measured in characters.
print(len(document))

# Download packages

In [0]:
import nltk

# Data packages fetched for this notebook: tokenizer models, the stopword
# list and WordNet.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize / sent_tokenize in place of the pickled 'punkt' models.
# Requesting both keeps the notebook runnable on older and newer NLTK
# versions; a package unknown to the installed version is only reported by
# nltk.download, it does not raise.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(resource)

# Case folding

In [0]:
# Case folding: convert the whole abstract to lowercase and show the result.
document_lower = document.lower()
print(document_lower)

In [0]:
# Case folding in the other direction: convert the whole abstract to uppercase.
document_upper = document.upper()
print(document_upper)

Note that you can also apply lowercasing/uppercasing at the sentence or word level.

# Word tokenization

In [0]:
# Naive tokenization: str.split() with no argument cuts the lowercased text
# on runs of whitespace, so punctuation stays attached to the words.
words_split = document_lower.split()
for token in words_split:
  print(token)

In [0]:
from nltk import word_tokenize

# Tokenize the lowercased abstract with NLTK and print one token per line,
# for comparison with the plain whitespace split.
words_nltk = word_tokenize(document_lower)
for token in words_nltk:
  print(token)

# Sentence segmentation

In [0]:
# Naive sentence segmentation: cut the lowercased text at every '.'.
# This also cuts inside decimal numbers such as "0.866" in this abstract.
sentences_split = document_lower.split('.')
for segment in sentences_split:
  print(segment)

In [0]:
from nltk import sent_tokenize

# Sentence segmentation with NLTK, for comparison with the split on '.'.
sentences_nltk = sent_tokenize(document_lower)
for segment in sentences_nltk:
  print(segment)

From here on, we work through a simple case study: analyzing the word frequencies of the document.

# Word frequency analysis

In [0]:
from collections import Counter
from operator import itemgetter

# Compute word frequency on the raw document (no normalisation applied).
words_nltk = word_tokenize(document)

# Counter replaces the hand-written "if key not in dict" tally. Converting
# back to a plain dict keeps word_freq the same type as before, with keys in
# first-seen order.
word_freq = dict(Counter(words_nltk))

# Most frequent first; sorted() is stable, so tokens with equal counts stay
# in first-seen order.
freqs = sorted(word_freq.items(), key=itemgetter(1), reverse=True)

for freq in freqs:
  print(freq)

# len(word_freq) is the number of distinct tokens, not the total token count,
# so the label says so.
print('number of unique tokens', len(word_freq))

In [0]:
from collections import Counter
from operator import itemgetter

# Compute word frequency on the document after case folding.
words_nltk = word_tokenize(document_lower)

# Counter replaces the hand-written "if key not in dict" tally. Converting
# back to a plain dict keeps word_freq the same type as before, with keys in
# first-seen order.
word_freq = dict(Counter(words_nltk))

# Most frequent first; sorted() is stable, so tokens with equal counts stay
# in first-seen order.
freqs = sorted(word_freq.items(), key=itemgetter(1), reverse=True)

for freq in freqs:
  print(freq)

# len(word_freq) is the number of distinct tokens, not the total token count,
# so the label says so.
print('number of unique tokens', len(word_freq))

# Removing punctuation

In [0]:
from string import punctuation

# string.punctuation is a single str containing the ASCII punctuation characters.
print(punctuation)

In [0]:
from collections import Counter
from operator import itemgetter

# Compute word frequency after case folding and punctuation removal.
words_nltk = word_tokenize(document_lower)

# `punctuation` is one string, so `word not in punctuation` was a substring
# test: it let through multi-character punctuation tokens such as '...' or
# '--', which are not contiguous substrings of that string. Checking every
# character drops any token made up solely of punctuation characters.
word_freq = dict(Counter(
    word for word in words_nltk
    if not all(char in punctuation for char in word)
))

# Most frequent first; sorted() is stable, so tokens with equal counts stay
# in first-seen order.
freqs = sorted(word_freq.items(), key=itemgetter(1), reverse=True)

for freq in freqs:
  print(freq)

# len(word_freq) is the number of distinct tokens, not the total token count,
# so the label says so.
print('number of unique tokens', len(word_freq))

# Removing stopwords

In [0]:
from nltk.corpus import stopwords
# Keep NLTK's English stopwords in a set so later `word not in stopword_list`
# checks are hash lookups.
stopword_list = set(stopwords.words('english'))
print('number of stopword list', len(stopword_list))

In [0]:
from collections import Counter
from operator import itemgetter

# Compute word frequency after case folding, punctuation removal and
# stopword removal.
words_nltk = word_tokenize(document_lower)

# `punctuation` is one string, so `word not in punctuation` was a substring
# test: it let through multi-character punctuation tokens such as '...' or
# '--', which are not contiguous substrings of that string. Checking every
# character drops any token made up solely of punctuation characters.
word_freq = dict(Counter(
    word for word in words_nltk
    if not all(char in punctuation for char in word)
    and word not in stopword_list
))

# Most frequent first; sorted() is stable, so tokens with equal counts stay
# in first-seen order.
freqs = sorted(word_freq.items(), key=itemgetter(1), reverse=True)

for freq in freqs:
  print(freq)

# len(word_freq) is the number of distinct tokens, not the total token count,
# so the label says so.
print('number of unique tokens', len(word_freq))

# Stemming and lemmatization

In [0]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Three related word forms, used to show what the stemmer returns for each.
example_words = ['demonstrate', 'demonstration', 'demonstrating']

for original in example_words:
  stemmed = stemmer.stem(original)
  print(original, 'is stemmed to', stemmed)

In [0]:
from collections import Counter
from operator import itemgetter

# Compute word frequency after case folding, punctuation removal, stopword
# removal and stemming.
words_nltk = word_tokenize(document_lower)

# Tokens are filtered first and only the survivors are stemmed, so the
# stopword check runs on the unstemmed token (as before).
# `punctuation` is one string, so `word not in punctuation` was a substring
# test: it let through multi-character punctuation tokens such as '...' or
# '--', which are not contiguous substrings of that string. Checking every
# character drops any token made up solely of punctuation characters.
word_freq = dict(Counter(
    stemmer.stem(word) for word in words_nltk
    if not all(char in punctuation for char in word)
    and word not in stopword_list
))

# Most frequent first; sorted() is stable, so stems with equal counts stay
# in first-seen order.
freqs = sorted(word_freq.items(), key=itemgetter(1), reverse=True)

for freq in freqs:
  print(freq)

# len(word_freq) is the number of distinct stems, not the total token count,
# so the label says so.
print('number of unique tokens', len(word_freq))