In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

### Sentence Tokenization

In [5]:
# Sample passage used by the tokenization and stopword cells.
text = "Barca won 2-0 against Chelsea at Camp Nou. Messi scored a brace. Valverde's Barca move on"

# Break the passage into a list of sentence strings.
sents = sent_tokenize(text)
print(sents)

['Barca won 2-0 against Chelsea at Camp Nou.', 'Messi scored a brace.', "Valverde's Barca move on"]


### Word Tokenization

In [6]:
# Tokenize each sentence on its own, giving one token list per sentence
# (a list of lists that mirrors the order of `sents`).
words = list(map(word_tokenize, sents))
print(words)

[['Barca', 'won', '2-0', 'against', 'Chelsea', 'at', 'Camp', 'Nou', '.'], ['Messi', 'scored', 'a', 'brace', '.'], ['Valverde', "'s", 'Barca', 'move', 'on']]


### Creating a Set of Custom Stopwords

In [11]:
from nltk.corpus import stopwords
from string import punctuation
# Start from NLTK's English stopword list and add every character of
# string.punctuation, so punctuation tokens are filtered like stopwords.
customStopWords = set(stopwords.words('english')).union(punctuation)
print(customStopWords)

set([u'all', u'just', u"don't", u'being', '-', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'don', '$', u'hadn', u'herself', u'll', u'had', ',', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u"should've", u"haven't", u'do', u'them', u'his', u'very', u"you've", u'they', u'not', u'during', u'now', u'him', u'nor', '`', u"wasn't", u'd', u'did', '=', u'didn', '^', u'this', u'she', u'each', u'further', u"won't", u'where', u"mustn't", u"isn't", u'few', u'because', u"you'd", u'doing', u'some', u'hasn', u"hasn't", u'are', u'our', u'ourselves', u'out', u'what', u'for', u"needn't", '+', u'below', '/', u're', u'does', u"shouldn't", u'above', u'between', u'mustn', '?', u't', u'be', u'we', u'who', u"mightn't", u"doesn't", u'were', u'here', u'shouldn', u'hers', '[', u"aren't", u'by', '_', u'on', u'about', u'couldn', u'of', u"wouldn't", '&', u'against', '|', u's', u'isn', '(', '{', u'or', u'own', '*', u'into', u'yourself', u'down', u"hadn't", u'mightn', u"coul

### Stopword Removal

In [20]:
# Keep only the tokens of `text` that are not in the custom stopword set.
# The membership test is an exact string match, so it is case-sensitive.
# This assignment must actually run: the print below and the bigram cell
# both read `wordsWithoutStopWords`, and a fresh kernel would otherwise
# raise NameError.
wordsWithoutStopWords = [word for word in word_tokenize(text) if word not in customStopWords]
print(wordsWithoutStopWords)

['Barca', '2-0', 'Chelsea', 'Camp', 'Nou', 'Messi', 'scored', 'brace', 'Valverde', "'s", 'Barca', 'move']


### Identifying Bigrams

In [13]:
# Import only the names this cell uses instead of `import *`, so it is
# clear where each name comes from and the namespace stays clean.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# NOTE(review): `bigram_measure` is not used by any cell visible here;
# it is kept in case a later cell scores bigrams with it.
bigram_measure = BigramAssocMeasures()

# Build the finder from the stopword-filtered tokens; adjacent tokens
# in that list form the candidate bigrams.
finder = BigramCollocationFinder.from_words(wordsWithoutStopWords)

# Display every (bigram, count) pair in sorted order.
sorted(finder.ngram_fd.items())

[(("'s", 'Barca'), 1),
 (('2-0', 'Chelsea'), 1),
 (('Barca', '2-0'), 1),
 (('Barca', 'move'), 1),
 (('Camp', 'Nou'), 1),
 (('Chelsea', 'Camp'), 1),
 (('Messi', 'scored'), 1),
 (('Nou', 'Messi'), 1),
 (('Valverde', "'s"), 1),
 (('brace', 'Valverde'), 1),
 (('scored', 'brace'), 1)]

### Perform Stemming

In [14]:
from nltk.stem.lancaster import LancasterStemmer

In [15]:
# Second sample sentence, containing several inflections of "close".
text2 = "Barca closing in on the closed move of Arthur melo who was close to Chelsea"

st = LancasterStemmer()

# Run every token of the sentence through the Lancaster stemmer.
stemmedWords = list(map(st.stem, word_tokenize(text2)))
print(stemmedWords)

['barc', 'clos', 'in', 'on', 'the', 'clos', 'mov', 'of', 'arth', 'melo', 'who', 'was', 'clos', 'to', 'chelse']


### Tagging Parts of Speech

In [18]:
# Tokenize the stemming example sentence, then display each token paired
# with its part-of-speech tag.
text2_tokens = word_tokenize(text2)
nltk.pos_tag(text2_tokens)

[('Barca', 'NNP'),
 ('closing', 'NN'),
 ('in', 'IN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('closed', 'JJ'),
 ('move', 'NN'),
 ('of', 'IN'),
 ('Arthur', 'NNP'),
 ('melo', 'NN'),
 ('who', 'WP'),
 ('was', 'VBD'),
 ('close', 'RB'),
 ('to', 'TO'),
 ('Chelsea', 'NNP')]

### Word Sense Disambiguation

In [21]:
from nltk.corpus import wordnet as wn

In [23]:
# List every WordNet synset for "bass" alongside its definition, to show
# how many distinct senses the word has.
bass_synsets = wn.synsets('bass')
for synset in bass_synsets:
    print(synset, synset.definition())

(Synset('bass.n.01'), u'the lowest part of the musical range')
(Synset('bass.n.02'), u'the lowest part in polyphonic music')
(Synset('bass.n.03'), u'an adult male singer with the lowest voice')
(Synset('sea_bass.n.01'), u'the lean flesh of a saltwater fish of the family Serranidae')
(Synset('freshwater_bass.n.01'), u'any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)')
(Synset('bass.n.06'), u'the lowest adult male singing voice')
(Synset('bass.n.07'), u'the member with the lowest range of a family of musical instruments')
(Synset('bass.n.08'), u'nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes')
(Synset('bass.s.01'), u'having or denoting a low vocal or instrumental range')


In [24]:
from nltk.wsd import lesk

In [28]:
# Musical context: pass the tokenized sentence to lesk as context and ask
# which sense of "bass" it selects.
music_context = word_tokenize('Sing in a lower tone, along with the bass')
sense1 = lesk(music_context, 'bass')
print(sense1, sense1.definition())

(Synset('bass.n.07'), u'the member with the lowest range of a family of musical instruments')


In [33]:
# Fishing context: same lookup as the musical example, but the
# surrounding words differ, so lesk may select a different sense.
fishing_context = word_tokenize('This sea water bass was very difficult to catch')
sense2 = lesk(fishing_context, 'bass')
print(sense2, sense2.definition())

(Synset('sea_bass.n.01'), u'the lean flesh of a saltwater fish of the family Serranidae')
