# SUMMARY

We study natural language processing in this module using nltk and spacy libraries. We first load and read the file "War and peace.txt", a book by Leo Tolstoy. We break the file into words and punctuations (tokenize) and then extract a small range of data from the file as a list, after removing punctuations. We carry out stemming using lancaster and porter stemmers, and POS tagging. As a final step, detailed POS tagging is carried out using the spacy library.

# 1. Load and read the file

In [1]:
import nltk, re, pprint

In [2]:
f = open('War_And_Peace.txt','rU')

  """Entry point for launching an IPython kernel.


In [3]:
raw = f.read()

In [4]:
type(raw)

str

In [5]:
len(raw)

3227580

In [6]:
(raw[5:100])

' Project Gutenberg EBook of War and Peace, by Leo Tolstoy\n\nThis eBook is for the use of anyone '

# Break into words and punctuations (Tokenizing)

In [7]:
tokens = nltk.word_tokenize(raw)

In [8]:
type(tokens)

list

In [9]:
len(tokens)

674585

In [10]:
tokens

['\ufeff',
 'The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'War',
 'and',
 'Peace',
 ',',
 'by',
 'Leo',
 'Tolstoy',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.',
 'You',
 'may',
 'copy',
 'it',
 ',',
 'give',
 'it',
 'away',
 'or',
 're-use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'Project',
 'Gutenberg',
 'License',
 'included',
 'with',
 'this',
 'eBook',
 'or',
 'online',
 'at',
 'www.gutenberg.org',
 'Title',
 ':',
 'War',
 'and',
 'Peace',
 'Author',
 ':',
 'Leo',
 'Tolstoy',
 'Translators',
 ':',
 'Louise',
 'and',
 'Aylmer',
 'Maude',
 'Posting',
 'Date',
 ':',
 'January',
 '10',
 ',',
 '2009',
 '[',
 'EBook',
 '#',
 '2600',
 ']',
 'Last',
 'Updated',
 ':',
 'December',
 '17',
 ',',
 '2016',
 'Language',
 ':',
 'English',
 'Character',
 'set',
 'encoding',
 ':',
 'UTF-8',
 '***',
 'START',
 'OF',
 'THIS',
 'PROJECT',
 'GUTENBERG',
 '

# 2. Remove punctuations and Extract a small range of data

In [11]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
to = tokenizer.tokenize(raw)
list2 = to[50:100]

In [12]:
list2

['this',
 'eBook',
 'or',
 'online',
 'at',
 'www',
 'gutenberg',
 'org',
 'Title',
 'War',
 'and',
 'Peace',
 'Author',
 'Leo',
 'Tolstoy',
 'Translators',
 'Louise',
 'and',
 'Aylmer',
 'Maude',
 'Posting',
 'Date',
 'January',
 '10',
 '2009',
 'EBook',
 '2600',
 'Last',
 'Updated',
 'December',
 '17',
 '2016',
 'Language',
 'English',
 'Character',
 'set',
 'encoding',
 'UTF',
 '8',
 'START',
 'OF',
 'THIS',
 'PROJECT',
 'GUTENBERG',
 'EBOOK',
 'WAR',
 'AND',
 'PEACE',
 'An',
 'Anonymous']

# 3.Stemming and POS Tagging

In [13]:
porter = nltk.PorterStemmer()

In [14]:
lancaster = nltk.LancasterStemmer()

In [15]:
# PORTER STEMMER

In [16]:
for t in list2: print(porter.stem(t))

thi
ebook
or
onlin
at
www
gutenberg
org
titl
war
and
peac
author
leo
tolstoy
translat
louis
and
aylmer
maud
post
date
januari
10
2009
ebook
2600
last
updat
decemb
17
2016
languag
english
charact
set
encod
utf
8
start
OF
thi
project
gutenberg
ebook
war
and
peac
An
anonym


In [17]:
# LANCASTER STEMMER

In [18]:
for t in list2: print(lancaster.stem(t))

thi
ebook
or
onlin
at
www
gutenberg
org
titl
war
and
peac
auth
leo
tolstoy
transl
lou
and
aylm
maud
post
dat
janu
10
2009
ebook
2600
last
upd
decemb
17
2016
langu
engl
charact
set
encod
utf
8
start
of
thi
project
gutenberg
ebook
war
and
peac
an
anonym


In [19]:
# POS TAGGING

In [20]:
nltk.pos_tag(list2)

[('this', 'DT'),
 ('eBook', 'NN'),
 ('or', 'CC'),
 ('online', 'NN'),
 ('at', 'IN'),
 ('www', 'JJ'),
 ('gutenberg', 'NN'),
 ('org', 'JJ'),
 ('Title', 'NNP'),
 ('War', 'NNP'),
 ('and', 'CC'),
 ('Peace', 'NNP'),
 ('Author', 'NNP'),
 ('Leo', 'NNP'),
 ('Tolstoy', 'NNP'),
 ('Translators', 'NNP'),
 ('Louise', 'NNP'),
 ('and', 'CC'),
 ('Aylmer', 'NNP'),
 ('Maude', 'NNP'),
 ('Posting', 'NNP'),
 ('Date', 'NNP'),
 ('January', 'NNP'),
 ('10', 'CD'),
 ('2009', 'CD'),
 ('EBook', 'NN'),
 ('2600', 'CD'),
 ('Last', 'JJ'),
 ('Updated', 'VBD'),
 ('December', 'NNP'),
 ('17', 'CD'),
 ('2016', 'CD'),
 ('Language', 'NNP'),
 ('English', 'JJ'),
 ('Character', 'NNP'),
 ('set', 'VBD'),
 ('encoding', 'VBG'),
 ('UTF', 'NNP'),
 ('8', 'CD'),
 ('START', 'NNP'),
 ('OF', 'IN'),
 ('THIS', 'NNP'),
 ('PROJECT', 'NNP'),
 ('GUTENBERG', 'NNP'),
 ('EBOOK', 'NNP'),
 ('WAR', 'NNP'),
 ('AND', 'NNP'),
 ('PEACE', 'NNP'),
 ('An', 'DT'),
 ('Anonymous', 'JJ')]

In [21]:
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [22]:
nltk.help.upenn_tagset('VBD')

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...


In [23]:
nltk.help.upenn_tagset('IN')

IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...


# 4. Detailed POS Tagging

In [24]:
import spacy



In [25]:
values = list2

with open("file.txt", "w") as output:
    output.write(str(values))

In [26]:
nlp = spacy.load('en_core_web_sm')
g = open('file.txt','rU')
meal = g.read()
doc = nlp(meal)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_)

[ [ PUNCT -LRB-
' ' PUNCT ``
this this DET DT
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
eBook ebook VERB VB
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
or or CCONJ CC
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
online online ADV RB
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
at at ADP IN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
www www NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
gutenberg gutenberg X FW
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
org org NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
Title title NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
War war NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
and and CCONJ CC
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
Peace peace NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
Author author NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
Leo leo PROPN NNP
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ''
Tolstoy tolstoy NOUN NN
' ' PUNCT ''
, , PUNCT ,
' ' PUNCT ``
Translators translator NOUN NNS
' ' PART POS
, , PUNCT ,
' ' PUNCT ''
Louise louise PROPN NNP
' ' PUNCT ''
, , PUNCT ,
' 

  
