#### 1. Part of Speech Tagging

**a. Extracting Part of Speech Tags**

Pick an article in the Wikipedia featured articles corpus. Extract the part of speech tags for the first five sentences in the article using NLTK’s pos_tag function. You may need to download the POS tagger data using the following code.

In [29]:
import nltk
#'averaged_perception_tagger' module from nltk is installed from python console
#nltk.download('averaged_perception_tagger')

import pandas as pd
import json
import string
import re
import numpy as np

from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.util import ngrams
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter

# Create stop words
stop_words = stopwords.words('english')

In [2]:
def readfiles(dirdata, infile):
    """Read the json file(s) as data frame"""
    data = []

    full_filename = "%s/%s" % (dirdata, infile)
        
    with open(full_filename,'r') as fi:
        for line in fi:
            data.append(json.loads(line))

    # Create data frame from the json data
    outdf = pd.DataFrame(data)
    
    return outdf

In [3]:
def cleanString(incomingString):
    """Removes custom characters from a string"""
    newstring = incomingString
    newstring = newstring.replace("!","")
    newstring = newstring.replace("@","")
    newstring = newstring.replace("#","")
    newstring = newstring.replace("$","")
    newstring = newstring.replace("%","")
    newstring = newstring.replace("^","")
    newstring = newstring.replace("&","and")
    newstring = newstring.replace("*","")
    newstring = newstring.replace("(","")
    newstring = newstring.replace(")","")
    newstring = newstring.replace("+","")
    newstring = newstring.replace("=","")
    newstring = newstring.replace("?","")
    newstring = newstring.replace("\'","")
    newstring = newstring.replace("\"","")
    newstring = newstring.replace("{","")
    newstring = newstring.replace("}","")
    newstring = newstring.replace("[","")
    newstring = newstring.replace("]","")
    newstring = newstring.replace("<","")
    newstring = newstring.replace(">","")
    newstring = newstring.replace("~","")
    newstring = newstring.replace("`","")
    newstring = newstring.replace(":","")
    newstring = newstring.replace(";","")
    newstring = newstring.replace("|","")
    newstring = newstring.replace("\\","")
    newstring = newstring.replace("/","") 
    
    return newstring

In [4]:
# Define files and location
dirdata = 'data/wikipedia/featured-articles'

# Using 1 file for this excercise, but have provision fror loading multiple
files = ['featured-articles_010.jsonl']

# Read files, clean and store as data frame
article = readfiles(dirdata, files[0])

# Check data
article.head()

Unnamed: 0,interlinks,section_texts,section_titles,title
0,"{'Christchurch': 'Christchurch', 'Canterbury, ...",[\n\n\nBenjamin Mountfort around 1875\n\n'''Be...,"[Introduction, Early life, New Zealand, Return...",Benjamin Mountfort
1,"{'abstract art': 'abstract', 'the Blitz': 'the...",[\n\n\n\n\n'''Henry Spencer Moore''' (30 July...,"[Introduction, Life, Style, Legacy, Collection...",Henry Moore
2,"{'shrub': 'shrub', 'Banksia': 'Banksia', 'Prot...",[\n\n\nThe '''hairpin banksia''' (''Banksia sp...,"[Introduction, Description, Taxonomy, Distribu...",Banksia spinulosa
3,"{'shrub': 'shrub', 'Proteaceae': 'Proteaceae',...","[\n\n\n\n\n\n'''''Banksia aemula''''', commonl...","[Introduction, Description, Taxonomy, Distribu...",Banksia aemula
4,"{'Noongar': 'Noongar', 'species': 'species', '...","[\n\n\n\n'''''Banksia attenuata''''', commonly...","[Introduction, Description, Taxonomy, Distribu...",Banksia attenuata


In [5]:
# Select article text from the first article and covert to list
txt = article.loc[0,'section_texts']
txt1 = txt[0]
txt1

"\n\n\nBenjamin Mountfort around 1875\n\n'''Benjamin Woolfield Mountfort''' (13 March 1825 – 15 March 1898) was an English emigrant to New Zealand, where he became one of that country's most prominent 19th-century architects. He was instrumental in shaping the city of Christchurch's unique architectural identity and culture, and was appointed the first official Provincial Architect of the developing province of Canterbury. Heavily influenced by the Anglo-Catholic philosophy behind early Victorian architecture, he is credited with importing the Gothic revival style to New Zealand. His Gothic designs constructed in both wood and stone in the province are considered unique to New Zealand. Today, he is considered the founding architect of the province of Canterbury.\n"

In [6]:
# Check if the text has 5 or more articles or not
len(txt1.split('.'))

6

In [7]:
# Extract the first six sentences
sent = txt1.split('.')[0:5]
sent

["\n\n\nBenjamin Mountfort around 1875\n\n'''Benjamin Woolfield Mountfort''' (13 March 1825 – 15 March 1898) was an English emigrant to New Zealand, where he became one of that country's most prominent 19th-century architects",
 " He was instrumental in shaping the city of Christchurch's unique architectural identity and culture, and was appointed the first official Provincial Architect of the developing province of Canterbury",
 ' Heavily influenced by the Anglo-Catholic philosophy behind early Victorian architecture, he is credited with importing the Gothic revival style to New Zealand',
 ' His Gothic designs constructed in both wood and stone in the province are considered unique to New Zealand',
 ' Today, he is considered the founding architect of the province of Canterbury']

In [8]:
def pos(strs):
    """Tokenizing and finding parts of speech"""
    
    # Tokenize the input string
    tokenized = sent_tokenize(strs) 
    
    for i in tokenized: 
      
        # Word tokenizers is used to find the words  
        # and punctuation in a string 
        wordsList = nltk.word_tokenize(i) 

        # removing stop words from wordList 
        wordsList = [w for w in wordsList if not w in stop_words]  

        #  Using a Tagger. Which is part-of-speech  
        # tagger or POS-tagger.  
        tagged = nltk.pos_tag(wordsList) 
  
    # Print formatted outpout
    print(strs)
    print(' ')
    print(tagged) 
    print(' ')

In [9]:
# Remove punctuations and special chatracters from the sentence
lines = [cleanString(l.strip(',').rstrip()) for l in sent]

# Get parts of speech
for i in range(len(lines)):
    pos(lines[i])




Benjamin Mountfort around 1875

Benjamin Woolfield Mountfort 13 March 1825 – 15 March 1898 was an English emigrant to New Zealand, where he became one of that countrys most prominent 19th-century architects
 
[('Benjamin', 'NNP'), ('Mountfort', 'NNP'), ('around', 'IN'), ('1875', 'CD'), ('Benjamin', 'NNP'), ('Woolfield', 'NNP'), ('Mountfort', 'NNP'), ('13', 'CD'), ('March', 'NNP'), ('1825', 'CD'), ('–', 'NNP'), ('15', 'CD'), ('March', 'NNP'), ('1898', 'CD'), ('English', 'NNP'), ('emigrant', 'JJ'), ('New', 'NNP'), ('Zealand', 'NNP'), (',', ','), ('became', 'VBD'), ('one', 'CD'), ('countrys', 'NN'), ('prominent', 'JJ'), ('19th-century', 'JJ'), ('architects', 'NNS')]
 
 He was instrumental in shaping the city of Christchurchs unique architectural identity and culture, and was appointed the first official Provincial Architect of the developing province of Canterbury
 
[('He', 'PRP'), ('instrumental', 'JJ'), ('shaping', 'VBG'), ('city', 'NN'), ('Christchurchs', 'NNP'), ('unique', 'JJ'), (

**b. Selecting Nouns and Verbs**

Repeat the prior step, but only include the nouns and verbs. Below is a list of the noun and verb tags.

NN: noun, common, singular or mass<br>
NNP: noun, proper, singular<br>
NNPS: noun, proper, plural<br>
NNS: noun, common, plural<br>
VB: verb, base form<br>
VBD: verb, past tense<br>
VBG: verb, present participle or gerund<br>
VBN: verb, past participle<br>
VBP: verb, present tense, not 3rd person singular<br>
VBZ: verb, present tense, 3rd person singular

In [10]:
def posNV(strs, allowed_tags):
    """Tokenizing and finding parts of speech: for specific tags/POS"""    
    
    # Tokenize the input string
    tokenized = sent_tokenize(strs) 
    
    for i in tokenized:
        # Word tokenizers is used to find the words  
        # and punctuation in a string 
        wordsList = nltk.word_tokenize(i) 

        # removing stop words from wordList 
        wordsList = [w for w in wordsList if not w in stop_words]  

        #  Using a Tagger. Which is part-of-speech  
        # tagger or POS-tagger.  
        tagged = nltk.pos_tag(wordsList)         
        dt_tags = [t for t in tagged if t[1] in allowed_tags]
                
    # Print formatted outpout
    print(strs)
    print(' ')
    print(dt_tags)
    print(' ')

In [11]:
# Create a tuple of all the allowed tags (Noun and Verb)
allowed_tags = ('NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

# Get parts of speech filtered for Verb and Noun
for i in range(len(lines)):
    posNV(lines[i], allowed_tags)




Benjamin Mountfort around 1875

Benjamin Woolfield Mountfort 13 March 1825 – 15 March 1898 was an English emigrant to New Zealand, where he became one of that countrys most prominent 19th-century architects
 
[('Benjamin', 'NNP'), ('Mountfort', 'NNP'), ('Benjamin', 'NNP'), ('Woolfield', 'NNP'), ('Mountfort', 'NNP'), ('March', 'NNP'), ('–', 'NNP'), ('March', 'NNP'), ('English', 'NNP'), ('New', 'NNP'), ('Zealand', 'NNP'), ('became', 'VBD'), ('countrys', 'NN'), ('architects', 'NNS')]
 
 He was instrumental in shaping the city of Christchurchs unique architectural identity and culture, and was appointed the first official Provincial Architect of the developing province of Canterbury
 
[('shaping', 'VBG'), ('city', 'NN'), ('Christchurchs', 'NNP'), ('identity', 'NN'), ('culture', 'NN'), ('appointed', 'VBD'), ('Provincial', 'NNP'), ('Architect', 'NNP'), ('developing', 'VBG'), ('province', 'NN'), ('Canterbury', 'NN')]
 
 Heavily influenced by the Anglo-Catholic philosophy behind early Victo

#### 2. N-Grams

**a. N-Gram Frequencies**

Using the Wikipedia featured article corpus, find and report the 20 most common bigrams and trigrams. You should include the total number of times the bigram/trigram occurred in the corpus as well as the logarithm of the probability of occurring within the corpus. In mathematical terms, if N is the total number of bigrams or trigrams in the corpus and a particular item occurred m times, then the log frequency is log(m/N). It is common practice to report in logarithmic probabilities when dealing with low probability values.

In [16]:
# Create a combined text from the entire corpus, to calculate n grams
corpus = article['section_texts']
alltxt = ''

for i in range(len(corpus)):    
    for j in range(len(corpus[i])):
        alltxt += corpus[i][j]


In [17]:
def get_ngrams(text, n, X):
    """Generate N grams by taking text, x of N gram and return the desired top X desired items"""
    
    # Create counter
    ngram_counts = Counter(ngrams(text.split(), n))   
    
    # Convert to df
    df = pd.DataFrame.from_dict(ngram_counts, orient='index').reset_index().rename(columns={'index':'N Gram', 0:'Occurences'})
    
    # Add log frequency
    df['Log Frequency'] = np.log(df['Occurences']/len(df))
    
    return df.sort_values('Occurences', ascending = 0).head(X)

In [18]:
# Find bi-grams
get_ngrams(alltxt, 2, 20)

Unnamed: 0,N Gram,Occurences,Log Frequency
54,"(of, the)",10709,-3.932636
90,"(in, the)",6210,-4.47756
207,"(to, the)",3620,-5.017247
1103,"(and, the)",2467,-5.400718
1079,"(on, the)",2404,-5.426587
682,"(for, the)",2005,-5.608077
2338,"(*, *)",1856,-5.685297
876,"(from, the)",1826,-5.701593
62,"(by, the)",1711,-5.766643
1486,"(with, the)",1521,-5.884353


In [19]:
# Find tri-grams
get_ngrams(alltxt, 3, 20)

Unnamed: 0,N Gram,Occurences,Log Frequency
2720,"(*, *, *)",1619,-6.281258
226,"(one, of, the)",408,-7.659555
6814,"(as, well, as)",308,-7.940723
2514,"(part, of, the)",280,-8.036033
5270,"(the, end, of)",272,-8.06502
5271,"(end, of, the)",189,-8.429075
24597,"(the, United, States)",180,-8.477865
1185,"(such, as, the)",152,-8.646942
6899,"(in, the, United)",141,-8.722062
11477,"(known, as, the)",141,-8.722062


In [52]:
def get_ngrams_clean(text, n, X):
    cleantxt = preprocess(text)
    n_grams = Counter(ngrams(cleantxt.split(), n))
    #fdist = nltk.FreqDist(n_grams)
    
    return n_grams
 
get_ngrams(alltxt, 2, 2)

Counter({('Benjamin', 'Mountfort'): 5,
         ('Mountfort', 'around'): 1,
         ('around', '1875'): 1,
         ('1875', 'Benjamin'): 1,
         ('Benjamin', 'Woolfield'): 1,
         ('Woolfield', 'Mountfort'): 1,
         ('Mountfort', '13'): 1,
         ('13', 'March'): 3,
         ('March', '1825'): 1,
         ('1825', '–'): 2,
         ('–', '15'): 3,
         ('15', 'March'): 9,
         ('March', '1898'): 1,
         ('1898', 'English'): 1,
         ('English', 'emigrant'): 1,
         ('emigrant', 'New'): 1,
         ('New', 'Zealand'): 86,
         ('Zealand', 'became'): 1,
         ('became', 'one'): 20,
         ('one', 'countrys'): 3,
         ('countrys', 'prominent'): 1,
         ('prominent', '19thcentury'): 1,
         ('19thcentury', 'architects'): 2,
         ('architects', 'He'): 1,
         ('He', 'instrumental'): 1,
         ('instrumental', 'shaping'): 1,
         ('shaping', 'city'): 1,
         ('city', 'Christchurchs'): 1,
         ('Christchurchs', 'uni

In [47]:
def preprocess(text):
    textcln = ''
    textcln = text.translate(str.maketrans('', '', string.punctuation))
    textcln = ' '.join([word for word in textcln.split() if word not in stop_words])
    
    return textcln

In [51]:
alltxt.split()

['Benjamin',
 'Mountfort',
 'around',
 '1875',
 "'''Benjamin",
 'Woolfield',
 "Mountfort'''",
 '(13',
 'March',
 '1825',
 '–',
 '15',
 'March',
 '1898)',
 'was',
 'an',
 'English',
 'emigrant',
 'to',
 'New',
 'Zealand,',
 'where',
 'he',
 'became',
 'one',
 'of',
 'that',
 "country's",
 'most',
 'prominent',
 '19th-century',
 'architects.',
 'He',
 'was',
 'instrumental',
 'in',
 'shaping',
 'the',
 'city',
 'of',
 "Christchurch's",
 'unique',
 'architectural',
 'identity',
 'and',
 'culture,',
 'and',
 'was',
 'appointed',
 'the',
 'first',
 'official',
 'Provincial',
 'Architect',
 'of',
 'the',
 'developing',
 'province',
 'of',
 'Canterbury.',
 'Heavily',
 'influenced',
 'by',
 'the',
 'Anglo-Catholic',
 'philosophy',
 'behind',
 'early',
 'Victorian',
 'architecture,',
 'he',
 'is',
 'credited',
 'with',
 'importing',
 'the',
 'Gothic',
 'revival',
 'style',
 'to',
 'New',
 'Zealand.',
 'His',
 'Gothic',
 'designs',
 'constructed',
 'in',
 'both',
 'wood',
 'and',
 'stone',
 'in'