## Analysis of Bigrams

1. Take any corpus(article) of your choice and analyze all the bigrams present based on ConditionalFrequencyDist and compute the co-occurence matrix.
2. Visualize the co-occurence matrix.

In [1]:
import nltk
from nltk.corpus import gutenberg
from nltk import bigrams
import pandas as pd
from nltk import word_tokenize, sent_tokenize
from collections import Counter
from nltk.collocations import BigramCollocationFinder
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as plot
import plotly.graph_objs as go
plot.offline.init_notebook_mode(connected=True)
from nltk.util import ngrams

In [2]:
words = gutenberg.words('shakespeare-macbeth.txt')

## bigram for words

In [3]:
bigram_list = list(bigrams(words))

In [4]:
a = nltk.FreqDist(list(bigrams(words)))
a

FreqDist({('milkes', 'me'): 1,
          ('interprete', 'That'): 1,
          ('truth', 'Are'): 1,
          ('the', 'subtle'): 1,
          ('Torch', '.'): 2,
          ('Souldiership', 'Sey'): 1,
          (',', 'quoth'): 1,
          ('were', 'perfect'): 1,
          ('Stage', ','): 1,
          (',', 'Meeting'): 1,
          ('has', 'bene'): 1,
          ('is', 'said'): 1,
          ('him', 'in'): 2,
          ('thou', 'shalt'): 1,
          ('with', 'feare'): 2,
          ('my', 'Wife'): 2,
          ('bee', ','): 1,
          ('?', 'Mess'): 1,
          ('duties', 'Are'): 1,
          ('Accursed', 'be'): 1,
          ('will', 'performe'): 1,
          ('their', 'Audit'): 1,
          (',', 'Inchanting'): 1,
          ('time', '.'): 3,
          ('with', 'gore'): 1,
          ('selfe', 'haue'): 1,
          ('is', 'supply'): 1,
          ('can', 'remember'): 1,
          ('?', 'Mac'): 2,
          ('will', 'auouch'): 1,
          ('a', 'day'): 2,
          ('thy', 'due'): 1,
     

In [5]:
nltk.ConditionalFreqDist(nltk.FreqDist(bigram_list))

ConditionalFreqDist(nltk.probability.FreqDist,
                    {'wake': FreqDist({'Northumberland': 1, 'each': 1}),
                     'present': FreqDist({',': 1,
                               '.': 1,
                               ':': 1,
                               'Grace': 1,
                               'death': 1,
                               'horror': 1}),
                     'Authoriz': FreqDist({"'": 1}),
                     'sides': FreqDist({',': 1, 'are': 1, 'do': 1, 'of': 1}),
                     'pall': FreqDist({'thee': 1}),
                     'three': FreqDist({'Mile': 1,
                               'Murtherers': 1,
                               'Witches': 1,
                               'eares': 1,
                               'meet': 1,
                               'my': 1,
                               'things': 1,
                               'weyward': 1}),
                     'edge': FreqDist({'I': 1, 'o': 1}),
                    

In [6]:
article1 = open('article1').read()
article1 = sent_tokenize(article1)[0]
article1

'The Supreme Court ruled last week in the case of Fane Lozman vs the City of Riviera Beach, Florida.'

In [7]:
article1_bigrams = list(bigrams(word_tokenize(article1)))
article1_bigrams

[('The', 'Supreme'),
 ('Supreme', 'Court'),
 ('Court', 'ruled'),
 ('ruled', 'last'),
 ('last', 'week'),
 ('week', 'in'),
 ('in', 'the'),
 ('the', 'case'),
 ('case', 'of'),
 ('of', 'Fane'),
 ('Fane', 'Lozman'),
 ('Lozman', 'vs'),
 ('vs', 'the'),
 ('the', 'City'),
 ('City', 'of'),
 ('of', 'Riviera'),
 ('Riviera', 'Beach'),
 ('Beach', ','),
 (',', 'Florida'),
 ('Florida', '.')]

In [8]:
conditional_freq = nltk.ConditionalFreqDist(article1_bigrams)

In [9]:
conditional_freq.tabulate(conditions = ['case'], cumulative = True)

     of 
case  1 


In [10]:
word_table = pd.DataFrame(conditional_freq).fillna(0)
word_table

Unnamed: 0,",",Beach,City,Court,Fane,Florida,Lozman,Riviera,Supreme,The,case,in,last,of,ruled,the,vs,week
",",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Beach,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
City,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Court,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Florida,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lozman,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Riviera,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Supreme,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
word_table.values

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,

In [12]:
# word_mat = word_table.as_matrix(word_table)
trace = go.Heatmap(x = word_table.columns.tolist(), y = word_table.columns.tolist(), z = word_table.values)
data = [trace]
plot.iplot(data)

## trigram for words

In [48]:
ngram_list = [(i[0] + ' ' +i[1], i[2]) for i in list(ngrams(word_tokenize(article1), 3))]

In [49]:
ngram_list

[('The Supreme', 'Court'),
 ('Supreme Court', 'ruled'),
 ('Court ruled', 'last'),
 ('ruled last', 'week'),
 ('last week', 'in'),
 ('week in', 'the'),
 ('in the', 'case'),
 ('the case', 'of'),
 ('case of', 'Fane'),
 ('of Fane', 'Lozman'),
 ('Fane Lozman', 'vs'),
 ('Lozman vs', 'the'),
 ('vs the', 'City'),
 ('the City', 'of'),
 ('City of', 'Riviera'),
 ('of Riviera', 'Beach'),
 ('Riviera Beach', ','),
 ('Beach ,', 'Florida'),
 (', Florida', '.')]

In [42]:
ngram_freq_list = nltk.ConditionalFreqDist(ngram_list)
ngram_freq_list

ConditionalFreqDist(nltk.probability.FreqDist,
                    {', Florida': FreqDist({'.': 1}),
                     'Beach ,': FreqDist({'Florida': 1}),
                     'City of': FreqDist({'Riviera': 1}),
                     'Court ruled': FreqDist({'last': 1}),
                     'Fane Lozman': FreqDist({'vs': 1}),
                     'Lozman vs': FreqDist({'the': 1}),
                     'Riviera Beach': FreqDist({',': 1}),
                     'Supreme Court': FreqDist({'ruled': 1}),
                     'The Supreme': FreqDist({'Court': 1}),
                     'case of': FreqDist({'Fane': 1}),
                     'in the': FreqDist({'case': 1}),
                     'last week': FreqDist({'in': 1}),
                     'of Fane': FreqDist({'Lozman': 1}),
                     'of Riviera': FreqDist({'Beach': 1}),
                     'ruled last': FreqDist({'week': 1}),
                     'the City': FreqDist({'of': 1}),
                     'the case': FreqDi

In [60]:
ngram_df = pd.DataFrame.from_dict(ngram_freq_list).fillna(0)

In [52]:
ngram_df

Unnamed: 0,",",.,Beach,City,Court,Fane,Florida,Lozman,Riviera,case,in,last,of,ruled,the,vs,week
", Florida",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Beach ,",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
City of,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Court ruled,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Fane Lozman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Lozman vs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Riviera Beach,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Supreme Court,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
The Supreme,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
case of,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
ngram_word_mat = ngram_df.as_matrix()
trace = go.Heatmap(x =ngram_df.columns, y = ngram_df.index, z = ngram_word_mat.tolist())
data = [trace]
plot.iplot(data)

## bigram for pos tags

In [17]:
tags = [i[1] for i in nltk.pos_tag(word_tokenize(article1))]
tags

['DT',
 'NNP',
 'NNP',
 'VBD',
 'JJ',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'NNP',
 'NNP',
 'VBD',
 'DT',
 'NNP',
 'IN',
 'NNP',
 'NNP',
 ',',
 'NNP',
 '.']

In [18]:
pos_tags_bigram = pd.DataFrame.from_dict(nltk.ConditionalFreqDist(list(bigrams(tags)))).fillna(0)

In [19]:
pos_bigram_mat = pos_tags_bigram.as_matrix()
trace = go.Heatmap(x = pos_tags_bigram.columns, y = pos_tags_bigram.index, z = pos_bigram_mat)
data = [trace]
plot.iplot(data)

## pos tags trigram

In [57]:
pos_trigram = [((i[0]+' '+i[1]), i[2])for i in list(ngrams(tags, 3))]

In [58]:
pos_trigram_df = pd.DataFrame.from_dict(nltk.ConditionalFreqDist(pos_trigram)).fillna(0)

In [59]:
pos_trigram_mat = pos_trigram_df.as_matrix()
trace = go.Heatmap(x = pos_trigram_df.columns, y = pos_trigram_df.index, z = pos_trigram_mat)
data = [trace]
plot.iplot(data)