# Analysis of Bigrams

In [1]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
import matplotlib.pyplot as plt
import pandas as pd
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
import numpy as np
from nltk.util import ngrams
import plotly.offline as plot
import plotly.graph_objs as go
# `plot` is already the `plotly.offline` package, so call its public
# `init_notebook_mode` directly instead of reaching through the internal
# `plotly.offline.offline` submodule.
plot.init_notebook_mode(connected=True)

In [2]:
# Load the corpus and reduce it to a short list of lower-cased tokens.
# The `with` block guarantees the file handle is closed once the text is read.
with open('TextFile2.txt') as file:
    text = file.read()

# Only the first 25 tokens are kept; every later cell works on this slice.
text = word_tokenize(text.lower())[:25]

## Take any corpus (article) of your choice, analyze all the bigrams present using `ConditionalFreqDist`, and compute the co-occurrence matrix.


In [3]:
# Count, for every token, how often each other token immediately follows it.
cfreq = nltk.ConditionalFreqDist(nltk.bigrams(text))

# Columns are the first word of a bigram, rows are the word that follows;
# pairs that never occur come out as NaN and are replaced by 0.
df = pd.DataFrame.from_dict(cfreq).fillna(value=0)

# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` is its documented replacement and returns the same ndarray.
mat = df.values

In [4]:
# Heatmap of the bigram co-occurrence counts: each column (x) is the first
# word of a bigram, each row (y) is a word that followed it.
trace = go.Heatmap(z=mat, x=list(df.columns), y=list(df.index))
data = [trace]
# Title and axis labels so the figure can be read without the code beside it.
layout = go.Layout(
    title='Bigram co-occurrence counts',
    xaxis=dict(title='First word'),
    yaxis=dict(title='Following word'),
)
plot.iplot(go.Figure(data=data, layout=layout))

# Extra Work

In [5]:
# Score every distinct bigram by raw frequency; only the bigram tuples
# themselves are used below, sorted alphabetically.
finder = BigramCollocationFinder.from_words(text)
bigram_measures = nltk.collocations.BigramAssocMeasures()
scored = finder.score_ngrams(bigram_measures.raw_freq)
freq = sorted(bigram for bigram, score in scored)

# Group the second words under their first word in a single pass.
# Because `freq` is sorted, the followers of each word stay in alphabetical
# order, and (on dict-ordered Pythons) so do the rows of the table, instead
# of following the arbitrary iteration order of a set.
followers = {}
for first_word, second_word in freq:
    followers.setdefault(first_word, []).append(second_word)

# One row per first word, holding a comma-separated list of its followers.
final_dict = {
    first_word: ",".join(second_words)
    for first_word, second_words in followers.items()
}
pd.DataFrame.from_dict(final_dict, orient='index')

Unnamed: 0,0
lozman,vs
of,"fane,riviera"
court,ruled
supreme,court
city,of
.,they
decided,that
in,the
",",florida
that,lozman’s


# Trigram

In [6]:
# Slide a 3-token window over the text and materialise every trigram as a
# tuple; `trigrams` is a one-shot generator, so it is consumed here.
trigrams = ngrams(text, 3)
a = [triple for triple in trigrams]

In [7]:
# Condition each trigram on its first two words: the condition is the string
# "(w1 w2)" and the counted sample is the third word.
tri = nltk.ConditionalFreqDist(
    ('({} {})'.format(first, second), third)
    for first, second, third in a
)
tri

ConditionalFreqDist(nltk.probability.FreqDist,
                    {'(, florida)': FreqDist({'.': 1}),
                     '(. they)': FreqDist({'decided': 1}),
                     '(beach ,)': FreqDist({'florida': 1}),
                     '(case of)': FreqDist({'fane': 1}),
                     '(city of)': FreqDist({'riviera': 1}),
                     '(court ruled)': FreqDist({'last': 1}),
                     '(decided that)': FreqDist({'lozman’s': 1}),
                     '(fane lozman)': FreqDist({'vs': 1}),
                     '(florida .)': FreqDist({'they': 1}),
                     '(in the)': FreqDist({'case': 1}),
                     '(last week)': FreqDist({'in': 1}),
                     '(lozman vs)': FreqDist({'the': 1}),
                     '(of fane)': FreqDist({'lozman': 1}),
                     '(of riviera)': FreqDist({'beach': 1}),
                     '(riviera beach)': FreqDist({',': 1}),
                     '(ruled last)': FreqDist({'week': 1}),
     

In [8]:
# Columns are the "(w1 w2)" contexts, rows are the third word of the trigram;
# combinations that never occur come out as NaN and are replaced by 0.
df1 = pd.DataFrame.from_dict(tri).fillna(value=0)

# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` is its documented replacement and returns the same ndarray.
mat1 = df1.values

In [9]:
# Heatmap of the trigram counts: each column (x) is a two-word context
# "(w1 w2)", each row (y) is a word that followed that context.
trace = go.Heatmap(z=mat1, x=list(df1.columns), y=list(df1.index))
data = [trace]
# Title and axis labels so the figure can be read without the code beside it.
layout = go.Layout(
    title='Trigram co-occurrence counts',
    xaxis=dict(title='First two words'),
    yaxis=dict(title='Following word'),
)
plot.iplot(go.Figure(data=data, layout=layout))

# Bigrams for POS tags

In [10]:
# Tag every token with its part of speech, then keep only the tag of each
# (token, tag) pair, in token order.
pos_tags = [nltk.pos_tag(text)]
tag = [
    pos
    for tagged_tokens in pos_tags
    for _token, pos in tagged_tokens
]
tag

['DT',
 'JJ',
 'NN',
 'VBD',
 'JJ',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'NN',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'NN',
 'NN',
 ',',
 'NN',
 '.',
 'PRP',
 'VBD',
 'IN',
 'NN']

In [11]:
# Count, for every POS tag, how often each other tag immediately follows it.
tag_freq = nltk.ConditionalFreqDist(nltk.bigrams(tag))

# Columns are the first tag of a bigram, rows are the tag that follows;
# tag pairs that never occur come out as NaN and are replaced by 0.
df2 = pd.DataFrame.from_dict(tag_freq).fillna(value=0)

# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` is its documented replacement and returns the same ndarray.
mat2 = df2.values

In [12]:
# Display the conditional frequency distribution of POS-tag bigrams:
# for each tag, the counts of the tags that immediately follow it.
tag_freq

ConditionalFreqDist(nltk.probability.FreqDist,
                    {',': FreqDist({'NN': 1}),
                     '.': FreqDist({'PRP': 1}),
                     'DT': FreqDist({'JJ': 1, 'NN': 2}),
                     'IN': FreqDist({'DT': 2, 'NN': 3}),
                     'JJ': FreqDist({'NN': 2}),
                     'NN': FreqDist({',': 1,
                               '.': 1,
                               'IN': 4,
                               'NN': 2,
                               'VBD': 1}),
                     'PRP': FreqDist({'VBD': 1}),
                     'VBD': FreqDist({'IN': 1, 'JJ': 1})})

In [13]:
# Heatmap of the POS-tag bigram counts: each column (x) is the first tag of
# a bigram, each row (y) is a tag that followed it.
trace = go.Heatmap(z=mat2, x=list(df2.columns), y=list(df2.index))
data = [trace]
# Title and axis labels so the figure can be read without the code beside it.
layout = go.Layout(
    title='POS-tag bigram co-occurrence counts',
    xaxis=dict(title='First tag'),
    yaxis=dict(title='Following tag'),
)
plot.iplot(go.Figure(data=data, layout=layout))