In [1]:
from collections import Counter
from itertools import tee, islice
import pandas as pd
import numpy as np

In [2]:
tagset = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'WDT', 'IN', 'DET', 'DT', 'PRP', 'PRP$', 'WP', 'WP$',
              'RB', 'RBR', 'RBS', 'EX', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD'}

In [3]:
blogs_tags = pd.read_pickle('../data/blog_pos_tags.pkl')

In [4]:
n = blogs_tags.count

In [5]:
tag_dict = {'NN': 147426, 'RBS': 519, 'NNPS': 973, 'VBD': 28665, 'VBZ': 17264, 'MD': 6556, 'WP$': 95, 'EX': 1456,
                'UH': 854, 'IN': 46408, 'VB': 41655, 'JJR': 3063, 'JJS': 2555, 'PRP': 19567, 'WDT': 2653, 'JJ': 76267,
                'VBP': 25877, 'NNS': 47102, 'VBN': 20522, 'DT': 22888, 'RB': 47863, 'WP': 3115, 'VBG': 25179,
                'NNP': 69860, 'DET': 0, 'RBR': 1539, 'PRP$': 7978}
f1 = ['NN', 'NNPS', 'VBD', 'VBZ', 'MD', 'EX', 'IN', 'VB', 'JJR', 'JJS', 'PRP', 'WDT', 'JJ', 'VBP', 'NNS', 'VBN',
          'DT', 'RB', 'WP', 'VBG', 'NNP', 'RBR', 'PRP$']

In [6]:
sp1 = [sp for sp in f1]
f = [f1]

In [7]:
def ngrams(lst, n):
    tlst = lst
    while True:
        a, b = tee(tlst)
        l = tuple(islice(a, n))
        if len(l) == n:
            yield l
            next(b)
            tlst = b
        else:
            break

In [8]:
ngram_counter = Counter(dict.fromkeys(set(f1), 0))

In [9]:
# for fair scp i need probabilities of ngrams of lengths between 1 and MAX_LENGTH (7 in our case)
# for this, I need the counts of each ngram of lengths as above and the total count of ngrams of that length
# By this both the numerator and the denominator will be taken care of. If I have the above,
# the problem will simply boil down to finding the right count from the dictionary

# blogs_tags is a sequence of pos tags found for each document
# for each document, store count for each distinct n-gram and every possible sequence for n-grams greater than 1

In [10]:
tags = blogs_tags.POS.values
MAX_LEN = 7

In [40]:
for tag_seq in tags:
    tag_list = tag_seq.split()
    tag_list = [tag for tag in tag_list if tag in set(f1)]
    tag_len = len(tag_list)
    for i in range(1, MAX_LEN + 1):
        ngram_counter += Counter(ngrams(tag_list, i))

In [41]:
import pickle
with open('../data/corpora_seq_counts', 'wb') as pkldump:
    pickle.dump(ngram_counter, pkldump)

In [42]:
# from the above dict, find total counts of all uni-grams, bi-grams, so on until MAX_LEN-grams
# this means summing up all counts from the dictionary where key length is 1, 2, and so on

In [49]:
list(ngram_counter.keys())[129]

('VBD', 'DT')

In [52]:
ngram_counter[('VBD', 'DT', 'JJ')]

2716

In [53]:
gram_counts = np.zeros(MAX_LEN)

In [59]:
for n_gram in ngram_counter.keys():
    gram_counts[len(n_gram) - 1] += ngram_counter[n_gram]

In [60]:
gram_counts

array([3757772., 2499230., 1253985., 1304275., 1486943., 1803153.,
       2105964.])

In [61]:
gram_probs = Counter(dict.fromkeys(ngram_counter, 0))

In [64]:
for ngram in gram_probs.keys():
    gram_probs[ngram] = ngram_counter[ngram] / gram_counts[len(ngram) - 1]

In [81]:
seq = ('DT','NN','JJ')
gram_probs[seq]

0.0014370187841162374

In [83]:
numerator = (len(seq) - 1) * gram_probs[seq] ** 2

In [84]:
numerator

4.130045971805818e-06

In [86]:
denominator = 0
for i in range(len(seq) - 1):
    denominator += gram_probs[seq[:i + 1]] * gram_probs[seq[i + 1:]]

In [87]:
numerator / denominator

0.0008627387961255472

In [49]:
def fairSCP(seq):
    print("Sequence", seq)
    print("Sequence length", len(seq))
    print("Probability for", seq, gram_probs[seq])
    numerator = (len(seq) - 1) * gram_probs[seq] ** 2
    print("Numerator", numerator)
    denominator = 0
    print("Initial denominator", denominator)
    for i in range(1, len(seq)):
        print("Sequence substrings", seq[:i], seq[i:])
        print("Sequence substring probabilities", gram_probs[seq[:i]], gram_probs[seq[i:]])
        denominator += gram_probs[seq[:i]] * gram_probs[seq[i:]]
        print()
        print("Denominator for", i, denominator)
    return numerator / denominator

In [56]:
def count_corpora_pos_sequences(blogs_tags):
    ngram_counter = Counter(dict.fromkeys(set(f1), 0))
    print("Initial Counter", ngram_counter)
#     tags = blogs_tags.POS.values
    tags = blogs_tags
    print("Tags", tags)
    for tag_seq in tags.split():
        print("Tag Seq", tag_seq)
        tag_list = tag_seq.split()
        print("Tag List", tag_list)
        tag_list = [tag for tag in tag_list if tag in set(f1)]
        print("Updated Tag List", tag_list)
        tag_len = len(tag_list)
        for i in range(1, 4):
            ngram_counter += Counter(ngrams(tag_list, i))
            print("Counter for length", i, ngram_counter)
    gram_counts = np.zeros(MAX_LEN)
    print("Initial Gram Counts", gram_counts)
    for n_gram in ngram_counter.keys():
        gram_counts[len(n_gram) - 1] += ngram_counter[n_gram]
        print("Gram Count for ", n_gram, gram_counts[len(n_gram) - 1])
    gram_probs = Counter(dict.fromkeys(ngram_counter, 0))
    print("Initial Gram Probability", gram_probs)
    for ngram in gram_probs.keys():
        gram_probs[ngram] = ngram_counter[ngram] / gram_counts[len(ngram) - 1]
        print("Gram probability for", ngram, gram_probs[ngram], "\n")
    return gram_probs

In [57]:
data = ['DT NN VBZ VBN IN NN',
        'RB PRP VBP IN VBG DT NN']

bt = pd.Series(name='POS', data=data)

In [60]:
f1 = ['DT', 'NN', 'VBZ', 'VBN', 'IN', 'RB', 'PRP', 'VBP', 'VBG']
MAX_LEN = 3

In [16]:
t = pd.DataFrame(bt)

In [129]:
gp = count_corpora_pos_sequences(t, pseq, 3)

Initial Counter Counter({'VBP': 0, 'IN': 0, 'DT': 0, 'NN': 0, 'PRP': 0, 'VBG': 0, 'VBZ': 0, 'VBN': 0, 'RB': 0})
Tags ['DT NN VBZ VBN IN NN' 'RB PRP VBP IN VBG DT NN']
Tag Seq DT NN VBZ VBN IN NN
Tag List ['DT', 'NN', 'VBZ', 'VBN', 'IN', 'NN']
Updated Tag List ['DT', 'NN', 'VBZ', 'VBN', 'IN', 'NN']
Counter for length 1 Counter({('NN',): 2, ('DT',): 1, ('VBZ',): 1, ('VBN',): 1, ('IN',): 1})
Counter for length 2 Counter({('NN',): 2, ('DT',): 1, ('VBZ',): 1, ('VBN',): 1, ('IN',): 1, ('DT', 'NN'): 1, ('NN', 'VBZ'): 1, ('VBZ', 'VBN'): 1, ('VBN', 'IN'): 1, ('IN', 'NN'): 1})
Counter for length 3 Counter({('NN',): 2, ('DT',): 1, ('VBZ',): 1, ('VBN',): 1, ('IN',): 1, ('DT', 'NN'): 1, ('NN', 'VBZ'): 1, ('VBZ', 'VBN'): 1, ('VBN', 'IN'): 1, ('IN', 'NN'): 1, ('DT', 'NN', 'VBZ'): 1, ('NN', 'VBZ', 'VBN'): 1, ('VBZ', 'VBN', 'IN'): 1, ('VBN', 'IN', 'NN'): 1})
Tag Seq RB PRP VBP IN VBG DT NN
Tag List ['RB', 'PRP', 'VBP', 'IN', 'VBG', 'DT', 'NN']
Updated Tag List ['RB', 'PRP', 'VBP', 'IN', 'VBG', 'DT', 'N

In [132]:
fairSCP(('IN', 'VBG', 'DT'))

Sequence ('IN', 'VBG', 'DT')
Sequence length 3
Probability for ('IN', 'VBG', 'DT') 0.0011307950254588373
Numerator 2.557394779204905e-06
Initial denominator 0
Sequence substrings ('IN',) ('VBG', 'DT')
Sequence substring probabilities 0.12440057566025826 0.004700647799522253

Denominator for 1 0.0005847632922366946
Sequence substrings ('IN', 'VBG') ('DT',)
Sequence substring probabilities 0.004743861109221641 0.10340329322800851

Denominator for 2 0.0010752941535464855


0.0023783211047602405