In [1]:
import pandas as pd
import numpy as np
from pickle import dump, load
from collections import Counter
from itertools import tee, islice

In [6]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files produced by a trusted pipeline.

# N-gram counts; fair_scp indexes this object with tuples of tags.
with open('../data/NGramCounts.pkl', 'rb') as ngram_file:
    ngram_counts = load(ngram_file)

# One iterable of tag strings per item (each is later joined with ' ').
with open('../data/POS_Tags.pkl', 'rb') as tags_file:
    pos_tags = load(tags_file)

In [7]:
def candidate_gen(fk, tagset):
    """Extend every pattern in ``fk`` by one trailing tag.

    Parameters
    ----------
    fk : iterable of str
        Space-separated tag patterns to extend.
    tagset : iterable of str
        Tags that may be appended.

    Returns
    -------
    set of str
        Every ``"<pattern> <tag>"`` combination; empty when either input
        is empty.
    """
    return {prefix + ' ' + tag for prefix in fk for tag in tagset}

In [117]:
def fair_scp(seq, ngram_counter):
    """Score how strongly the tags of ``seq`` stick together.

    For a sequence of n tags the score is::

        (n - 1) * count(seq) ** 2
        ---------------------------------------------
        sum over i in 1..n-1 of count(seq[:i]) * count(seq[i:])

    Parameters
    ----------
    seq : str
        Space-separated tags, e.g. ``'DT JJ NN'``.
    ngram_counter : mapping
        Maps tuples of tags to their counts. It is indexed directly, so a
        plain ``dict`` must contain every prefix/suffix split of ``seq``;
        a ``collections.Counter`` yields 0 for missing n-grams.

    Returns
    -------
    float
        The score, or ``0.0`` when the denominator is zero (a one-tag
        sequence has no split point; unseen sub-sequences have zero count).
        The unguarded division used to raise ``ZeroDivisionError`` there.
    """
    tokens = tuple(seq.split())
    numerator = (len(tokens) - 1) * ngram_counter[tokens] ** 2

    # Sum the product of counts over every way of cutting the sequence into
    # a non-empty prefix and a non-empty suffix.
    denominator = 0
    for i in range(1, len(tokens)):
        denominator += ngram_counter[tokens[:i]] * ngram_counter[tokens[i:]]

    if denominator == 0:
        return 0.0
    return numerator / denominator

In [9]:
# Tags that candidate_gen may append when a pattern is extended by one position.
# NOTE(review): looks like the Penn Treebank word-level tag inventory — confirm.
# NOTE(review): the recorded frequent-tag list further down includes 'NFP',
# which is not in this set, so no pattern is ever extended with it — confirm
# that this is intended.
tagset = {'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 
          'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 
          'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'}

In [11]:
# Document frequency per tag: in how many entries of pos_tags each tag occurs
# at least once. Every tag in tagset is seeded with 0.
tag_counter = Counter(dict.fromkeys(tagset, 0))
for tags in pos_tags:
    # set() collapses repeats so a tag counts once per entry.
    # Counter.update is used instead of `+=` because in-place Counter addition
    # drops every key whose count is not positive, which silently discarded
    # the zero seeding above on the first iteration. update() also avoids
    # building a temporary dict and Counter per entry.
    tag_counter.update(set(tags))

In [13]:
# Remove these six tags from the counts so they cannot be picked up when the
# frequent single tags are selected from tag_counter.
# NOTE(review): presumably these are the punctuation/symbol tags emitted by
# the tagger — confirm the list is complete.
for punct_tag in (':', '.', ',', "''", '``', '$'):
    del tag_counter[punct_tag]

In [118]:
# Thresholds: minsup is the minimum fraction of pos_tags entries a pattern
# must occur in; minadh is the minimum fair_scp score used by the mining loop.
minsup = 0.3
minadh = 0

# f1: single tags whose support reaches minsup.
f1 = []
for tag, doc_freq in tag_counter.items():
    support = doc_freq / len(pos_tags)
    if support < minsup:
        continue
    f1.append(tag)

In [119]:
# f collects the frequent patterns level by level; f[0] is the single-tag level.
f = [f1]

# sp1 and spk are separate copies, so appending to spk later leaves f1 and
# sp1 unchanged.
sp1 = list(f1)
spk = list(sp1)
print(f)

[['VBN', 'CC', 'NNP', 'JJ', 'NN', 'RB', 'WRB', 'DT', 'TO', 'UH', 'VBG', 'IN', 'CD', 'VBP', 'RBR', 'WP', 'NNS', 'PRP', 'VB', 'VBD', 'VBZ', 'MD', 'PRP$', 'WDT', 'JJS', 'RP', 'EX', 'NFP', 'JJR']]


In [120]:
# Number of extension rounds run by the mining loop. Each round appends one
# tag to the patterns of the previous level, starting from single tags, so the
# longest patterns examined have max_length + 1 tags.
max_length = 7
# Running maximum of the fair_scp score over every frequent pattern seen;
# updated inside the mining loop.
max_minadh = 0

In [121]:
# Space-padded, space-joined form of every tag sequence. Built once because it
# does not depend on k. The padding lets a pattern be matched on whole tags
# only: a bare substring test lets 'RP VB' match inside 'PRP VB' and 'NN IN'
# inside 'NNP NN IN', which inflates support.
padded_docs = [' ' + ' '.join(tags) + ' ' for tags in pos_tags]

for k in range(1, max_length + 1):
    # f[k - 1] holds the frequent patterns of k tags, so the candidates built
    # from it in this round have k + 1 tags.
    print("Working for length ", k + 1)
    ck = candidate_gen(f[k - 1], tagset)
    ck_dict = dict.fromkeys(ck, 0)

    # Pad each candidate once per round rather than once per document.
    padded_candidates = [(seq, ' ' + seq + ' ') for seq in ck]

    # Count, for each candidate, the number of sequences that contain it as a
    # contiguous run of whole tags (at most one hit per sequence).
    for joined_tags in padded_docs:
        for seq, padded_seq in padded_candidates:
            if padded_seq in joined_tags:
                ck_dict[seq] += 1

    # Keep the candidates whose support reaches minsup.
    fk = []
    for seq in ck_dict.keys():
        support = ck_dict[seq] / len(pos_tags)
        if support >= minsup:
            fk.append(seq)

    f.append(fk)

    # Score the frequent patterns: track the best score overall (max_minadh)
    # and within this round (kmax_minadh), and collect those reaching minadh.
    kmax_minadh = 0
    for f12 in fk:
        fscp = fair_scp(f12, ngram_counts)
        if fscp > max_minadh:
            max_minadh = fscp
        if fscp > kmax_minadh:
            kmax_minadh = fscp
        if fscp >= minadh:
            spk.append(f12)

Working for length  2
Working for length  3
Working for length  4
Working for length  5
Working for length  6
Working for length  7
Working for length  8


In [123]:
# Number of tag sequences; this is the denominator of every support value.
len(pos_tags)

3212