## Install Requirements
!conda install -c conda-forge ptable -y

## Required Import

In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
from nltk.util import ngrams
nltk.data.path.append('.')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aftab.alam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Prepare data

In [16]:
DATA_PATH='.'
data_file = 'en_US.twitter.txt'
file = f'{DATA_PATH}/{data_file}'
with open(file, "r") as f:
    data = f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------
Last 300 letters of the data
-------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

-------


## Define tokens

In [8]:
SOS='<s>' # start of sentence token
EOS='<e>' # end of senetence token
UNK='<unk>' # unknown word token

In [9]:
# create vocabulary using training data.
# replace less occuring words with <unk>
class vocabulary:
    def __init__(self,tokens):
        """
        list of train tokenized sentences 
        """
        self.tokens = tokens
        self.word_counts = None
    
    def count_words(self):
        """
        counts words and create a frequency dict
        """
        counts = {}
        for sentence in self.tokens:
            for word in nltk.word_tokenize(sentence):
                if word in counts:
                    counts[word] += 1
                else:
                    counts[word] = 1
        self.word_counts = counts
    def build_vocab(self,threshold):
        """
        creates a closed vocab with words ocurring less than threshold replaced with <unk>
        """
        closed_vocab = []
        if not self.word_counts:
            self.count_words()
        for word , cnt in self.word_counts.items():
            if cnt >= threshold:
                closed_vocab.append(word)
        self.vocab = closed_vocab
        return self.vocab              
                
        

In [45]:
## Prepare dataset

class DataSet:
    def __init__(self,data_file,):
        with open(data_file, "r") as f:
            self.data = f.read()
        self._preprocess()
    def _preprocess(self):
        self.data = '\n'.join([sentence.lower() for sentence in self.data.split('\n')])
    
    def split_to_data(self,train=.8,dev=.1,test=.1):
        """
        Split data by linebreak "\n"

        Args:
            data: str

        Returns:
            A list of sentences
        """
        sentences = nltk.tokenize.sent_tokenize(self.data)
        print(f'Total Number of sentences: {len(sentences)}')
        random.seed(87)
        random.shuffle(sentences)

        test_size = int(len(sentences) * test)
        self.test_data = sentences[0:test_size]
        train_dev_data = sentences[test_size:]
        dev_size = int(len(sentences) * dev)
        self.dev_data = train_dev_data[0:dev_size]
        self.train_data = train_dev_data[dev_size:]
    def vocab(self,threshold):
        self.closed_vocab = set(vocabulary(tokens=self.train_data).build_vocab(threshold=threshold)+[EOS] + [UNK])
    
    def tokenize_sentences(self, data, n):
        """
        Tokenize sentences into tokens (words)

        Args:
            sentences: List of strings

        Returns:
            List of lists of tokens
        """
        ngram_tokenized_sentences = []
        # Go through each sentence in train data 
        for sentence in data:
            # Convert into a list of words
            # ## add <s> and <e> tokens in data
            tokens = [SOS]*(n-1) + nltk.word_tokenize(sentence) + [EOS]
            tokenized=[]
            for word_tuples in ngrams(tokens,n):
                new_tuple=()
                for word in word_tuples:
                    if word != SOS and word not in self.closed_vocab: # replace less frequent world with UNK
                        word = UNK
                    new_tuple = new_tuple + (word,)
                tokenized.append(new_tuple)
            # append the list of words to the list of lists
            ngram_tokenized_sentences.append(tokenized)

        return ngram_tokenized_sentences
        
    def get_tokenized_data(self,n_grams):
        if not self.train_data:
            self.split_to_data()

        self.ngram_tokenized = self.tokenize_sentences(self.train_data, n_grams)
        self.ngram_minus1_tokenized = self.tokenize_sentences(self.train_data, n_grams-1)
        #self.ngram_minus2_tokenized = self.tokenize_sentences(self.train_data, n_grams-2)
        self.test_tokenized = self.tokenize_sentences(self.test_data,n_grams)
        self.dev_tokenized = self.tokenize_sentences(self.dev_data,n_grams)
        return self.ngram_minus1_tokenized, self.ngram_tokenized, self.test_tokenized ,self.dev_tokenized
    

In [17]:
for word in ngrams([10,20,30,40,100],1):
    print(word)
for word in ngrams([10,20,30,40,100],0):
    print(word)

(10,)
(20,)
(30,)
(40,)
(100,)
(10,)
(20,)
(30,)
(40,)
(100,)


In [46]:
%%time
THESHOLD=2
nGram =2
dataset = DataSet(data_file=file)
dataset.split_to_data()
dataset.vocab(THESHOLD)
closed_vocab = dataset.closed_vocab
ngram_1minus_tokenized,ngram_tokenized, test_data,dev_data = dataset.get_tokenized_data(nGram)

Total Number of sentences: 55661
CPU times: user 16.4 s, sys: 44.4 ms, total: 16.5 s
Wall time: 16.5 s


In [47]:
ngram_1minus_tokenized[0:1]
dev_data[0:1]

[[('hahahaa',),
  ('fabulous',),
  ('design',),
  ('tip',),
  (':',),
  ('your',),
  ('home',),
  ('can',),
  ('have',),
  ('the',),
  ('essence',),
  ('of',),
  ('your',),
  ('favorite',),
  ('look',),
  ('.',),
  ('<e>',)]]

[[('<s>', 'go'),
  ('go', 'thank'),
  ('thank', 'you'),
  ('you', 'ashley'),
  ('ashley', '.'),
  ('.', '<e>')]]

### split dataset in train and set

In [48]:
print("Data are split into {} ngramtrain,{} dev and {} test set".format(
    len(ngram_tokenized), len(dev_data), len(test_data)))
print(f'Length vocab including UNK, SOS, and EOS is {len(closed_vocab)}')
print("First training sample:")
print(ngram_tokenized[0])
      
print("First test sample")
print(test_data[0])

Data are split into 44529 ngramtrain,5566 dev and 5566 test set
Length vocab including UNK, SOS, and EOS is 14785
First training sample:
[('<s>', 'hahahaa'), ('hahahaa', 'fabulous'), ('fabulous', 'design'), ('design', 'tip'), ('tip', ':'), (':', 'your'), ('your', 'home'), ('home', 'can'), ('can', 'have'), ('have', 'the'), ('the', 'essence'), ('essence', 'of'), ('of', 'your'), ('your', 'favorite'), ('favorite', 'look'), ('look', '.'), ('.', '<e>')]
First test sample
[('<s>', 'i'), ('i', 'did'), ('did', "n't"), ("n't", 'send'), ('send', 'yu'), ('yu', 'off'), ('off', 'my'), ('my', 'brand'), ('brand', 'is'), ('is', 'getting'), ('getting', 'bigger'), ('bigger', 'by'), ('by', 'the'), ('the', 'day'), ('day', '!'), ('!', '!'), ('!', '!'), ('!', '<e>')]


In [10]:
# test your code
tokenized_sentences = ['sky is blue .',
                       'leaves are green .',
                       'roses are red .']
vocab = vocabulary(tokenized_sentences)
tmp_closed_vocab = vocab.build_vocab(threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


In [76]:
import os
import pickle
from datetime import datetime

class Model:
    def __init__(self,ngrams=2,n1gramsTrain=ngram_1minus_tokenized, ngramsTrain=ngram_tokenized,vocab=closed_vocab):
        self.vocab = vocab
        self.vocab_size = len(self.vocab)
        self.n_grams = ngrams
        self.train_data_ngram = ngramsTrain
        self.train_data_1ngram = n1gramsTrain

    
    def count_n_grams(self,data):
        """
        Count words after ngrams in training data set
        """
        if not data:
            data = self.train_data_ngram
        nGram_cnt ={}
        for sentence in data:
            for tuples in sentence:
                if nGram_cnt.get(tuples,0):
                    nGram_cnt[tuples] +=1
                else:
                    nGram_cnt[tuples] = 1
        return nGram_cnt

    
    def calculate_ngram_probability(self, ngram, smoothing=1):
        """
        calculate probabilities of given ngram
        ngram  = w1,w2,..wn
        n-1gram = w1,w2...wn-1
        = (count(ngram) + k)/(count(n-1gram) + k*V)
        where V is size of vocab
        """
        count_ngram  = self.nGram_cnt.get(ngram,0)
        nminus1_gram = ngram[:-1]
        count_nminus1_gram = self.n1Gram_cnt.get(nminus1_gram,0)
        probs = (count_ngram + smoothing)/(count_nminus1_gram + smoothing* self.vocab_size)
       
        return probs
    
    def train(self):
        """
        calculate ngram and nminus gram counts
        """
        self.nGram_cnt=self.count_n_grams(data=self.train_data_ngram)
        self.n1Gram_cnt=self.count_n_grams(data=self.train_data_1ngram)
    
    def save(self,path,name,checkpoint):
        model_path = f'{path}/{name}'
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        count_df = {'count_ngram':self.nGram_cnt, 'count_nminus1gram':self.n1Gram_cnt}
        with open(f'{model_path}/{checkpoint}.pkl', 'wb') as fp:
            pickle.dump(count_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    def load(self,path,name,checkpoint):
        model_path = f'{path}/{name}'
        with open(f'{model_path}/{checkpoint}.pkl', 'rb') as fp:
            count_df = pickle.load(fp)
            self.nGram_cnt = count_df['count_ngram']
            self.n1Gram_cnt = count_df['count_nminus1gram']
    
    def predict_nextword(self,ngram):
        """
        Given a ngram find next words and their probabilities
        """
        # n-1 history
        next_hist = ngram[1:]
        probs = {}
        # list of ngrams
        for ngram_tuple in self.nGram_cnt.keys():
            hist = ngram_tuple[:-1]
            word = ngram_tuple[-1]
            if next_hist == hist:
                prob = self.calculate_ngram_probability(ngram_tuple,1)
                probs[word] = prob
        if not probs: # return unknown word if model did not find any thing
            probs = {UNK: 1/self.vocab_size}
        
        return probs
    
        

In [74]:
%%time
# test code
from collections import Counter
model = Model(ngrams=nGram,n1gramsTrain=ngram_1minus_tokenized[0:30],ngramsTrain=ngram_tokenized[0:30],vocab=closed_vocab)
print(f"Ngram train set {len(model.train_data_ngram)} ")
print(f"Ngram train set 1st sentence is  {len(model.train_data_ngram[0])} ")
model.train()
model.save('.','test_model',1)


Ngram train set 30 
Ngram train set 1st sentence is  17 
CPU times: user 1.51 ms, sys: 1.87 ms, total: 3.37 ms
Wall time: 1.99 ms


In [78]:
model = Model(ngrams=nGram,n1gramsTrain=ngram_1minus_tokenized[0:30],ngramsTrain=ngram_tokenized[0:30],vocab=closed_vocab)
model.load('.','test_model',1) 
ngram = ngram_tokenized[0][0]
ngram =('<s>','<s>')
model.calculate_ngram_probability(ngram,smoothing=1)
model.predict_nextword(ngram)

6.763611768684478e-05

{'hahahaa': 0.00013527223537368956,
 'everyone': 0.00013527223537368956,
 'power': 0.00013527223537368956,
 'muah': 0.00013527223537368956,
 'but': 0.00013527223537368956,
 '~': 0.00013527223537368956,
 'hm': 0.00013527223537368956,
 'and': 0.00013527223537368956,
 '<unk>': 0.00013527223537368956,
 'your': 0.00013527223537368956,
 'grr': 0.00013527223537368956,
 'hope': 0.00020290835306053433,
 'ouch': 0.00020290835306053433,
 'although': 0.00013527223537368956,
 'he': 0.00013527223537368956,
 'will': 0.00013527223537368956,
 'please': 0.00013527223537368956,
 ':': 0.00013527223537368956,
 'i': 0.0002705444707473791,
 'ah': 0.00013527223537368956,
 'bring': 0.00013527223537368956,
 'yes': 0.00013527223537368956,
 'going': 0.00013527223537368956,
 'haha': 0.00013527223537368956,
 'if': 0.00013527223537368956,
 'amazing': 0.00013527223537368956}

# dev_data

In [92]:
def perplexity1(sentence):
    """
    ngram tokenize sentence
    """
    N = len(sentence)
    #cross_entropy = − log2 p(x ̄; θ)/N
    px = 1
    for ngram in sentence:
        px *= model.calculate_ngram_probability(ngram,smoothing=1)
    cross_entropy = -1 * np.log2(px)/N
    return 2**cross_entropy

In [84]:
perplexity1(dev_data[1])

13.568827010861229


12151.333381541588

In [87]:
def perplexity2(sentence):
    N = len(sentence)
    #PP = p ** (1/N)
    px = 1
    for ngram in sentence:
        p = model.calculate_ngram_probability(ngram,smoothing=1)
        px *= 1/p
    return px ** (1/N)
perplexity2(dev_data[1])

12151.33338154158

In [88]:
%%time
# train using complete training data
model = Model(ngrams=nGram,n1gramsTrain=ngram_1minus_tokenized,ngramsTrain=ngram_tokenized,vocab=closed_vocab)
print(f"Ngram train set {len(model.train_data_ngram)} ")
print(f"Ngram train set 1st sentence is  {len(model.train_data_ngram[0])} ")
model.train()
model.save('.','bigram_model',1)

Ngram train set 44529 
Ngram train set 1st sentence is  17 
CPU times: user 461 ms, sys: 176 ms, total: 637 ms
Wall time: 2.22 s


In [106]:
# dev data perplexity
dev_perflexity =[]
dev_data[:1]
for sentence in dev_data:
    pp = perplexity2(sentence) 
    sent = ' '.join(wrd[-1] for wrd in sentence)
    dev_perflexity = [*dev_perflexity,[pp,sent]]

    # dev data perplexity
test_perflexity =[]
test_data[:1]
for sentence in test_data:
    pp = perplexity2(sentence) 
    sent = ' '.join(wrd[-1] for wrd in sentence)
    test_perflexity = [*dev_perflexity,[pp,sent]]
# dev data perplexity
train_perflexity =[]
ngram_tokenized[:1]
for sentence in ngram_tokenized:
    pp = perplexity2(sentence) 
    sent = ' '.join(wrd[-1] for wrd in sentence)
    train_perflexity = [*dev_perflexity,[pp,sent]]

[[('<s>', 'go'),
  ('go', 'thank'),
  ('thank', 'you'),
  ('you', 'ashley'),
  ('ashley', '.'),
  ('.', '<e>')]]

[[('<s>', 'i'),
  ('i', 'did'),
  ('did', "n't"),
  ("n't", 'send'),
  ('send', 'yu'),
  ('yu', 'off'),
  ('off', 'my'),
  ('my', 'brand'),
  ('brand', 'is'),
  ('is', 'getting'),
  ('getting', 'bigger'),
  ('bigger', 'by'),
  ('by', 'the'),
  ('the', 'day'),
  ('day', '!'),
  ('!', '!'),
  ('!', '!'),
  ('!', '<e>')]]

[[('<s>', 'hahahaa'),
  ('hahahaa', 'fabulous'),
  ('fabulous', 'design'),
  ('design', 'tip'),
  ('tip', ':'),
  (':', 'your'),
  ('your', 'home'),
  ('home', 'can'),
  ('can', 'have'),
  ('have', 'the'),
  ('the', 'essence'),
  ('essence', 'of'),
  ('of', 'your'),
  ('your', 'favorite'),
  ('favorite', 'look'),
  ('look', '.'),
  ('.', '<e>')]]

In [107]:
print(f"mean perflexity on train data: {np.asarray(np.array(train_perflexity)[:,0],dtype='float64').mean()}")

print(f"mean perflexity on dev data: {np.asarray(np.array(dev_perflexity)[:,0],dtype='float64').mean()}")
print(f"mean perflexity on test data: {np.asarray(np.array(test_perflexity)[:,0],dtype='float64').mean()}")

mean perflexity on train data: 851.7242660032439
mean perflexity on dev data: 851.778108102394
mean perflexity on test data: 851.6859354130299


In [94]:
test_sentence = test_data[1]
s_prob = []
index=[]
N=len(test_words)
for tuples in test_sentence:
    pred, prob  = model.predict_nextword(tuples) # get pro
    if pred == '<e>':
        break
    index = index + [pred]
    s_prob += [prob]
    N +=1
print(' '.join(word for word in index))


i stuff <unk> <unk> asking <unk> that <unk> <unk>


In [96]:
s_prob

[0.000269923746541602,
 0.00013521736190926916,
 6.759040216289287e-05,
 6.759040216289287e-05,
 0.0001351990806462516,
 6.759040216289287e-05,
 0.00013520822065981613,
 6.759040216289287e-05,
 6.759040216289287e-05]

In [100]:
perplexity1(s_prob)

13.297428819787847


10067.576537522376

In [98]:
perplexity2(s_prob)

2.906337915925864

In [118]:
from prettytable import PrettyTable
for label, data in (('Word', words),('Screen Name', screen_names),('Hashtag', hashtags)):

    pt = PrettyTable(field_names=[label, 'Count'])    
    c = Counter(data)    
    [ pt.add_row(kv) for kv in c.most_common()[:10] ]   
    pt.align[label], pt.align['Count'] = 'l', 'r'
    # Set column alignmentprint(pt)

NameError: name 'words' is not defined

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - ptable


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ptable-0.9.2               |             py_0          22 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          22 KB

The following NEW packages will be INSTALLED:

  ptable             conda-forge/noarch::ptable-0.9.2-py_0



Downloading and Extracting Packages
ptable-0.9.2         | 22 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
