# Tugas NLP 1 - Bigram 

## Import All Dependencies

In [1]:
import pandas as pd
import nltk
import math
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/tabul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read News File

In [2]:
def read_csv(filename):
    """Load a tab-separated news file and expose its title/content columns.

    Args:
        filename: Path of a tab-separated file that has at least the
            columns ``title`` and ``content``.

    Returns:
        A 3-tuple ``(titles, contents, frame)`` where ``titles`` and
        ``contents`` are the ``.values`` arrays of the two columns and
        ``frame`` is the full DataFrame that was read.
    """
    frame = pd.read_csv(filename, sep='\t')
    title_values = frame['title'].values
    content_values = frame['content'].values
    return title_values, content_values, frame

# Load the news file; only the title and content arrays are used below,
# the DataFrame (third element) is discarded.
news_columns = read_csv('berita_nasional.csv')
titles, contents, _ = news_columns
print('Jumlah Berita: ', len(titles))

Jumlah Berita:  420


## Preprocessing

### Remove Unused Tokens and Add Start and End Tokens

In [3]:
# Wrap every title and content in the sentence-boundary markers
# 'sssss' (start) and 'eeeee' (end).
#
# Titles: keep only the text before the LAST '-' (and the character just
# before it, presumably a space -- confirm against the dataset).
# str.rfind returns -1 when there is no '-', which would turn the slice into
# title[:-2] and silently chop the last two characters, so titles without a
# dash are kept whole. max(..., 0) keeps the slice end non-negative when the
# dash is the very first character.
titles = [
    'sssss '
    + (title[:max(title.rfind('-') - 1, 0)] if '-' in title else title)
    + ' eeeee'
    for title in titles
]

# Contents: drop everything up to and including the FIRST '-'. When there is
# no '-', find() returns -1, the slice starts at 0 and the content is kept
# whole.
contents = [
    'sssss ' + content[content.find('-') + 1:] + ' eeeee'
    for content in contents
]

### Concatenating Titles and Contents

In [4]:
# Build one lowercase corpus string: each title is followed by its content,
# all documents are joined with single spaces, and every '.' is padded with
# spaces on both sides.
corpus = (
    ' '.join(title + " " + content for title, content in zip(titles, contents))
    .lower()
    .replace('.', ' . ')
)

### Tokenizing

In [5]:
# From here on `corpus` is the token sequence produced by NLTK instead of
# the raw string.
corpus = nltk.word_tokenize(corpus)
# Pair every token with its immediate successor to get the bigram list.
bigrams = list(zip(corpus, corpus[1:]))

### Term and Bigram Frequencies

In [None]:
def count_frequencies(corpus):
    """Count how often each distinct item occurs in ``corpus``.

    The counts are gathered in a single pass. Calling ``corpus.count`` once
    per distinct item instead is quadratic in the corpus size.

    Args:
        corpus: Iterable of hashable items, e.g. a list of tokens or a list
            of ``(word, next_word)`` bigram tuples.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. Keys appear in order of first occurrence.
    """
    # Local import so the function works without touching the import cell.
    from collections import Counter

    return dict(Counter(corpus))

print('Counting frequencies ...')
# Unigram counts come from the token list, bigram counts from the pair list.
term_frequencies, bigram_frequencies = (
    count_frequencies(corpus),
    count_frequencies(bigrams),
)

for frequency_table in (term_frequencies, bigram_frequencies):
    print(frequency_table)

Counting frequencies ...


## Bigram Probability
Add-one smoothing is applied so that a bigram whose count (the numerator) is zero still gets a non-zero probability.

In [None]:
def count_probabilty(current_word, next_word, term_frequencies, bigram_frequencies):
    """Return the add-one smoothed bigram probability P(next_word | current_word).

    The estimate is ``(c(current_word, next_word) + 1) / (c(current_word) + V)``
    where ``V`` is the number of distinct terms. The ``+ 1`` is applied to
    every bigram, seen or unseen; otherwise a bigram observed exactly once
    would receive the same probability as one never observed.

    Args:
        current_word: The conditioning (history) word.
        next_word: The word whose probability of following ``current_word``
            is wanted.
        term_frequencies: Mapping ``word -> count``.
        bigram_frequencies: Mapping ``(word, next_word) -> count``.

    Returns:
        The smoothed probability as a float, or ``0`` when ``current_word``
        is not a key of ``term_frequencies``.
    """
    history_count = term_frequencies.get(current_word)
    if history_count is None:
        # Unknown history word: keep the 0 result that callers rely on to
        # recognise out-of-vocabulary context.
        return 0

    vocabulary_size = len(term_frequencies)
    bigram_count = bigram_frequencies.get((current_word, next_word), 0)
    # The denominator is at least 1 here because current_word is in the
    # vocabulary, so no division by zero is possible.
    return (bigram_count + 1) / (history_count + vocabulary_size)

In [None]:
# Example: probability of 'stabilitas' following 'menjaga'. As a bare
# expression its value is presumably shown as the notebook cell's output.
count_probabilty('menjaga', 'stabilitas', term_frequencies, bigram_frequencies)

## Get Next Word with Highest Probability

In [None]:
def get_next_word(current_word, term_frequencies, bigram_frequencies):
    """Return the most probable successor of ``current_word``.

    Every key of ``term_frequencies`` is scored with ``count_probabilty`` and
    the best-scoring one is returned. On ties the word that comes first in
    ``term_frequencies`` wins.

    Args:
        current_word: The word to continue from.
        term_frequencies: Mapping ``word -> count``; its keys are the
            candidate next words.
        bigram_frequencies: Mapping ``(word, next_word) -> count``.

    Returns:
        A tuple ``(best_word, probability)``.

    Raises:
        ValueError: If ``term_frequencies`` is empty.
    """
    probabilities = {
        candidate: count_probabilty(
            current_word, candidate, term_frequencies, bigram_frequencies
        )
        for candidate in term_frequencies
    }
    # max() returns the first maximal key in iteration order, which matches
    # looking up the index of the maximum in a parallel list.
    best_word = max(probabilities, key=probabilities.get)
    return best_word, probabilities[best_word]
    

# Print the predicted successor and its probability for a series of probe
# words, one result per line.
for probe_word in (
    'sssss',
    'joko',
    'presiden',
    'saya',
    'akan',
    'melakukan',
    'hahaha',
    'wkwkwk',
    'i',
    'gracias',
    'hola',
):
    print(get_next_word(probe_word, term_frequencies, bigram_frequencies))

## Count Perplexity from Sentence

In [None]:
def count_perplexity(sentence, term_frequencies, bigram_frequencies):
    """Return the bigram perplexity of ``sentence``.

    The sentence is split on whitespace and wrapped in the boundary markers
    'sssss' and 'eeeee'. With N consecutive token pairs the result is
    ``2 ** (-(1 / N) * sum(log2 P(w_i | w_{i-1})))``.

    Args:
        sentence: Whitespace-separated text to evaluate.
        term_frequencies: Mapping ``word -> count``.
        bigram_frequencies: Mapping ``(word, next_word) -> count``.

    Returns:
        The perplexity as a float; ``float('inf')`` when any bigram has a
        probability that is not positive.
    """
    tokens = ['sssss'] + sentence.split() + ['eeeee']

    # Sum negative log2-probabilities instead of multiplying the raw
    # probabilities: the product of many small values underflows to 0.0.
    negative_log_prob = 0.0
    for current_word, next_word in zip(tokens, tokens[1:]):
        probability = count_probabilty(
            current_word, next_word, term_frequencies, bigram_frequencies
        )
        if probability <= 0:
            # log() is undefined here; a zero-probability bigram makes the
            # whole sentence infinitely perplexing.
            return float('inf')
        negative_log_prob -= math.log(probability, 2)

    # len(tokens) - 1 is the number of bigrams; it is at least 1 because the
    # two boundary markers are always present.
    return math.pow(2, negative_log_prob / (len(tokens) - 1))

In [None]:
# Print the perplexity of several sample sentences, one value per line.
for sample_sentence in (
    'presiden joko widodo akan meresmikan bendungan baru',
    'saya suka makan nasi merah dan minum susu biru',
    'tim badminton indonesia berhasil meraih medali emas terbanyak pada asian games 2018',
    'rupiah melemah menjadi diatas 15 ribu',
    'saya bingung kalimat apalagi yang akan digunakan untuk evaluasi perplexity',
):
    print(count_perplexity(sample_sentence, term_frequencies, bigram_frequencies))