In [None]:
#import library
import numpy as np
import math
import pandas as pd
import re
from collections import Counter
import os

In [None]:
def buildUnigramModel(Text):
    """
    Build a unigram model.

    Input  : ``Text`` -- a collection of documents supporting ``.astype(str)``
             and ``.tolist()`` (the driver script passes a pandas column).
    Output : dict mapping each token to its relative frequency
             (token count divided by the total number of tokens).
    Note   : the text is lower-cased and every character outside
             ``a-z``/``0-9`` is replaced by a space. The boundary markers
             ``<f>`` and ``<s>`` are inserted in front of every document and
             are counted like ordinary tokens.
    """
    # Put the placeholder markers in front of each document, then merge all
    # documents into one lower-cased string.
    marked = ' {f} {s} ' + Text.astype(str)
    corpus = ' '.join(marked.tolist()).lower()

    # Ordered cleaning passes. The braces protect the placeholders while the
    # punctuation is stripped; they are then turned into <s>/<f>, and the last
    # pass removes any brace that did not belong to a placeholder.
    # NOTE: the trailing "s" inside the character classes is a literal letter
    # (already covered by a-z), so all whitespace ends up as plain spaces.
    cleaning_passes = (
        (r'[^a-zA-Z0-9{}s]', ' '),
        (r'{s}', '<s>'),
        (r'{f}', '<f>'),
        (r'[^a-zA-Z0-9<>s]', ' '),
    )
    for pattern, replacement in cleaning_passes:
        corpus = re.sub(pattern, replacement, corpus)

    # Relative frequency of every token.
    counts = Counter(corpus.split())
    total = sum(counts.values())
    return {token: occurrences / total for token, occurrences in counts.items()}
        

In [None]:
def buildBigramModel(Text):
    """
    Build a bigram model.

    Input  : ``Text`` -- a collection of documents supporting ``.astype(str)``
             and ``.tolist()`` (the driver script passes a pandas column).
    Output : dict whose keys are ``"first second"`` word pairs and whose
             values are ``count(first second) / count(first)``.
    Note   : the text is lower-cased and every character outside
             ``a-z``/``0-9`` is replaced by a space. The boundary markers
             ``<f>`` and ``<s>`` are inserted in front of every document and
             take part in the pairs like ordinary tokens.
    """
    # Put the placeholder markers in front of each document, then merge all
    # documents into one lower-cased string.
    marked = ' {f} {s} ' + Text.astype(str)
    corpus = ' '.join(marked.tolist()).lower()

    # Ordered cleaning passes: strip punctuation (keeping the braces of the
    # placeholders), convert the placeholders to <s>/<f>, drop stray braces.
    cleaning_passes = (
        (r'[^a-zA-Z0-9{}s]', ' '),
        (r'{s}', '<s>'),
        (r'{f}', '<f>'),
        (r'[^a-zA-Z0-9<>s]', ' '),
    )
    for pattern, replacement in cleaning_passes:
        corpus = re.sub(pattern, replacement, corpus)

    # After cleaning, the only whitespace left is the space character, so a
    # plain split() yields the token sequence.
    tokens = corpus.split()
    word_counts = Counter(tokens)

    # Count each pair of adjacent tokens.
    pair_counts = Counter(zip(tokens, tokens[1:]))

    # Conditional probability of the second word given the first.
    return {
        first + ' ' + second: occurrences / word_counts[first]
        for (first, second), occurrences in pair_counts.items()
    }

In [None]:
def nextBestWord(bigramModel, currentWord):
    """
    Find the most probable continuation of a word.

    Input  : ``bigramModel`` -- dict mapping ``"first second"`` to a
             probability; ``currentWord`` -- the word to continue.
    Output : the complete bigram key (``"currentWord nextWord"``) with the
             highest probability among the keys starting with
             ``currentWord``. When several keys share the highest
             probability, the one that comes first in the model wins.
             Callers that need only the next word take ``.split()[1]``.
    Raises : ValueError when no bigram in the model starts with
             ``currentWord``.
    """
    # The trailing space keeps "of" from matching "often ...".
    prefix = currentWord + ' '
    candidates = {
        bigram: probability
        for bigram, probability in bigramModel.items()
        if bigram.startswith(prefix)
    }

    # max() on an empty mapping raises an unhelpful "empty sequence" error;
    # report the unknown word explicitly while keeping the exception type.
    if not candidates:
        raise ValueError(
            "no bigram in the model starts with %r" % (currentWord,)
        )

    return max(candidates, key=candidates.get)

In [None]:
def nextTenBestWords(bigramModel, currentWord):
    """
    Collect the ten most probable continuations of a word.

    Input  : ``bigramModel`` -- dict mapping ``"first second"`` to a
             probability; ``currentWord`` -- the word to continue.
    Output : list of at most ten ``[nextWord, probability]`` entries sorted
             by descending probability (entries with equal probability keep
             the order they have in the model). The list is empty when no
             bigram starts with ``currentWord``.
    """
    # The trailing space keeps "of" from matching "often ...".
    prefix = currentWord + ' '
    followers = [
        [bigram.split()[1], probability]
        for bigram, probability in bigramModel.items()
        if bigram.startswith(prefix)
    ]

    ranked = sorted(followers, key=lambda entry: entry[1], reverse=True)
    return ranked[:10]

In [None]:
def generateSentence(bigramModel, length):
    """
    Generate a random sentence from a bigram model.

    Input  : ``bigramModel`` -- dict mapping ``"first second"`` to a
             probability; ``length`` -- number of words to generate.
    Output : string of generated words, each followed by a single space.
             It contains exactly ``length`` words unless generation reaches
             a word that has no successor in the model, in which case the
             sentence ends early. A non-positive ``length`` yields ``''``.
    Raises : ValueError when the model has no bigram starting with ``<s>``,
             i.e. there is no word to start the sentence with.
    Note   : the first word is drawn uniformly from the words that follow
             ``<s>``. Each further word is, with probability 0.7, a uniform
             pick among the ten best successors of the previous word, and
             otherwise the single best successor.
    """
    if length <= 0:
        return ''

    # Words observed directly after the sentence-start marker.
    starters = [
        bigram.split()[1]
        for bigram in bigramModel
        if bigram.startswith('<s> ')
    ]
    if not starters:
        raise ValueError(
            "bigram model has no '<s> ...' entry to start a sentence from"
        )

    word = starters[np.random.randint(len(starters))]
    words = [word]

    # The start word already counts towards the requested length, so only
    # length - 1 further words are produced.
    while len(words) < length:
        candidates = nextTenBestWords(bigramModel, word)
        if not candidates:
            # Dead end: the current word never appears as the first half of
            # a bigram, so there is nothing to continue with.
            break

        if np.random.uniform() < 0.7:
            word = candidates[np.random.randint(len(candidates))][0]
        else:
            word = nextBestWord(bigramModel, word).split()[1]
        words.append(word)

    # Same output format as before: every word is followed by one space.
    return ''.join(token + ' ' for token in words)

In [None]:
if __name__ == '__main__':
    # Interactive walkthrough of the language-modeling assignment: every task
    # prints its result, then waits for the user and clears the screen.

    def _next_screen():
        # Run the shell commands "pause" and "cls" between two tasks.
        for command in ("pause", "cls"):
            os.system(command)

    def _announce(title):
        # Print a task title followed by an empty line.
        print(title)
        print()

    sample_words = ("of", "update", "hopes")

    print("TUGAS LANGUAGE MODELING NLP - SFY")
    print("SILAKAN MASUKKAN IDENTITAS ANDA")
    Nama = 'Beladina Elfitri'
    NIM = '1301174046'
    _next_screen()

    # Load the dataset; the models are built from its 'text' column.
    data = pd.read_csv('text.csv')

    # Task 1: show the first five rows of the dataset.
    _announce("TUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET")
    print("HASIL : ")
    print(data.head())
    _next_screen()

    # Task 2: unigram model.
    _announce("TUGAS 2. BUAT MODEL UNIGRAM")
    print("HASIL : ")
    print(buildUnigramModel(data['text']))
    _next_screen()

    # Task 3: bigram model (reused by all following tasks).
    _announce("TUGAS 3. BUAT MODEL BIGRAM")
    print("HASIL : ")
    bigramModel = buildBigramModel(data['text'])
    print(bigramModel)
    _next_screen()

    # Task 4: best next word for a few sample words.
    _announce("TUGAS 4. MENAMPILKAN NEXT BEST WORD")
    print("HASIL : ")
    for sample in sample_words:
        print(sample + " -> ", nextBestWord(bigramModel, sample))
    _next_screen()

    # Task 5: ten best next words for the same sample words.
    _announce("TUGAS 5. TOP 10 BEST NEXT WORD")
    print("HASIL : ")
    for sample in sample_words:
        print(sample + " -> ", nextTenBestWords(bigramModel, sample))
    _next_screen()

    # Task 6: generate a sentence of a user-chosen length.
    _announce("TUGAS 6. GENERATE KALIMAT")
    n = int(input("Panjang Kalimat : "))
    print("HASIL : ")
    print(generateSentence(bigramModel, n))
    _next_screen()

    print("SELAMAT", Nama, "ANDA SUDAH MENYELESAIKAN TUGAS LANGUAGE MODELING NLP-SFY")