In [1]:
#import library
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
import warnings
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
warnings.filterwarnings("ignore")
import json

In [2]:
# Load the labelled training tweets, decoding the CSV as Latin-1.
tweet = pd.read_csv('train_set.csv', encoding="Latin-1")
# Preview the first rows only instead of rendering the whole frame.
tweet.head(10)

Unnamed: 0,id,sentimen,tweet
0,1,1,oks kak semangat ya kalian kalian
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat
2,3,1,"Begitu diumumkan lulus 100%, mereka semua suju..."
3,4,0,[USERNAME] [USERNAME] Katanya Bapak Reformasi ...
4,5,0,macet macetan perut kosong akhirnya mampir dah...
5,6,0,Pernyataan paling mengganggu telinga malam ini...
6,7,1,Masi belum move on dari poto poto ini. Ceritan...
7,8,1,"Dibalik kecemburuan,terselip rasa kasih sayang..."
8,9,1,Kalo udah sayang beneran itu mau dihadapkan sa...
9,10,1,"[USERNAME] Pagi juga mas nyaa, duhh jadi gaena..."


In [3]:
def tokenize_tweet(tweet):
    """Split one tweet with ``nltk.word_tokenize`` and re-join the tokens with single spaces."""
    return ' '.join(nltk.word_tokenize(tweet))

def normalisasi(tweet):
    """Normalize a single tweet.

    Steps, in order: lowercase; drop every character that is not an ASCII
    letter, ASCII digit, whitespace or hyphen; rewrite ``tidak-`` as
    ``tidak `` ; drop digits; collapse whitespace runs and trim; finally
    squeeze any character repeated three or more times down to two
    (e.g. ``haiiii`` -> ``haii``).

    Args:
        tweet: the tweet text (str).

    Returns:
        The normalized tweet (str).
    """
    normal_tw = tweet.lower()
    # Only the leading '^' negates a character class; the original pattern
    # repeated '^' before every range, which whitelisted the caret itself.
    # Hyphens are kept on purpose so hyphenated tokens survive this step.
    normal_tw = re.sub(r'[^0-9A-Za-z\s-]', '', normal_tw)
    normal_tw = re.sub(r'tidak-', 'tidak ', normal_tw)
    normal_tw = re.sub(r'\d', '', normal_tw)
    normal_tw = re.sub(r'\s+', ' ', normal_tw).strip()
    # Squeeze elongated characters ("haiiii") down to a double ("haii").
    normal_tw = re.sub(r'(.)\1{1,}', r'\1\1', normal_tw, flags=re.IGNORECASE)
    return normal_tw

def do_normalization(raw_tweet):
    """Tokenize and then normalize every tweet in ``raw_tweet``.

    Prints "Normalization Done" once all tweets are processed and returns
    the normalized tweets as a list, in input order.
    """
    cleaned = [normalisasi(tokenize_tweet(text)) for text in raw_tweet]
    print("Normalization Done")
    return cleaned

def stemming(tweet):
    """Stem every token of a single tweet with the Sastrawi stemmer.

    The stemmer is built once and cached on the function object; the
    original rebuilt the factory and stemmer for every token, which repeats
    the same setup work without changing the result.

    Args:
        tweet: the tweet text (str).

    Returns:
        The stemmed tokens joined by single spaces (str).
    """
    stemmer = getattr(stemming, "_cached_stemmer", None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemming._cached_stemmer = stemmer
    token = nltk.word_tokenize(tweet)
    return ' '.join(stemmer.stem(k) for k in token)

def do_stemming(list_tweet):
    """Stem every tweet in ``list_tweet``.

    Before stemming each tweet, prints its 1-based position and its text.
    Returns the stemmed tweets as a list, in input order.
    """
    stemmed = []
    for position, sentence in enumerate(list_tweet, start=1):
        print(position)
        print("Sentences to be stemmed : ",sentence)
        stemmed.append(stemming(sentence))
    return stemmed

def remove_stopwords(tweet, stopwords=None):
    """Remove stopwords and placeholder tokens from a single tweet.

    Args:
        tweet: the (already normalized) tweet text (str).
        stopwords: optional collection of stopwords. When omitted, the words
            are loaded from ``filtered_stopwords.csv`` on the first call and
            cached on the function object, so the file is not re-read for
            every tweet.

    Returns:
        The remaining tokens joined by single spaces (str).
    """
    if stopwords is None:
        stopwords = getattr(remove_stopwords, "_cached_stopwords", None)
        if stopwords is None:
            loaded = pd.read_csv("filtered_stopwords.csv", names=['stopword'])['stopword'].tolist()
            # A set gives constant-time membership tests in the loop below.
            stopwords = set(loaded)
            remove_stopwords._cached_stopwords = stopwords
    # Placeholder tokens that are dropped in addition to the stopwords.
    special_list = {'username', 'url', 'sensitive-no', 'askmf'}
    kept = [k for k in nltk.word_tokenize(tweet)
            if k not in stopwords and k not in special_list]
    return ' '.join(kept)
def do_remove_stopwords(list_tweet):
    """Apply ``remove_stopwords`` to every tweet in ``list_tweet``.

    Prints "Stopword Removal Done" when finished and returns the filtered
    tweets as a list, in input order.
    """
    filtered = [remove_stopwords(text) for text in list_tweet]
    print("Stopword Removal Done")
    return filtered

In [4]:
def preprocess_with_stemming(list_tweet):
    """Run the full pipeline: normalization, stopword removal, then stemming."""
    return do_stemming(do_remove_stopwords(do_normalization(list_tweet)))
def preprocess_without_stemming(list_tweet):
    """Run normalization followed by stopword removal (no stemming)."""
    return do_remove_stopwords(do_normalization(list_tweet))

In [5]:
# Preprocess the training tweets without stemming; the stemming variant is left disabled.
raw_tweet = tweet['tweet']
# stemming_tweet= preprocess_with_stemming(raw_tweet)
no_stemming_tweet = preprocess_without_stemming(raw_tweet)

In [6]:
def create_vocab_count(no_stemming_tweet):
    """Count how often each word occurs across the given tweets.

    Each tweet is split on single spaces. Returns a plain dict mapping
    word -> occurrence count, keyed in first-seen order.
    """
    tally = Counter(
        kata
        for twit in no_stemming_tweet
        for kata in twit.split(" ")
    )
    return dict(tally)

def create_vocab_df(vocab):
    """Turn a word -> count dict into a DataFrame sorted by descending count.

    The result has the columns ``vocab`` and ``count``.
    """
    rows = list(vocab.items())
    frame = pd.DataFrame(rows, columns=['vocab','count'])
    return frame.sort_values('count',ascending=False)

# Build the training-corpus vocabulary counts and their count-sorted DataFrame view.
vocab = create_vocab_count(no_stemming_tweet)
vocab_df = create_vocab_df(vocab)

In [None]:
# Taken together, these five functions filter the vocabulary so that only the adjectives (POS tag 'JJ') and their counts remain.
def EkstraksiPOS(vocab,list_vocab):
    """POS-tag the words in ``list_vocab`` with the CRF tagger model file.

    The whole list is tagged as one sentence. ``vocab`` is accepted but not
    used here. Returns the tagger's result for that single sentence.
    """
    tagger = CRFTagger()
    tagger.set_model_file("all_indo_man_tag_corpus_model.crf.tagger")
    tagged_sentences = tagger.tag_sents([list_vocab])
    return tagged_sentences[0]
def tuple_to_list_pos(list_pos_tag):
    """Convert every entry of ``list_pos_tag`` into a (mutable) list."""
    return [list(entry) for entry in list_pos_tag]
def add_count_to_pos(list_pos_tag, vocab):
    """Append each word's count from ``vocab`` to its entry, in place.

    The first element of every entry is used as the lookup key. The same
    (mutated) list is returned.
    """
    for entry in list_pos_tag:
        entry.append(vocab[entry[0]])
    return list_pos_tag
def just_take_jj_tag(list_pos_tag):
    """Keep only the entries whose tag (second element) is 'JJ'.

    Prints the filtered list and returns it.
    """
    adjectives = [entry for entry in list_pos_tag if entry[1] == 'JJ']
    print(adjectives)
    return adjectives
def save_filtered_pos_tag(list_pos_tag):
    """Write the entries to ``vocab_adj.csv`` with columns vocab, pos and count (no index)."""
    column_names = ['vocab','pos','count']
    pd.DataFrame(list_pos_tag, columns=column_names).to_csv('vocab_adj.csv', index=False)

# Tag the vocabulary, attach each word's corpus count, keep the 'JJ' entries and save them to vocab_adj.csv.
pos_tag = EkstraksiPOS(vocab, vocab_df['vocab'].tolist())
pos_tag = tuple_to_list_pos(pos_tag)
pos_tag = add_count_to_pos(pos_tag,vocab)
pos_tag = just_take_jj_tag(pos_tag)
save_filtered_pos_tag(pos_tag)

In [7]:
def create_norm_mapping():
    """Build the word -> canonical-word dictionary used for further normalization.

    Reads ``normalisasi_mapping.txt``, where every line is a comma-separated
    list of variants whose LAST item is the canonical form. Every earlier
    item on the line is mapped to that last item.

    Both the parsed lines and the resulting dictionary are printed.

    Returns:
        dict mapping each variant to its canonical form. (The original
        returned the raw list of lines, in which a word lookup never
        matches, so no word was ever replaced.)
    """
    mapping = []
    with open("normalisasi_mapping.txt", "r") as file:
        for line in file:
            # Strip only the line terminator; slicing off the last character
            # would truncate a final line that has no trailing newline.
            mapping.append(line.rstrip("\r\n").split(","))
    print(mapping)
    norm_mapping = {}
    for lst in mapping:
        for kata in lst[:-1]:
            norm_mapping[kata] = lst[-1]
    print(norm_mapping)
    return norm_mapping

In [8]:
def adv_normalisasi(tweet, norm_mapping):
    """Replace words found in ``norm_mapping`` with their mapped value.

    Each tweet in ``tweet`` is split on single spaces; a word that is
    contained in ``norm_mapping`` is replaced by ``norm_mapping[word]``,
    any other word is kept. Returns the rewritten tweets as a list.
    """
    def _swap(word):
        return norm_mapping[word] if word in norm_mapping else word

    return [" ".join(_swap(word) for word in text.split(" ")) for text in tweet]

In [9]:
norm_map = create_norm_mapping()
# The stemmed variant is loaded from a previously written CSV instead of being recomputed.
# cleaned_stemming = adv_normalisasi(stemming_tweet,norm_map)
cleaned_stemming = pd.read_csv('cleaned_stemming.csv', names=['id','tweet'])['tweet'].tolist()
# Must be computed here: later cells read cleaned_no_stemming, and leaving this
# line commented out raises NameError on a fresh kernel.
cleaned_no_stemming = adv_normalisasi(no_stemming_tweet,norm_map)

[['yang', 'yg', 'yang'], ['dan', 'and', 'n', 'dan'], ['sy', 'saya', 'aq', 'aku', 'gw', 'gua', 'gue', 'saya'], ['itu', 'tu', 'gitu', 'gt', 'itu'], ['kamu', 'qm', 'km', 'mu', 'lu', 'lo', 'elu', 'elo', 'kamu'], ['sama', 'sm', 'sma', 'sama'], ['tidak', 'ga', 'gk', 'gak', 'tdk', 'tidak'], ['bs', 'bisa', 'sabi', 'bisa'], ['mau', 'mw', 'ingin', 'pengen', 'pgn', 'pingin', 'mau'], ['lg', 'lagi', 'again', 'lagi'], ['tapi', 'tp', 'but', 'tapi'], ['dia', 'dy', 'dia'], ['kita', 'kite', 'qt', 'kita'], ['syg', 'sayang', 'sayank', 'sayang'], ['orang', 'org', 'orang'], ['saja', 'aja', 'aj', 'sj', 'saja'], ['jg', 'juga', 'also', 'juga'], ['dari', 'dr', 'from', 'dari'], ['udh', 'sdh', 'sudah', 'udah', 'udeh', 'sudeh', 'sudah'], ['jadi', 'jd', 'jadi'], ['kalau', 'kalo', 'kl', 'if', 'kalau'], ['dengan', 'dgn', 'with', 'dengan'], ['apa', 'ape', 'what', 'apa'], ['buat', 'bwt', 'buat'], ['selamat', 'slmt', 'met', 'slamat', 'selamat'], ['msh', 'masih', 'masih'], ['banget', 'bgt', 'very', 'banget'], ['selalu', 

In [9]:
# Build a meaning dictionary so that adjectives in thesaurus.json that are synonyms of each other are mapped to the same meaning label.
def generate_meaning_dict():
    """Map adjectives from ``thesaurus.json`` and their synonyms to shared meaning labels.

    Only entries whose ``tag`` is ``'a'`` are considered. An adjective that
    has no label yet receives a fresh one (``m1``, ``m2``, ...); each of its
    ``sinonim`` entries that has no label yet inherits the adjective's label.

    Returns:
        dict mapping word -> meaning label.
    """
    with open('thesaurus.json', 'r') as handle:
        entries = json.load(handle)

    adjectives = {word: info for word, info in entries.items() if info['tag'] == 'a'}

    meaning_dict = {}
    next_id = 0
    for word, info in adjectives.items():
        if word not in meaning_dict:
            next_id += 1
            meaning_dict[word] = "m" + str(next_id)
        label = meaning_dict[word]
        for synonym in info['sinonim']:
            # First label wins: a synonym that is already labelled is left alone.
            meaning_dict.setdefault(synonym, label)
    return meaning_dict

In [10]:
def normalisasi_with_sinonim(cleaned_no_stemming, meaning_dict):
    """Replace words found in ``meaning_dict`` with their meaning label.

    Each tweet is split on single spaces; words present in ``meaning_dict``
    are replaced by the mapped value and all other words are kept as they
    are. Returns the rewritten tweets as a list.
    """
    return [
        " ".join(
            meaning_dict[kata] if kata in meaning_dict else kata
            for kata in text.split(" ")
        )
        for text in cleaned_no_stemming
    ]

In [11]:
# Build the synonym -> meaning-label dictionary and apply it to the training tweets.
# Requires cleaned_no_stemming from the advanced-normalization step to be defined in the kernel.
meaning_dict = generate_meaning_dict()
sinonim_normalized = normalisasi_with_sinonim(cleaned_no_stemming,meaning_dict)

In [10]:
# Attach the three preprocessed variants to the training frame as new columns.
tweet['cleaned_stemming'] = cleaned_stemming
tweet['cleaned_no_stemming'] = cleaned_no_stemming
tweet['sinonim_normalized'] = sinonim_normalized

# Persist each variant to its own CSV file (index omitted).
# NOTE(review): cleaned_stemming.csv is also the file cleaned_stemming was loaded from
# (read with names=['id','tweet']); writing it with index=False may not match that
# two-column layout on the next run — confirm.
tweet['cleaned_stemming'].to_csv('cleaned_stemming.csv', index=False)
tweet['cleaned_no_stemming'].to_csv('cleaned_no_stemming.csv', index=False)
tweet['sinonim_normalized'].to_csv('sinonim_normalized.csv', index=False)

In [14]:
# Load the test tweets and run them through the same preprocessing as the training tweets.
tweet_test = pd.read_csv('test_set.csv', encoding="Latin-1")
raw_test = tweet_test['tweet']

no_stemming_tweet_test = preprocess_without_stemming(raw_test)
# stemming_tweet_test = preprocess_with_stemming(raw_test)

# The stemmed test tweets are loaded from a previously written CSV instead of being recomputed.
# cleaned_stemming_test = adv_normalisasi(stemming_tweet_test,norm_map)
cleaned_stemming_test = pd.read_csv('cleaned_stemming_test.csv')['cleaned_tweet_test'].tolist()
cleaned_no_stemming_test = adv_normalisasi(no_stemming_tweet_test,norm_map)
sinonim_normalized_test = normalisasi_with_sinonim(cleaned_no_stemming_test,meaning_dict)

# Concatenate train + test (train first) for each variant.
tweet_stemming = cleaned_stemming + cleaned_stemming_test
tweet_no_stemming = cleaned_no_stemming + cleaned_no_stemming_test
tweet_sinonim_normalized = sinonim_normalized + sinonim_normalized_test

In [15]:
def EkstraksiBoW(tweet):
    """Extract unigram bag-of-words counts for the given tweets.

    Args:
        tweet: sequence of tweet strings.

    Returns:
        A pair ``(unigram_matrix, nama_fitur)``: the dense count matrix with
        at most 5000 feature columns, and the list of feature names.
    """
    unigram = CountVectorizer(ngram_range=(1,1), max_features=5000)
    # NOTE(review): .todense() yields an np.matrix; recent scikit-learn releases
    # appear to reject np.matrix inputs — confirm, and consider .toarray().
    unigram_matrix = unigram.fit_transform(np.array(tweet)).todense()
    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement and fall back only on old installations.
    if hasattr(unigram, "get_feature_names_out"):
        nama_fitur = unigram.get_feature_names_out().tolist()
    else:
        nama_fitur = unigram.get_feature_names()
    return unigram_matrix, nama_fitur
def create_result_nb(y_pred, output_path='test_result.csv'):
    """Write classifier predictions to a CSV file.

    Each row holds a 0-based running tweet id followed by the predicted
    label; the file is written without the DataFrame index.

    Args:
        y_pred: iterable of predicted labels.
        output_path: destination file. Defaults to ``test_result.csv``;
            pass a different path per model so that results from several
            calls do not overwrite each other.
    """
    sentiment_label = [[id_tweet, pred] for id_tweet, pred in enumerate(y_pred)]
    result_df = pd.DataFrame(sentiment_label)
    result_df.to_csv(output_path, index=False)

# Bag-of-words features are extracted over train + test together (train rows first).
unigram_stemming, feat_name_stemming = EkstraksiBoW(tweet_stemming)
unigram_no_stemming, feat_name_no_stemming = EkstraksiBoW(tweet_no_stemming)
unigram_sinonim, feat_name_sinonim = EkstraksiBoW(tweet_sinonim_normalized)

# Split at the number of training tweets instead of assuming the test set has
# exactly 8000 rows; this keeps the training rows aligned with tweet['sentimen'].
n_train = len(tweet)

unigram_stemming_train = unigram_stemming[:n_train]
unigram_stemming_test = unigram_stemming[n_train:]

unigram_no_stemming_train = unigram_no_stemming[:n_train]
unigram_no_stemming_test = unigram_no_stemming[n_train:]

unigram_sinonim_train = unigram_sinonim[:n_train]
unigram_sinonim_test = unigram_sinonim[n_train:]

# One Multinomial Naive Bayes model per preprocessing variant.
clf_stemming = MultinomialNB()
clf_stemming.fit(unigram_stemming_train, tweet['sentimen'])
y_pred_stemming = clf_stemming.predict(unigram_stemming_test)

clf_no_stemming = MultinomialNB()
clf_no_stemming.fit(unigram_no_stemming_train, tweet['sentimen'])
y_pred_no_stemming = clf_no_stemming.predict(unigram_no_stemming_test)

clf_sinonim = MultinomialNB()
clf_sinonim.fit(unigram_sinonim_train, tweet['sentimen'])
y_pred_sinonim = clf_sinonim.predict(unigram_sinonim_test)

In [16]:
# Save the stemming-variant predictions. create_result_nb writes to test_result.csv,
# so this replaces whatever an earlier call stored there.
create_result_nb(y_pred_stemming)

In [None]:
# Save the no-stemming predictions. create_result_nb writes to test_result.csv,
# so this replaces whatever an earlier call stored there.
create_result_nb(y_pred_no_stemming)

In [15]:
# Save the synonym-normalized predictions. create_result_nb writes to test_result.csv,
# so this replaces whatever an earlier call stored there.
create_result_nb(y_pred_sinonim)

In [17]:
def generateSentimenDict():
    """Build the sentiment lexicon from ``positive.csv`` and ``negative.csv``.

    Each file is read as a single headerless column. Returns a dict with the
    keys ``'pos'`` and ``'neg'``, each holding the list of values from the
    corresponding file.
    """
    positive_frame = pd.read_csv("positive.csv", names=['pos'])
    negative_frame = pd.read_csv("negative.csv", names=['neg'])
    return {
        'pos': positive_frame['pos'].tolist(),
        'neg': negative_frame['neg'].tolist(),
    }

In [18]:
def EkstraksiSentimenNew(list_tweet,sentimen_dict):
    """Extract lexicon-based sentiment counts for every tweet.

    Args:
        list_tweet: iterable of tweet strings; each is split on whitespace.
        sentimen_dict: dict with the keys ``'pos'`` and ``'neg'`` holding the
            positive and negative lexicon words.

    Returns:
        One ``[positive_count, negative_count]`` pair per tweet, in input
        order. A word that occurs in both lexicons counts for both.
    """
    # Convert the lexicons to sets once so each word lookup is constant-time
    # instead of a scan over the whole list for every word of every tweet.
    positive_words = set(sentimen_dict['pos'])
    negative_words = set(sentimen_dict['neg'])

    fitur_sentimen_all = []
    for tweet in list_tweet:
        list_kata = tweet.split()
        positif = sum(1 for k in list_kata if k in positive_words)
        negatif = sum(1 for k in list_kata if k in negative_words)
        fitur_sentimen_all.append([positif, negatif])
    return fitur_sentimen_all
def create_result(fitur_sentimen_all, output_path='fitur_sentimen_result.csv'):
    """Turn sentiment counts into labels and write them to a CSV file.

    A tweet is labelled 1 only when its positive count is strictly greater
    than its negative count; ties and negative majorities are labelled 0.
    Each row holds a 0-based running tweet id followed by the label; the
    file is written without the DataFrame index.

    Args:
        fitur_sentimen_all: iterable of ``[positive_count, negative_count]``.
        output_path: destination file, ``fitur_sentimen_result.csv`` by default.
    """
    sentiment_label = [
        [id_tweet, 1 if skor[0] > skor[1] else 0]
        for id_tweet, skor in enumerate(fitur_sentimen_all)
    ]
    result_df = pd.DataFrame(sentiment_label)
    result_df.to_csv(output_path, index=False)

In [19]:
# Lexicon-based prediction: count positive/negative lexicon words in each preprocessed
# test tweet and write the resulting labels to fitur_sentimen_result.csv.
sentimen_dict = generateSentimenDict()
fitur_sentimen = EkstraksiSentimenNew(no_stemming_tweet_test, sentimen_dict)
create_result(fitur_sentimen)