In [1]:
import os
import re
import string
import pandas as pd
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
stopwords_indonesia = stopwords.words('indonesian')

In [4]:
class Preprocessing(object):
    def __init__(self):
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        self.kamus = self.__get_dictionary()

    def __get_dictionary(self):
        df = pd.read_csv('normalisasi.csv', sep=';')
        dictlist = []
        for row in df.values:
            dictlist.append([row[0], row[1]])
        return dictlist

    def __remove_pattern(self, tweet: str, pattern):
        r = re.findall(pattern, tweet)
        for i in r:
            tweet = re.sub(i, '', tweet)
        return tweet

    def __remove_symbol(self, tweet: str):
        tweet = self.__remove_url(tweet)
        # get only alfabet
        pattern = re.compile(r'\b[^\d\W]+\b')
        newwords = []
        for word in pattern.findall(tweet):
            # case folding

            word = word.lower()
            for row in self.kamus:
                key = row[0]
                value = row[1]
                if word == key:
                    word = value
                    break

            word = word.replace("xyz", "")
            newwords.append(word)
        return " ".join(newwords)

    def __remove_url(self, text):
        # Remove additional white spaces
        text = re.sub('[\s]+', ' ', text)
        text = re.sub('[\n]+', ' ', text)
        # remove all url
        text = re.sub(r" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", text)
        # remove email
        text = re.sub(r"[\w]+@[\w]+\.[c][o][m]", "", text)
        # remove text twit
        text = re.sub(r'((pic\.[^\s]+)|(twitter))', '', text)
        # remove mentions, hashtag and web
        text = re.sub(r"(?:\@|#|http?\://)\S+", "", text)
        # remove url
        text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)
        text = re.sub(r'((https?://[^\s]+))', '', text)
        text = re.sub(r"(pic[^\s]+)|[\w]+\.[c][o][m]", "", text)
        # replace non ascii
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)

        return text

    def __remove_emojis(self, data):
        emoj = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002500-\U00002BEF"  # chinese char
                          u"\U00002702-\U000027B0"  
                          u"\U000024C2-\U0001F251"
                          u"\U0001f926-\U0001f937"
                          u"\U00010000-\U0010ffff"
                          u"\u2640-\u2642"
                          u"\u2600-\u2B55"
                          u"\u200d"
                          u"\u23cf"
                          u"\u23e9"
                          u"\u231a"
                          u"\ufe0f"  # dingbats
                          u"\u3030"
                          "]+", re.UNICODE)
        return re.sub(emoj, '', data)
    
    def __concate_duplicate(self, tweet):
        term = "a" + r"{3}"
        rep = re.sub(term, " 3", tweet)
        term = "i" + r"{3}"
        rep = re.sub(term, " 3", rep)
        term = "u" + r"{3}"
        rep = re.sub(term, " 3", rep)
        term = "e" + r"{3}"
        rep = re.sub(term, " 3", rep)
        term = "o" + r"{3}"
        rep = re.sub(term, " 3", rep)

        term = "c" + r"{3}"
        rep = re.sub(term, " 3", rep)
        term = "k" + r"{3}"
        rep = re.sub(term, " 3", rep)
        term = "w" + r"{3}"
        rep = re.sub(term, " 3", rep)
        term = "h" + r"{3}"
        rep = re.sub(term, " 3", rep)

        return rep

    def __clean_tweets(self, tweet: str) -> str:
        # tokenize tweets
        
        tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True)
        
        tweet_tokens = tokenizer.tokenize(tweet)
        
        tweets_clean = []
        for word in tweet_tokens:
            if (word not in stopwords_indonesia and  # remove stopwords
                    word not in string.punctuation):  # remove punctuation
                tweets_clean.append(word)

        stem_word = self.stemmer.stem(" ".join(tweets_clean))  # stemming word
        return stem_word

    def from_csv(self, file_name):
        raw_data = pd.read_csv(file_name)
#         df = pd.DataFrame(raw_data[['user_account', 'tweet', 'label']])
        df = pd.DataFrame(raw_data[['Handle', 'Text','Emojis','Comments','Likes','Retweets']])

        df['remove_user'] = np.vectorize(self.__remove_pattern)(df['Text'], "(@\\w*)")
        df['remove_symbol'] = df["remove_user"].apply(lambda x: np.vectorize(self.__remove_pattern)(x, "(#\\w*)"))
        df['remove_duplicate_char'] = df['remove_symbol'].apply(self.__concate_duplicate)
        df['remove_emojis'] = df['remove_duplicate_char'].apply(lambda x: self.__remove_emojis(self.__remove_symbol(x)))
        
        df.drop_duplicates(subset="remove_emojis", keep='first', inplace=True)
        
        df['tweet_clean'] = df['remove_emojis'].apply(lambda x: self.__clean_tweets(x))
        df = df.dropna(subset=["tweet_clean"])
        
        for i, row in df.iterrows():
            if row['tweet_clean'] == "":
                df = df.drop(i)
        
        df['ruu'] = df['tweet_clean'].str.contains('rancang undang undang')
        for i, row in df.iterrows():
            if row['ruu'] == False:
                df = df.drop(i)
        df.drop(['ruu'], axis=1, inplace=True)
        
        df['thai_account'] = df['Text'].str.contains('#WhatsHappeninglnThailand')
        for i, row in df.iterrows():
            if row['thai_account'] == True:
                df = df.drop(i)
        df.drop(['thai_account'], axis=1, inplace=True)
        
        df['phil_account'] = df['Text'].str.contains('#WhatsHappeningInPhilippines')
        for i, row in df.iterrows():
            if row['phil_account'] == True:
                df = df.drop(i)
        df.drop(['phil_account'], axis=1, inplace=True)
        
        df['other_account'] = df['Text'].str.contains('#WhatIsHappeningInIndonesia')
        for i, row in df.iterrows():
            if row['other_account'] == True:
                df = df.drop(i)
        df.drop(['other_account'], axis=1, inplace=True)
        
        df['other_account'] = df['Text'].str.contains('#WhatsHappeningInIndonesia')
        for i, row in df.iterrows():
            if row['other_account'] == True:
                df = df.drop(i)
        df.drop(['other_account'], axis=1, inplace=True)
        
        df['other_tag'] = df['Text'].str.contains('#JunkTerrorBill')
        for i, row in df.iterrows():
            if row['other_tag'] == True:
                df = df.drop(i)
        df.drop(['other_tag'], axis=1, inplace=True)
        
        df['other_tag'] = df['Text'].str.contains('Junk Terror Bill')
        for i, row in df.iterrows():
            if row['other_tag'] == True:
                df = df.drop(i)
        df.drop(['other_tag'], axis=1, inplace=True)
        
        for i, row in df.iterrows():
            if pd.isna(row['Retweets']) or pd.isna(row['Likes']):
                df = df.drop(i)
        
        return df

In [5]:
class TfidfFeature(object):
 
    def __init__(self):
        self.tf_dict = {}
        self.idf_dict = {}
        

    def __tokenize(self, tweet):
        tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True)
        return tokenizer.tokenize(tweet)

    def __calc_TF_Dict(self, document):
        TF_dict = {}
        for term in document:
            if term in TF_dict:
                TF_dict[term] += 1
            else:
                TF_dict[term] = 1
        return TF_dict

    def __calc_count_Dict(self, tfDict):
        count_DF = {}
        for document in tfDict:
            for term in document:
                if term in count_DF:
                    count_DF[term] += 1
                else:
                    count_DF[term] = 1
        return count_DF

    def __calc_IDF_Dict(self, __n_document, __DF):
        IDF_Dict = {}
        for term in __DF:
            IDF_Dict[term] = np.log(__n_document / __DF[term])
        return IDF_Dict

    def __calc_TF_IDF(self, TF):
        TF_IDF_Dict = {}
        for key in TF:
            TF_IDF_Dict[key] = self.tf_dict[key] * self.idf_dict[key]
        return TF_IDF_Dict

    def __calc_TF_IDF_Vec(self, __TF_IDF_Dict):
        wordDict = sorted(self.tf_dict.keys())
        TF_IDF_vector = [0.0] * len(wordDict)

        for i, term in enumerate(wordDict):
            if term in __TF_IDF_Dict:
                TF_IDF_vector[i] = __TF_IDF_Dict[term]
        return TF_IDF_vector

    def set_tf_idf_dict(self, data):
        data['tweet_token'] = data['tweet_clean'].apply(self.__tokenize)
        data["tf_dict"] = data['tweet_token'].apply(self.__calc_TF_Dict)
        self.tf_dict = self.__calc_count_Dict(data["tf_dict"])
        self.idf_dict = self.__calc_IDF_Dict(len(data),  self.tf_dict)

    def calc_tf_idf(self, data):
        data_token = data.apply(self.__tokenize)
        data_tf_dict = data_token.apply(self.__calc_TF_Dict)
        data_tfidf_dict = data_tf_dict.apply(self.__calc_TF_IDF)
        tfidf_vector = [self.__calc_TF_IDF_Vec(row) for row in data_tfidf_dict]
        return tfidf_vector

In [15]:
class NaiveBayes(object):
    def __init__(self, alpha=0.5):
        self.alpha = alpha

    def _predict(self, x_test):
        # Calculate posterior for each class
        posteriors = []
        for idx, _ in enumerate(self._classes):
            prior_c = np.log10(self._priors[idx])
            conditionals_c = self._calc_conditionals(
                self._conditionals[idx, :], x_test)
            posteriors_c = np.sum(conditionals_c) + prior_c
            posteriors.append(posteriors_c)

        return self._classes[np.argmax(posteriors)]

    def _calc_conditionals(self, cls_cond, x_test):
        return np.log(cls_cond) * x_test

    def fit(self, X_train, y_train):
        X_train = np.array(X_train)
        m, n = X_train.shape
        self._classes = np.unique(y_train)
        n_classes = len(self._classes)

        # init: Prior & Conditional
        self._priors = np.zeros(n_classes)
        self._conditionals = np.zeros((n_classes, n))

        # Get Prior and Conditional
        for idx, c in enumerate(self._classes):
            X_train_c = X_train[c == y_train]
            self._priors[idx] = X_train_c.shape[0] / m
            self._conditionals[idx, :] = ((X_train_c.sum(axis=0)) + self.alpha) / (np.sum(X_train_c.sum(axis=0) + self.alpha))
        

    def predict(self, X_test):
        return [self._predict(x_test) for x_test in X_test]

In [7]:
from sklearn.metrics import confusion_matrix

def performance(y_test, y_predict):
    confus = confusion_matrix(y_test, predict)
    tn, fp, fn, tp = confus.ravel()
    accuracy = ((tp + tn)/(tp + tn + fp + fn))*100
    precision = (tp / (tp + fp))*100
    recall = (tp / (tp + fn))*100
    print('accuracy =', accuracy)
    print('precision =', precision)
    print('recall =', recall)

In [None]:
raw_data = 'ruu_cipta_kerja_20102020_terbaru.csv'
preprocessing = Preprocessing()
df = preprocessing.from_csv(raw_data)
df

In [None]:
df['thai_account'] = df['remove_user'].str.contains('#WhatsHappeninglnThailand')
df
# for i, row in df.iterrows():
#     if row['thai_account']:
#         df = df.drop(i)
# df.drop(['thai_account'], axis=1, inplace=True)
# df

# df['phil_account'] = df['Text'].str.contains('#WhatsHappeningInPhilippines')
# for i, row in df.iterrows():
#     if row['phil_account'] == True:
#         df = df.drop(i)
# df.drop(['phil_account'], axis=1, inplace=True)

In [None]:
df

In [None]:
cdf = df
for i, row in cdf.iterrows():
#     print(pd.isna(row['Retweets']))
    if pd.isna(row['Retweets']) or pd.isna(row['Likes']):
        cdf = cdf.drop(i)

In [None]:
cdf

In [None]:
df.to_csv('df_09062021_4.csv')

In [None]:
len(cdf)
cdf.to_csv('df_09062021.csv')
# feature = TfidfFeature()
# feature.set_tf_idf_dict(df)

In [None]:
def get_export():
    raw = pd.read_csv('drive_dataset.csv')
    df = pd.DataFrame(raw[['tweet','label']])
    dictlist = []
    for row in df.values:
        dictlist.append([row[0], row[1]])
    return dictlist

In [None]:
def clean_export(tweet):
    g_export = get_export()
    for row in g_export:
        key = row[0]
        value = row[1]
        if tweet == key:
            return value

In [None]:
df['sentimen'] = df['Text'].apply(lambda x: clean_export(x))

# for i, row in cdf.iterrows():
#     if row['clean_export'] == True:
#         cdf = cdf.drop(i)

# cdf.drop(['clean_export'], axis=1, inplace=True)

In [None]:
df

In [None]:
def akun_asing(akun):
    if akun == '@MoBMaB13':
        return True
    elif akun == '@inyourznx':
        return True
    elif akun == '@jaemin813_th':
        return True
    elif akun == '@cloudypolus':
        return True
    elif akun == '@__TakagiBot':
        return True
    elif akun == '@so_r_u_happynow':
        return True
    elif akun == '@inyourznx':
        return True
    else:
        return False

In [None]:
cdf['akun_asing'] = cdf['Handle'].apply(lambda x: akun_asing(x))

for i, row in cdf.iterrows():
    if row['akun_asing'] == True:
        cdf = cdf.drop(i)
    

cdf.drop('akun_asing', axis=1, inplace=True)

In [None]:
cdf

In [None]:

cdf.to_csv('df_09062021_2.csv')

In [8]:
from sklearn.model_selection import train_test_split

# x_train, x_test, y_train, y_test = train_test_split(df['tweet_clean'], df['label'], test_size=1., shuffle=False)

In [None]:
ft_train = feature.calc_tf_idf(x_train)
ft_test = feature.calc_tf_idf(x_test)

In [None]:
nb = NaiveBayes()
nb.fit(ft_train, y_train)
predict = nb.predict(ft_test)

In [None]:
performance(y_test, predict)

In [None]:
raw_data = pd.read_csv('like_dataset_0.csv')
df = pd.DataFrame(raw_data[['user_account', 'label','tweet','tweet_clean']])
df

In [17]:
for x in range(0, 51):
    idx = str(x)
    raw_data = pd.read_csv('like_dataset_'+idx+'.csv')
    df = pd.DataFrame(raw_data[['user_account', 'label','tweet','tweet_clean']])
    feature = TfidfFeature()
    feature.set_tf_idf_dict(df)
    
    x_train, x_test, y_train, y_test = train_test_split(df['tweet_clean'], df['label'], test_size=0.2, shuffle=False)
    ft_train = feature.calc_tf_idf(x_train)
    ft_test = feature.calc_tf_idf(x_test)
    
    nb = NaiveBayes()
    nb.fit(ft_train, y_train)
    predict = nb.predict(ft_test)
    print('\n====== percobaan ke-'+idx+' =========')
    performance(y_test, predict)
    
    df = df.sample(frac = 1)
    
    new_x = x + 1
    new_idx = str(new_x)
    df.to_csv('like_dataset_'+new_idx+'.csv')
    


accuracy = 66.94677871148458
precision = 80.21978021978022
recall = 64.03508771929825

accuracy = 69.46778711484593
precision = 85.07462686567165
recall = 68.4

accuracy = 70.30812324929971
precision = 83.73205741626795
recall = 70.8502024291498

accuracy = 71.70868347338936
precision = 84.40366972477065
recall = 73.30677290836654

accuracy = 71.98879551820728
precision = 86.09865470852019
recall = 73.5632183908046

accuracy = 71.42857142857143
precision = 83.61344537815127
recall = 75.95419847328245

accuracy = 71.1484593837535
precision = 83.33333333333334
recall = 72.8744939271255

accuracy = 71.42857142857143
precision = 82.58928571428571
recall = 74.59677419354838

accuracy = 69.187675070028
precision = 82.37885462555066
recall = 72.76264591439688

accuracy = 69.187675070028
precision = 85.25345622119815
recall = 70.34220532319392

accuracy = 71.1484593837535
precision = 84.16289592760181
recall = 73.22834645669292

accuracy = 70.86834733893558
precision = 82.71028037383178
recal