In [2]:
import os
import re
import string
import pandas as pd
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from textblob import TextBlob

In [None]:
import googletrans
from googletrans import *

In [None]:
# Indonesian stopwords from the NLTK corpus. Stored as a set because the
# cleaning step only performs membership tests (`word not in ...`) per token.
stopwords_indonesia = set(stopwords.words('indonesian'))

In [None]:
class Preprocessing(object):
    """Tweet-cleaning pipeline.

    Strips handles, hashtags, URLs and symbols, normalises words through a
    lookup table read from ``normalisasi.csv``, removes stopwords and
    punctuation, and stems the result.

    Relies on names defined elsewhere in the notebook: ``StemmerFactory``,
    ``TweetTokenizer`` and the ``stopwords_indonesia`` collection.
    """

    # Letters whose three-in-a-row runs are rewritten by __concate_duplicate.
    _REPEATED_LETTERS = "aiueockwh"

    # Alphabetic words only: no digits and no non-word characters.
    _WORD_RE = re.compile(r'\b[^\d\W]+\b')

    # Compiled once for the class instead of on every call.
    _EMOJI_RE = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002500-\U00002BEF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           u"\u2640-\u2642"
                           u"\u2600-\u2B55"
                           u"\u200d"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\ufe0f"
                           u"\u3030"
                           "]+", re.UNICODE)

    def __init__(self):
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        # The tokenizer is stateless, so one instance serves every tweet.
        self.tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True)
        self.kamus = self.__get_dictionary()
        # O(1) lookup built from `kamus`; setdefault keeps the FIRST entry for
        # a repeated key, matching a top-to-bottom scan of the list.
        self._kamus_lookup = {}
        for key, value in self.kamus:
            self._kamus_lookup.setdefault(key, value)

    def __get_dictionary(self):
        """Read ``normalisasi.csv`` (';'-separated) and return a list of
        ``[first_column, second_column]`` pairs, one per row."""
        df = pd.read_csv('normalisasi.csv', sep=';')
        return [[row[0], row[1]] for row in df.values]

    def __remove_pattern(self, tweet: str, pattern):
        """Return `tweet` with every match of the regex `pattern` removed.

        The pattern is applied directly. Re-using each matched text as a new
        regex would let a short match (e.g. ``@ab``) eat the prefix of a longer
        one (``@abc``) and would misread regex metacharacters in the match.
        """
        return re.sub(pattern, '', tweet)

    def __remove_symbol(self, tweet: str):
        """Strip URLs/mentions, keep alphabetic words only, lower-case them,
        normalise each through the lookup table and drop the literal "xyz"."""
        tweet = self.__remove_url(tweet)
        newwords = []
        for word in self._WORD_RE.findall(tweet):
            # case folding
            word = word.lower()
            # replace the word when the normalisation table has an entry for it
            word = self._kamus_lookup.get(word, word)
            word = word.replace("xyz", "")
            newwords.append(word)
        return " ".join(newwords)

    def __remove_url(self, text):
        """Collapse whitespace and remove URLs, e-mail addresses, Twitter
        artefacts, mentions, hashtags and non-ASCII characters from `text`."""
        # Collapse every whitespace run (newlines included) into one space.
        text = re.sub(r'[\s]+', ' ', text)
        # remove all url
        # NOTE(review): the greedy `(.*)` groups also remove everything that
        # follows the URL up to the end of the text - confirm this is intended.
        text = re.sub(r" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", text)
        # remove email
        text = re.sub(r"[\w]+@[\w]+\.[c][o][m]", "", text)
        # remove text twit
        text = re.sub(r'((pic\.[^\s]+)|(twitter))', '', text)
        # remove mentions, hashtag and web
        text = re.sub(r"(?:\@|#|http?\://)\S+", "", text)
        # remove url
        text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)
        text = re.sub(r"(pic[^\s]+)|[\w]+\.[c][o][m]", "", text)
        # replace non ascii
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)

        return text

    def __remove_emojis(self, data):
        """Return `data` with every character matched by the emoji/symbol
        character class removed."""
        return self._EMOJI_RE.sub('', data)

    def __concate_duplicate(self, tweet):
        """Replace each run of exactly three identical letters, for the letters
        in ``_REPEATED_LETTERS``, with the literal " 3"."""
        rep = tweet
        for letter in self._REPEATED_LETTERS:
            rep = re.sub(letter + r"{3}", " 3", rep)
        return rep

    def __clean_tweets(self, tweet: str) -> str:
        """Tokenize `tweet`, drop stopwords and punctuation tokens, and return
        the stemmed, space-joined remainder."""
        tweets_clean = [
            word for word in self.tokenizer.tokenize(tweet)
            if (word not in stopwords_indonesia and  # remove stopwords
                word not in string.punctuation)      # remove punctuation
        ]
        return self.stemmer.stem(" ".join(tweets_clean))  # stemming word

    def from_csv(self, file_name):
        """Run the full cleaning pipeline.

        Parameters
        ----------
        file_name : str, path-like or pandas.DataFrame
            CSV path (read with ``pd.read_csv``) or an already-loaded frame.
            Either way it must provide the 'Handle' and 'Text' columns.

        Returns
        -------
        pandas.DataFrame
            'Handle', 'Text', the intermediate cleaning columns and
            'tweet_clean'. Rows that duplicate an earlier cleaned text, or
            whose 'tweet_clean' is missing or empty, are removed.
        """
        if isinstance(file_name, pd.DataFrame):
            raw_data = file_name
        else:
            raw_data = pd.read_csv(file_name)
        df = raw_data[['Handle', 'Text']].copy()

        df['remove_user'] = df['Text'].apply(
            lambda text: self.__remove_pattern(text, "(@\\w*)"))
        df['remove_symbol'] = df['remove_user'].apply(
            lambda text: self.__remove_pattern(text, "(#\\w*)"))
        df['remove_duplicate_char'] = df['remove_symbol'].apply(self.__concate_duplicate)
        df['remove_emojis'] = df['remove_duplicate_char'].apply(
            lambda x: self.__remove_emojis(self.__remove_symbol(x)))

        df = df.drop_duplicates(subset="remove_emojis", keep='first').copy()

        df['tweet_clean'] = df['remove_emojis'].apply(self.__clean_tweets)
        df = df.dropna(subset=["tweet_clean"])
        # A boolean mask replaces the per-row drop loop, which copied the whole
        # frame once for every removed row.
        return df[df['tweet_clean'] != ""].copy()

In [None]:
# Path of the raw tweet export that feeds the cleaning pipeline.
sentimen_csv = 'sentimen_dataset.csv'
raw_data = pd.read_csv(sentimen_csv)
preprocessing = Preprocessing()
# from_csv() calls pd.read_csv() on its argument itself, so it is given the
# file path rather than the already-loaded DataFrame.
dataset = preprocessing.from_csv(sentimen_csv)
# dataset.to_csv('new_dataset.csv')

In [69]:
# Load the labelled export and keep an independent copy of the four columns
# used by the filtering cells.
raw = pd.read_csv('export_dataset.csv')
df = raw[['user_account', 'tweet', 'clean_tweet', 'sentimen']].copy()

In [70]:
# Flag the tweets whose cleaned text contains "rancang undang undang".
# NOTE(review): str.contains yields NaN for missing clean_tweet values - confirm the column has no nulls.
df['ruu'] = df['clean_tweet'].str.contains('rancang undang undang')

In [72]:
# Remove the rows whose 'ruu' flag is False (rows with a missing flag are
# kept, exactly as an `== False` test per row would keep them). A boolean mask
# avoids copying the whole frame once per removed row.
df = df[~df['ruu'].eq(False)].copy()

In [73]:
# Display the filtered frame as the cell output.
df

Unnamed: 0,user_account,tweet,clean_tweet,sentimen,ruu
0,@RahmaFathiyyah,Ketua DPR: Draft RUU Cipta Kerja Hanya Dicek A...,ketua dewan wakil rakyat draf rancang undang u...,0,True
1,@Hendria04991579,Jurus kilat ala sistem kebut skripsi layaknya ...,jurus kilat ala sistem kebut skripsi layak mah...,0,True
2,@anis34216465,Sudah jatuh tertimpa tangga. Bebas Covid-19 ja...,jatuh timpa tangga bebas covid panggang api pa...,0,True
4,@EkoFirman15,Mahasiswa cerdas . Pelajaran i dulu RUU CIPTA ...,mahasiswa cerdas ajar rancang undang undang ci...,1,True
5,@beereciel,"tapi kalau topik ruu cipta kerja,,, SEBERAPA B...",topik rancang undang undang cipta kerja mampus,0,True
...,...,...,...,...,...
876,@RosidinBrawija4,ampuun dah gmna mo baca RUU cipta kerja yg set...,ampun baca rancang undang undang cipta kerja t...,0,True
878,@RizalAriyadi3,RUU Cipta Kerja Putus Rantai Mafia Birokrasi !...,rancang undang undang cipta kerja putus rantai...,1,True
880,@sukmariyati,rangorang di lab pada rame bahas RUU cipta ker...,orang orang lab ramai bahas rancang undang und...,1,True
881,@mamaciaaa,"Jadi, naskah yg dirapihkan terkait kesalahan p...",naskah rapi kait salah tulis salah ketik tanda...,1,True


In [None]:
# dataset.drop_duplicates(subset="tweet_clean", keep='first', inplace=True)
# dataset.to_csv('news_dataset.csv')

In [None]:
def sentiment(text):
    """Label `text` from its TextBlob polarity.

    Returns 1 when the polarity is >= 0.0 (neutral text therefore counts as
    positive) and 0 otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    return 1 if polarity >= 0.0 else 0
    

In [None]:
# Columns kept from every translated chunk.
bhs_columns = ['Handle', 'Text', 'tweet_clean', 'bhs_ing']

# The four CSV chunks are read in order; every intermediate name is kept
# because a later cell still refers to `raw_data1`.
raw_data1, raw_data2, raw_data3, raw_data4 = (
    pd.read_csv(path)
    for path in (
        'bhs_data_1_2000.csv',
        'bhs_data_2001_4000.csv',
        'bhs_data_4001_6000.csv',
        'bhs_data_6001_7629.csv',
    )
)

df1, df2, df3, df4 = (
    pd.DataFrame(chunk[bhs_columns])
    for chunk in (raw_data1, raw_data2, raw_data3, raw_data4)
)

# Pairwise outer merges on all shared columns.
# NOTE(review): if the intent is simply to stack the chunks, an outer merge
# may reorder rows and combine identical ones - confirm against pd.concat.
df_1 = pd.merge(df1, df2, how='outer')
df_2 = pd.merge(df3, df4, how='outer')
df = pd.merge(df_1, df_2, how='outer')

In [None]:
# Label every English translation with the polarity-based sentiment() helper.
df['sentiment'] = df['bhs_ing'].apply(sentiment)

In [None]:
# Write the labelled frame first, then end the cell with head() so the preview
# is actually rendered (only a cell's last expression is displayed).
# NOTE(review): this writes 'sentiment_dataset.csv' while other cells read
# 'sentimen_dataset.csv' - confirm which file name is intended.
df.to_csv('sentiment_dataset.csv')
df.head()

In [None]:
# Reload the cleaned tweets, add a 'sentiment' label computed from
# 'tweet_clean', and save the result.
raw_data = pd.read_csv('new_dataset.csv')
df = pd.DataFrame(raw_data[['Handle', 'Text', 'tweet_clean']]).assign(
    sentiment=lambda frame: frame['tweet_clean'].apply(sentiment)
)
df.to_csv('sentiment_dataset.csv')

In [None]:
def translate(text):
    """Translate `text` from Indonesian ('id') to English ('en') with a fresh
    googletrans ``Translator`` and return the translated string."""
    return Translator().translate(text, src='id', dest='en').text

In [None]:
# Translate every cleaned tweet to English, one translate() call per row.
# NOTE(review): the column is named 'bhs_inggris' here but other cells read
# 'bhs_ing' - confirm which name the saved CSVs use.
dataset['bhs_inggris'] = dataset['tweet_clean'].apply(translate)

In [None]:
# Persist `dataset` to new_dataset.csv (to_csv also writes the index as the first column by default).
dataset.to_csv('new_dataset.csv')

In [None]:
def sentiment(text):
    """Translate Indonesian `text` to English with TextBlob, then label it.

    Returns 1 when the translated text's polarity is >= 0.0, otherwise 0.

    NOTE(review): this redefines the `sentiment` function from an earlier cell;
    whichever cell ran last wins. Also, `TextBlob.translate` is reportedly
    deprecated/removed in newer TextBlob releases - verify the installed version.
    """
    translated = TextBlob(text).translate(from_lang='id', to='en')
    return 1 if translated.sentiment.polarity >= 0.0 else 0

In [None]:
# dataset.drop(['remove_user', 'remove_symbol', 'remove_duplicate_char', 'remove_emojis', 'tweet_clean', 'ruu'], axis=1, inplace=True)

In [None]:
# dataset.to_csv('sentimen_dataset.csv')

In [None]:
class TfidfFeature(object):
    """Hand-rolled TF-IDF vectoriser.

    ``set_tf_idf_dict`` learns the vocabulary and IDF weights from a frame with
    a 'tweet_clean' column; ``calc_tf_idf`` turns texts into dense vectors
    ordered by the sorted vocabulary.

    Attributes
    ----------
    tf_dict : dict
        term -> number of documents containing the term. Despite its name this
        is a document-frequency table; its keys define the vocabulary.
    idf_dict : dict
        term -> ``log(n_documents / document_frequency)``.
    """

    def __init__(self):
        self.tf_dict = {}
        self.idf_dict = {}

    def __tokenize(self, tweet):
        """Split `tweet` into lower-cased tokens with NLTK's TweetTokenizer."""
        tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True)
        return tokenizer.tokenize(tweet)

    def __calc_TF_Dict(self, document):
        """Return ``{term: raw count}`` for one tokenized document."""
        TF_dict = {}
        for term in document:
            TF_dict[term] = TF_dict.get(term, 0) + 1
        return TF_dict

    def __calc_count_Dict(self, tfDict):
        """Return ``{term: number of documents containing it}`` for an iterable
        of per-document term-count dicts."""
        count_DF = {}
        for document in tfDict:
            # Iterating a dict yields each distinct term once per document.
            for term in document:
                count_DF[term] = count_DF.get(term, 0) + 1
        return count_DF

    def __calc_IDF_Dict(self, n_document, doc_freq):
        """Return ``{term: log(n_document / doc_freq[term])}``."""
        return {term: np.log(n_document / count)
                for term, count in doc_freq.items()}

    def __calc_TF_IDF(self, TF):
        """Return ``{term: tf * idf}`` for one document.

        `TF` maps term -> raw count in that document. The weight uses the
        document's own count; multiplying by ``self.tf_dict`` (a corpus-wide
        document frequency) would give every document the same value for a
        term. Terms with no learned IDF (unseen during fitting) are skipped
        instead of raising ``KeyError``.
        """
        return {term: count * self.idf_dict[term]
                for term, count in TF.items()
                if term in self.idf_dict}

    def __calc_TF_IDF_Vec(self, tfidf_dict, vocabulary=None):
        """Return the dense vector for one document.

        `vocabulary` is the ordered term list; when omitted it is the sorted
        keys of ``self.tf_dict``. Terms absent from `tfidf_dict` get 0.0.
        """
        if vocabulary is None:
            vocabulary = sorted(self.tf_dict.keys())
        return [tfidf_dict.get(term, 0.0) for term in vocabulary]

    def set_tf_idf_dict(self, data):
        """Learn the vocabulary and IDF weights from ``data['tweet_clean']``.

        Side effect: adds the 'tweet_token' and 'tf_dict' columns to `data`.
        """
        data['tweet_token'] = data['tweet_clean'].apply(self.__tokenize)
        data["tf_dict"] = data['tweet_token'].apply(self.__calc_TF_Dict)
        self.tf_dict = self.__calc_count_Dict(data["tf_dict"])
        self.idf_dict = self.__calc_IDF_Dict(len(data), self.tf_dict)

    def calc_tf_idf(self, data):
        """Vectorise a pandas Series of texts.

        Returns a list with one TF-IDF vector (list of floats) per text, each
        ordered by the sorted vocabulary learned in ``set_tf_idf_dict``.
        """
        data_token = data.apply(self.__tokenize)
        data_tf_dict = data_token.apply(self.__calc_TF_Dict)
        data_tfidf_dict = data_tf_dict.apply(self.__calc_TF_IDF)
        # Sort the vocabulary once rather than once per row.
        vocabulary = sorted(self.tf_dict.keys())
        tfidf_vector = [self.__calc_TF_IDF_Vec(row, vocabulary)
                        for row in data_tfidf_dict]
        return tfidf_vector


In [None]:
# Learn the vocabulary and IDF weights from the whole `dataset`.
# NOTE(review): this runs before the train/test split below, so the IDF weights also see the test tweets - confirm this is intended.
feature = TfidfFeature()
feature.set_tf_idf_dict(dataset)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; shuffle=False keeps the original row order.
# NOTE(review): no visible cell adds a 'sentiment' column to `dataset` - verify where it comes from.
x_train, x_test, y_train, y_test = train_test_split(dataset['tweet_clean'], dataset['sentiment'], test_size=0.2, shuffle=False)

In [None]:
# dataset.head()

In [None]:
# Turn both splits into TF-IDF vectors using the already-fitted `feature`.
ft_train = feature.calc_tf_idf(x_train)
ft_test = feature.calc_tf_idf(x_test)

In [None]:
class NaiveBayes(object):
    """Naive Bayes classifier over non-negative feature vectors.

    Per-class feature weights are smoothed with the additive constant `alpha`
    and normalised to sum to one; prediction picks the class with the largest
    log-prior plus weighted log-likelihood.
    """

    def __init__(self, alpha=0.5):
        # Additive smoothing constant applied to every feature total.
        self.alpha = alpha

    def fit(self, X_train, y_train):
        """Estimate class priors and per-class feature likelihoods.

        `X_train` is a 2-D array-like (samples x features); `y_train` holds one
        label per sample. Results are stored on ``_classes``, ``_priors`` and
        ``_conditionals``.
        """
        X_train = np.array(X_train)
        n_samples, n_features = X_train.shape
        self._classes = np.unique(y_train)
        n_classes = len(self._classes)

        self._priors = np.zeros(n_classes)
        self._conditionals = np.zeros((n_classes, n_features))

        for row, label in enumerate(self._classes):
            members = X_train[label == y_train]
            # Prior: share of training samples carrying this label.
            self._priors[row] = members.shape[0] / n_samples
            # Likelihood: smoothed feature totals normalised over all features.
            smoothed = members.sum(axis=0) + self.alpha
            self._conditionals[row, :] = smoothed / np.sum(smoothed)

    def _calc_conditionals(self, cls_cond, x_test):
        """Weight the log-likelihoods of one class by the sample's features."""
        return np.log(cls_cond) * x_test

    def _predict(self, x_test):
        """Return the label with the highest log-posterior for one sample."""
        scores = [
            np.sum(self._calc_conditionals(self._conditionals[row, :], x_test))
            + np.log(self._priors[row])
            for row in range(len(self._classes))
        ]
        return self._classes[np.argmax(scores)]

    def predict(self, X_test):
        """Return a list with the predicted label of every sample in `X_test`."""
        return [self._predict(sample) for sample in X_test]

In [None]:
# Train the classifier on the training vectors and predict the held-out ones.
nb = NaiveBayes()
nb.fit(ft_train, y_train)
predict = nb.predict(ft_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (y_test) against predicted labels.
confus = confusion_matrix(y_test, predict)

In [None]:
# Flatten the matrix into its four cells; this unpacking only works when the matrix is exactly 2x2.
# NOTE(review): precision/recall divide by (tp + fp) and (tp + fn) - confirm these cannot be zero.
tn, fp, fn, tp = confus.ravel()
# All three metrics are expressed as percentages.
accuracy = ((tp + tn)/(tp + tn + fp + fn))*100
precision = (tp / (tp + fp))*100
recall = (tp / (tp + fn))*100

In [None]:
# Report the three metrics (percent values).
print('accuracy =', accuracy)
print('precision =', precision)
print('recall =', recall)

In [None]:
# Reload the tweet export and select the working columns.
# NOTE(review): `raw_data` is loaded here but the selection reads `raw_data1` (a chunk loaded in an earlier cell) - likely a typo; verify which frame is intended.
raw_data = pd.read_csv('sentimen_dataset.csv')
df = pd.DataFrame(raw_data1[['Handle', 'Text', 'tweet_clean']])

In [81]:
# Twitter handles that berita() flags (presumably news/media outlets plus a
# few other accounts to exclude - verify the list with its author).
# Handles that were listed twice in the old if/elif chain appear once here.
_AKUN_BERITA = frozenset({
    '@media_maju',
    '@SINDOnews',
    '@Yahoo_ID',
    '@CNNIDdaily',
    '@hariankompas',
    '@MAJALAH_GATRA',
    '@kompasiana',
    '@tvOneNews',
    '@KompasData',
    '@kompascom',
    '@BORNEONEWS',
    '@jpnncom',
    '@jawapos',
    '@KalbarOnline',
    '@SPN_OR_ID',
    '@detikcom',
    '@detikinet',
    '@CNNIndonesia',
    '@merdekadotcom',
    '@antaranews',
    '@Beritasatu',
    '@cnbcindonesia',
    '@liputan6dotcom',
    '@okezonenews',
    '@suaradotcom',
    '@tempodotco',
    '@tribunnews',
    '@kumparan',
    '@VIVAcoid',
    '@republikaonline',
    '@SonoraFM92',
    '@officialliputan',
    '@detikfinance',
    '@tribunkaltim',
    '@maiwanews',
    '@sbsinews',
    '@tribunmedan',
    '@Metro_TV',
    '@mediablitar',
    '@MoBMaB13',
    '@Zuolinjie_TH',
    '@inyourznx',
    '@jaemin813_th',
    '@cloudypolus',
    '@m_alfqr',
})


def berita(akun):
    """Return True when `akun` is one of the flagged handles, else False.

    The comparison is exact and case-sensitive, and the handle must include
    the leading '@'. Non-string values (e.g. NaN) return False.
    """
    return isinstance(akun, str) and akun in _AKUN_BERITA

In [75]:
# Flag the rows whose author handle is in the berita() account list.
df['berita'] = df['user_account'].apply(berita)

In [None]:
# df.drop('berita', axis=1, inplace=True)

In [76]:
# Remove the rows flagged as 'berita', then drop the helper column. A boolean
# mask avoids copying the whole frame once per removed row.
df = df[~df['berita'].eq(True)].drop(columns='berita')

In [77]:
# Remove the helper 'ruu' column in place; re-running this cell raises KeyError once the column is gone.
df.drop('ruu', axis=1, inplace=True)

In [78]:
# Save the filtered tweets (to_csv also writes the index as the first column by default).
df.to_csv('export.csv')

In [82]:
# One-shot version of the filtering steps: load the export, keep tweets that
# mention the bill, remove the flagged accounts, and save the result.
raw = pd.read_csv('export_dataset.csv')
df = pd.DataFrame(raw[['user_account', 'tweet', 'clean_tweet', 'sentimen']])

# Remove the rows whose cleaned text does not contain the phrase. Rows with a
# missing clean_tweet give NaN here and are kept, as an `== False` test per
# row would keep them.
is_ruu = df['clean_tweet'].str.contains('rancang undang undang')
df = df[~is_ruu.eq(False)]

# Remove the tweets authored by an account that berita() flags.
is_berita = df['user_account'].apply(berita)
df = df[~is_berita.eq(True)].copy()

df.to_csv('export.csv')

In [3]:
# Fixed seed so the shuffle, and therefore the saved file, is reproducible.
SHUFFLE_SEED = 42

raw = pd.read_csv('filter_dataset.csv')
df = pd.DataFrame(raw[['user_account', 'tweet', 'label']])
# frac=1 returns every row in random order (the original index labels are kept).
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

In [4]:
# Save the shuffled frame (to_csv also writes the index as the first column by default).
df.to_csv('filter_dataset1.csv')