# Analisis Sentimen Opini Masyarakat Terhadap Vaksinasi Nasional Menggunakan Metode Naïve Bayes dengan Seleksi Fitur TF-IDF (Bagian TF-IDF)

Jadi dalam notebook ini, kita preprocessing dulu datasetnya. Lalu kita split datasetnya (80:20) dan hitung TF-IDF untuk fitur yang akan digunakan di Naive Bayes.

## Import library yang dibutuhkan
Kita butuh beberapa library biar semuanya lebih praktis dan perhitungannya efisien (ini yang paling penting). Tenang, semua library ini bukan "fancy library" kok. Ini semua library basic untuk text processing.

In [None]:
import pandas as pd
import numpy as np #untuk perhitungan array yang efisien
import re #regex (untuk manipulasi karakter yang ada dalam string)
import string 
from nltk.tokenize import word_tokenize #tokenisasi
from nltk.probability import FreqDist #menghitung jumlah kemunculan tiap kata dalam dokumen
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary #untuk mendapatkan list stopword
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #stemming
import ast #untuk mengubah string yang ada di file .csv menjadi list
import seaborn as sns #visualisasi data
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib as mpl
import csv
from numpy.random import RandomState

## Memuat dataset
Pertama, kita muat dulu seluruh dataset tweet. Pembagian menjadi training set dan testing set baru dilakukan setelah preprocessing selesai.

In [None]:
# Path to the raw tweet dataset; later cells read its 'tweet' and 'label' columns.
dataset = "dataset/dataset-tweet-vaksinasi-nasional.csv"
tweet_data = pd.read_csv(dataset)
# Preview the first rows.
tweet_data.head()

In [None]:
%matplotlib inline
# Global plot styling: seaborn theme and a 14x8 default figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
mpl.rcParams['figure.figsize'] = 14, 8

# Bar chart of how many tweets carry each value of the 'label' column.
f = sns.countplot(x='label', data=tweet_data)
f.set_title("Sentiment Distribution")
# NOTE(review): the tick labels are hard-coded; this assumes exactly two label
# values plotted in negative-then-positive order -- confirm against the dataset.
f.set_xticklabels(['Negative', 'Positive'])
# Blank out the x-axis title.
plt.xlabel("");

## Preprocessing data
Karena data dari twitter itu data mentah, kita preprocessing datanya dulu biar gampang diolah.

- Case Folding (lowercasing)
- Cleaning
- Tokenization
- Stopword removal
- Converting/normalization
- Stemming

### Case Folding

In [None]:
def lowercasing(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
# Case-fold every raw tweet into a new 'casefolding' column, then preview it.
tweet_data['casefolding'] = tweet_data['tweet'].apply(lowercasing)
tweet_data['casefolding'].head()

### Cleaning

In [None]:
def cleaning(text):
    """Strip Twitter noise from a (case-folded) tweet.

    Removes, in order: literal escape sequences and backslashes, non-ASCII
    characters, @mentions and URLs, punctuation, and digits. Whitespace is
    normalized last so that gaps opened by the earlier removals do not
    survive as double, leading, or trailing spaces.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The cleaned text, with words separated by single spaces.
    """
    # Literal "\t", "\n", "\u" sequences (backslash followed by a letter) and
    # any remaining backslashes.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Non-ASCII characters (emoji, CJK, ...) become '?', which is removed
    # together with the rest of the punctuation further down.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Mentions and complete links. Twitter handles may contain underscores,
    # so match \w+ instead of [A-Za-z0-9]+ to avoid leaving "_name" behind.
    text = re.sub(r"(@\w+)|(\w+://\S+)", " ", text)
    # Truncated URLs that consist of the scheme only.
    text = text.replace("http://", " ").replace("https://", " ")
    # Punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Digits.
    text = re.sub(r"\d+", "", text)
    # Trim and collapse runs of whitespace into single spaces.
    return " ".join(text.split())

In [None]:
# Clean every case-folded tweet into a new 'cleaning' column, then preview it.
tweet_data['cleaning'] = tweet_data['casefolding'].apply(cleaning)
tweet_data['cleaning'].head()

### Tokenization

In [None]:
# Tokenize with NLTK's word_tokenize.
def tokenization(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Frequency distribution of the tokens within a single tweet.
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from ``text`` (applied to a tweet's token list)."""
    distribution = FreqDist(text)
    return distribution

In [None]:
# Tokenize every cleaned tweet into a new 'tokenization' column, then preview it.
tweet_data['tokenization'] = tweet_data['cleaning'].apply(tokenization)
tweet_data['tokenization'].head()

In [None]:
# Store a per-tweet token frequency distribution, then show the first rows
# as (token, count) pairs ordered from most to least common.
tweet_data['tokens_freqdist'] = tweet_data['tokenization'].apply(freqDist_wrapper)
tweet_data['tokens_freqdist'].head().apply(lambda x : x.most_common())

### Stopword Removal

In [None]:
# Default stopwords shipped with the Sastrawi library.
stopword_factory = StopWordRemoverFactory().get_stop_words()
# Project-specific stopwords maintained in a CSV file.
stopword_csv = "csv files/stopwords.csv"
# Take the first column of every row. csv.reader yields an empty list for a
# blank line, so those rows are skipped instead of raising IndexError on row[0].
with open(stopword_csv, newline='') as inputfile:
    more_stopword = [row[0] for row in csv.reader(inputfile) if row]

# Merge both sources; the set gives constant-time membership tests when the
# tweets are filtered. The name list_stopwords is what the removal function reads.
stopword_data = stopword_factory + more_stopword
list_stopwords = set(stopword_data)

# Drop the stopwords from a tweet's token list.
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in order."""
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
# Filter stopwords out of every token list into a new 'stopword_removal' column, then preview it.
tweet_data['stopword_removal'] = tweet_data['tokenization'].apply(stopwords_removal) 
tweet_data['stopword_removal'].head()

### Normalization/Converting

In [None]:
# Load the hand-made normalization dictionary.
normalizad_word = pd.read_csv("csv files/normalization.csv")
# Map each raw term (first column) to its normalized form (second column).
# The columns are selected positionally with .iloc because integer keys such as
# row[0] on a label-indexed Series are deprecated positional access in pandas 2.x.
# setdefault keeps the first mapping when a raw term is listed more than once.
normalizad_word_dict = {}
for raw_term, normalized in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(raw_term, normalized)

# Normalize the terms of one tweet.
def normalized_term(document):
    """Replace each term of ``document`` found in ``normalizad_word_dict`` by its mapped value.

    Terms without an entry are passed through unchanged; the result is a new list.
    """
    return [normalizad_word_dict.get(term, term) for term in document]

In [None]:
# Normalize every stopword-filtered token list into a new 'converting' column, then preview it.
tweet_data['converting'] = tweet_data['stopword_removal'].apply(normalized_term)
tweet_data['converting'].head()

### Stemming

In [None]:
# Create the Sastrawi stemmer instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem a single term.
def stemmed_wrapper(term):
    """Return the result of running ``term`` through the module-level Sastrawi ``stemmer``."""
    stem = stemmer.stem(term)
    return stem

# Build a term -> stem lookup covering every distinct term in the normalized
# tweets, printing each pair for inspection. Each distinct term is stemmed once.
term_dict = dict.fromkeys(
    (term for document in tweet_data['converting'] for term in document),
    ' ',
)

print(len(term_dict))
print("-----------------------------------------------------")

for term in term_dict:
    stem = stemmed_wrapper(term)
    term_dict[term] = stem
    print(term, ":", stem)

In [None]:
# Stem a whole tweet by looking each term up in the precomputed term_dict.
def get_stemmed_term(document):
    """Return the ``term_dict`` value of every term in ``document``, in order.

    Raises:
        KeyError: If a term is missing from ``term_dict``.
    """
    stems = []
    for term in document:
        stems.append(term_dict[term])
    return stems

In [None]:
# Stem every normalized token list into a new 'stemming' column, then preview it.
tweet_data['stemming'] = tweet_data['converting'].apply(get_stemmed_term)
tweet_data['stemming'].head()

Preprocessing selesai! Kita simpen dulu ke file .csv

In [None]:
# Persist every preprocessing column. List-valued cells are written as text,
# so they have to be parsed back into lists after the file is reloaded.
tweet_data.to_csv("csv files/preprocessing.csv")

### Split Dataset

In [None]:
# Reload only the label and the stemmed tokens, exposing the latter as 'tweet'.
prepro_dataset = pd.read_csv(
    "csv files/preprocessing.csv",
    usecols=["label", "stemming"],
).rename(columns={"stemming": "tweet"})
prepro_dataset.head()

In [None]:
# Uncomment the lines below to generate a new training set (shortcut: select them and press Ctrl+/).
# The sampled rows keep their original index so the test-set cell can exclude them by index;
# the index is reset only in the copy that is written to disk.

# train_set = prepro_dataset.sample(frac=0.8, random_state = RandomState())
# train_set.reset_index(drop=True).to_csv("csv files/train_80.csv", index=False)
# train_set

In [None]:
# Uncomment the lines below to generate a new test set (shortcut: select them and press Ctrl+/).
# Run the training-set cell first: the test set is every row that was not sampled into train_set.

# test_set = prepro_dataset.loc[~prepro_dataset.index.isin(train_set.index)]
# test_set = test_set.reset_index(drop=True)
# test_set.to_csv("csv files/test_20.csv", index=False)
# test_set

## Features Extraction: TF-IDF
Sebenarnya fitur yang digunakan dalam Naive Bayes ini bisa saja berupa semua kata yang ada di data. Tapi sekarang kita pakai TF-IDF: tiap kata diberi bobot, lalu kata-kata dirangking berdasarkan bobot tersebut. Jadi bisa dibilang, TF-IDF ini tujuannya untuk mendapatkan kata-kata penting yang mengklasifikasikan sentimen dari sebuah tweet itu positif atau negatif. Bisa dibilang juga, kita mencari pola sentimen tweet dengan menggunakan TF-IDF.

### Import training set

In [None]:
# Load the training split; its 'tweet' column is parsed into token lists in a later cell.
train_csv = "csv files/train_80.csv"
train = pd.read_csv(train_csv)
train.head()

In [24]:
# Parse a tweet stored as text back into a list of tokens.
def convert_text_list(texts):
    """Evaluate the Python literal in ``texts`` and return its items as a new list.

    Parsing uses ``ast.literal_eval``, which only accepts literals.
    """
    parsed = ast.literal_eval(texts)
    return list(parsed)

In [None]:
# Parse every stored tweet string back into a list of tokens.
train["tweet_list"] = train["tweet"].apply(convert_text_list)
# Show the tweet at index 1 and its Python type.
print(train["tweet_list"][1])
print("\ntype : ", type(train["tweet_list"][1]))

In [None]:
# Term frequency of every term within one tweet.
def calc_TF(document):
    """Return ``{term: occurrences / len(document)}`` for the tokens in ``document``.

    Keys appear in order of first occurrence; an empty document yields ``{}``.
    """
    occurrences = {}
    for term in document:
        occurrences[term] = occurrences.get(term, 0) + 1
    # Normalize the raw counts by the tweet length.
    return {term: count / len(document) for term, count in occurrences.items()}

In [None]:
# Compute the TF dictionary of every tweet into a new 'TF_dict' column, then preview it.
train["TF_dict"] = train['tweet_list'].apply(calc_TF)
train["TF_dict"].head()

In [None]:
# Inspect the TF values of one tweet (change `index` to look at another row).
index = 0

print('%20s' % "term", "\t", "TF\n")
for term, tf_value in train["TF_dict"][index].items():
    print('%20s' % term, "\t", tf_value)

In [None]:
# Document frequency: in how many documents each term occurs.
def calc_DF(tfDict):
    """Count, for every term, the number of entries of ``tfDict`` that contain it.

    ``tfDict`` is an iterable of per-document dictionaries keyed by term; since
    each key occurs once per dictionary, every document adds at most 1 per term.
    """
    document_counts = {}
    for tf_of_document in tfDict:
        for term in tf_of_document:
            document_counts[term] = document_counts.get(term, 0) + 1
    return document_counts

In [None]:
# Number of rows (tweets) in the training set; passed as the document count when computing IDF.
n_document = len(train)

# Inverse document frequency of every term.
def calc_IDF(__n_document, __DF):
    """Return ``{term: ln(__n_document / (__DF[term] + 1))}`` for every term in ``__DF``.

    Because of the ``+ 1`` in the denominator, a term whose document frequency
    equals ``__n_document`` receives a negative weight.
    """
    return {
        term: np.log(__n_document / (doc_freq + 1))
        for term, doc_freq in __DF.items()
    }

In [None]:
# Document frequency of every term in the training set. DF was previously read
# here without ever being assigned, which raises NameError on a top-to-bottom run.
DF = calc_DF(train["TF_dict"])
# Inverse document frequency of every term, keyed by term.
IDF = calc_IDF(n_document, DF)
print(IDF)

In [None]:
# TF-IDF of every term within one tweet.
def calc_TF_IDF(TF):
    """Multiply each term's TF value in ``TF`` by its entry in the module-level ``IDF``.

    Raises:
        KeyError: If a term of ``TF`` has no entry in ``IDF``.
    """
    return {term: tf_value * IDF[term] for term, tf_value in TF.items()}

# Compute the TF-IDF dictionary of every tweet into a new 'TF-IDF_dict' column and print it.
train["TF-IDF_dict"] = train["TF_dict"].apply(calc_TF_IDF)
print(train["TF-IDF_dict"])

In [None]:
# Show TF next to TF-IDF for every term of one tweet (change `index` to look at another row).
index = 1

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
tf_of_tweet = train["TF_dict"][index]
for term, tfidf_value in train["TF-IDF_dict"][index].items():
    print('%20s' % term, "\t", tf_of_tweet[term], "\t", tfidf_value)

In [None]:
# Fix one vocabulary order so that every tweet can be turned into a vector with
# the same layout, which is what allows the TF-IDF values to be summed per term.

# (term, DF) pairs ordered by document frequency, highest first.
sorted_DF = sorted(DF.items(), key=lambda pair: pair[1], reverse=True)

# The terms alone, in that same order.
unique_term = [term for term, _ in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict):
    """Lay one tweet's TF-IDF values out along the ``unique_term`` vocabulary.

    Returns a list with one entry per term of ``unique_term``: the tweet's
    TF-IDF value for that term, or ``0.0`` when the tweet does not contain it.
    """
    return [__TF_IDF_Dict.get(term, 0.0) for term in unique_term]

# Vectorize every tweet's TF-IDF dictionary into a new 'TF_IDF_Vec' column.
train["TF_IDF_Vec"] = train["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

# Show the vector of the first tweet.
print("print first row matrix TF_IDF_Vec Series\n")
print(train["TF_IDF_Vec"][0])

# Length of that vector, i.e. the number of terms in unique_term.
print("\nmatrix size : ", len(train["TF_IDF_Vec"][0]))

In [None]:
# Stack the per-tweet vectors into one array (a row per tweet, a column per term).
TF_IDF_Vec_List = np.array(train["TF_IDF_Vec"].to_list())

# Sum along axis 0: the total TF-IDF weight of each term over all tweets.
sums = TF_IDF_Vec_List.sum(axis=0)
data = [(term, sums[col]) for col, term in enumerate(unique_term)]

# Rank the terms by that total, highest first, and show the top 20.
ranking = pd.DataFrame(data, columns=['term', 'rank'])
features_rank = ranking.sort_values('rank', ascending=False)
features_rank.head(20)

TF-IDF selesai! Kita simpan ke file .csv

In [None]:
# Write the ranked terms to disk without the DataFrame index column.
features_rank.to_csv("csv files/tfidf_rank_train80.csv", index=False)