## **Term Weighting Using TF-RF**

In [3]:
import ast
from collections import Counter

import numpy as np
import pandas as pd


- *Load the dataset that has already been preprocessed*

### **Stemmed Dataset**

In [4]:
# Read the preprocessed (stemmed) tweets; the file is tab-separated.
load_path = "../dataset/INA_TweetsPPKM_Preprocessed.csv"
df = pd.read_csv(load_path, sep="\t")
# Each "tweet" cell holds the text form of a Python list of tokens.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# arbitrary code that might be embedded in the CSV file.
df["tweet"] = df["tweet"].apply(ast.literal_eval)
df.head(10)


Unnamed: 0,tweet,sentiment
0,"[stimulasi, optimal, laku, online, penuh, butu...",0
1,"[ye, ampun, mbh, orang, wapres, lohampgelar, o...",0
2,"[mulaik, kumat]",0
3,"[welcome, ppkm, bau, bau, naik, level, nih, gi...",0
4,"[sungguh, allah, cipta, hamba, makhluk, lemah,...",0
5,"[negara, apa, sih, inippkm, mulu, otak, jabat,...",0
6,"[atur, ppkm, jelas, kalo, lawan, makin, jelas,...",0
7,"[udh, ga, musim, yg, msih, ngikut, brarti, bru...",0
8,"[hahaha, motogp, kelar, langsung, gedeg, bgt]",0
9,"[ppkm, jadi, landa, jangan, stress, depresi, p...",0


- *Create helper functions to compute term frequency and document frequency*

In [5]:
# 1. Print every entry of a column, one per line
def showAll(column):
    """Print each element of ``column`` on its own line, in iteration order."""
    for entry in iter(column):
        print(entry)

# 2. Print only the texts that belong to one category
def showOneCategory(typeColumn, textColumn, columnType):
    """Print every text whose paired category equals ``columnType``.

    ``textColumn`` and ``typeColumn`` are walked in parallel; pairing stops
    at the shorter of the two.
    """
    for document, label in zip(textColumn, typeColumn):
        if label != columnType:
            continue
        print(document)

# 3. Report whether a term is still missing from the collection (used to avoid duplicates)
def check(array, word):
    """Return True when ``word`` is absent from ``array``, False when it is present."""
    already_present = word in array
    return not already_present

# 4. Merge all preprocessed data into a single term list (without duplicates)
def termMerging(textColumn, termList):
    """Append every not-yet-seen term from ``textColumn`` to ``termList``.

    Args:
        textColumn: iterable of documents, each an iterable of terms.
        termList: list extended in place; terms already in it are kept and
            are not added again. New terms are appended in first-seen order.

    Returns:
        None. The result is delivered by mutating ``termList``.

    Note:
        Terms must be hashable (the tokens here are strings). A set tracks
        which terms were already seen, so each membership test is O(1)
        instead of a scan over the whole, growing ``termList``.
    """
    seen = set(termList)
    for text in textColumn:
        for term in text:
            if term not in seen:
                seen.add(term)
                termList.append(term)

# 5. Count the occurrence of each term in each document
def countTF(textColumn, typeColumn, termList, TFList):
    """Append one term-frequency row per document to ``TFList``.

    Args:
        textColumn: iterable of documents, each a sequence of terms.
        typeColumn: iterable of category labels, paired positionally with
            ``textColumn`` (pairing stops at the shorter of the two).
        termList: vocabulary; fixes the order of the counts in every row.
        TFList: list extended in place with rows of the form
            ``[count(term_0), ..., count(term_n), category]``.

    Returns:
        None. The result is delivered by mutating ``TFList``.

    Note:
        Each document is counted once with ``Counter`` rather than being
        rescanned with ``list.count`` for every vocabulary term. Terms must
        be hashable (the tokens here are strings).
    """
    for text, category in zip(textColumn, typeColumn):
        occurrences = Counter(text)
        # Counter yields 0 for terms that do not occur in this document.
        docList = [occurrences[term] for term in termList]
        docList.append(category)
        TFList.append(docList)

# 6. Cap every count at 1 so the rows can be summed into document frequencies
def changeValue(TFList, prepareDFList):
    """Append a capped copy of each TF row to ``prepareDFList``.

    The last element of every row (the category label) is dropped, and each
    remaining count is replaced by ``min(count, 1)``. ``prepareDFList`` is
    mutated in place; nothing is returned.
    """
    prepareDFList.extend(
        [min(count, 1) for count in row[:-1]] for row in TFList
    )

# 7. Sum each term column to get the number of documents containing it (DF)
def countDF(TFList, DFList):
    """Append the column-wise sums of ``TFList`` to ``DFList``.

    ``TFList`` is transposed with ``zip`` so that each tuple holds one term's
    values across all rows; its sum is appended to ``DFList`` in column
    order. ``DFList`` is mutated in place; nothing is returned.
    """
    DFList.extend(sum(column) for column in zip(*TFList))


In [6]:
# Build the vocabulary: every distinct term found in the tweets, each listed once
term = list()
termMerging(df["tweet"], term)


In [8]:
# Term frequency: one row per tweet holding the count of every vocabulary term,
# followed by the tweet's sentiment label
term_frequency = list()
countTF(
    textColumn=df["tweet"],
    typeColumn=df["sentiment"],
    termList=term,
    TFList=term_frequency,
)
print(len(term_frequency))

10000


In [9]:
# Cap every count at 1 (and drop the label) so that summing a column gives
# the number of documents containing the term
tf_forDF = list()
changeValue(TFList=term_frequency, prepareDFList=tf_forDF)
print(len(tf_forDF))

10000


In [10]:
# Document frequency: for each vocabulary term, the number of tweets it appears in
document_frequency = list()
countDF(TFList=tf_forDF, DFList=document_frequency)
print(len(document_frequency))


13827


### **_Find value for RF & TF-RF_**

<ol>
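  <li>RF (relevance frequency): $rf = \log_{10}\left(2 + \frac{n_{ij}}{\max(1, \bar{n}_{ij})}\right)$, where $n_{ij}$ is the number of positive tweets containing the term and $\bar{n}_{ij}$ is the number of negative tweets containing it</li>
  <li>TF-RF: $tf \times rf$, the count of the term in a tweet multiplied by the term's RF</li>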

</ol>


In [21]:
# method to count TF in each document category
def count_tf_category(dataTableCategory, termList, textColumn):
    """Build term-frequency rows for the documents of one category.

    Args:
        dataTableCategory: table whose ``"sentiment"`` column supplies the
            label appended to each row.
        termList: vocabulary; fixes the order of the counts in every row.
        textColumn: documents (sequences of terms), paired positionally with
            the ``"sentiment"`` column.

    Returns:
        A list with one row per document, shaped
        ``[count(term_0), ..., count(term_n), sentiment]``.

    Note:
        Rows are paired by position rather than by ``[i]`` label lookup, so
        a filtered frame whose index was not reset to 0..n-1 is handled
        correctly. Each document is counted once with ``Counter``; terms
        must be hashable (the tokens here are strings).
    """
    tf_rows = []
    for tokens, label in zip(textColumn, dataTableCategory["sentiment"]):
        occurrences = Counter(tokens)
        # Counter yields 0 for terms that do not occur in this document.
        doclist = [occurrences[term] for term in termList]
        doclist.append(label)
        tf_rows.append(doclist)
    return tf_rows


In [12]:
# Positive subset: keep the rows whose sentiment label is not 0 and renumber them from 0
df_positive = df.loc[df["sentiment"].ne(0)].reset_index(drop=True)
df_positive.head()


Unnamed: 0,tweet,sentiment
0,"[sambut, positif, bijak, perintah, provinsi, j...",1
1,"[tahu, informasi, bagi, wilayah, jabar, dasar,...",1
2,"[beberapa, atur, giat, langsung, revisi, diy, ...",1
3,"[temanteman, tetap, disiplin, protokol, sehat,...",1
4,"[dki, jakarta, pasuk, laku, batas, giat, masya...",1


In [13]:
# Negative subset: keep the rows whose sentiment label is not 1 and renumber them from 0
df_negative = df.loc[df["sentiment"].ne(1)].reset_index(drop=True)
df_negative.head()


Unnamed: 0,tweet,sentiment
0,"[stimulasi, optimal, laku, online, penuh, butu...",0
1,"[ye, ampun, mbh, orang, wapres, lohampgelar, o...",0
2,"[mulaik, kumat]",0
3,"[welcome, ppkm, bau, bau, naik, level, nih, gi...",0
4,"[sungguh, allah, cipta, hamba, makhluk, lemah,...",0


In [23]:
# Term-frequency rows for the positive tweets only
tf_positive_rf = count_tf_category(
    dataTableCategory=df_positive,
    termList=term,
    textColumn=df_positive["tweet"],
)
print(len(tf_positive_rf))


6100


In [24]:
# Term-frequency rows for the negative tweets only
tf_negative_rf = count_tf_category(
    dataTableCategory=df_negative,
    termList=term,
    textColumn=df_negative["tweet"],
)
print(len(tf_negative_rf))


3900


In [25]:
# Positive tweets: cap counts at 1 and drop the label, ready for column sums
tf_positive_df = list()
changeValue(TFList=tf_positive_rf, prepareDFList=tf_positive_df)
print(len(tf_positive_df))


6100


In [26]:
# Negative tweets: cap counts at 1 and drop the label, ready for column sums
tf_negative_df = list()
changeValue(TFList=tf_negative_rf, prepareDFList=tf_negative_df)
print(len(tf_negative_df))


3900


In [27]:
# nij: for each term, the number of positive tweets that contain it
nij = list()
countDF(TFList=tf_positive_df, DFList=nij)
print(len(nij))


13827


In [28]:
# nij_aksen: for each term, the number of negative tweets that contain it
nij_aksen = list()
countDF(TFList=tf_negative_df, DFList=nij_aksen)
print(len(nij_aksen))


13827


In [29]:
# Per term, take max(1, nij_aksen). The RF cell divides by this value, so the
# floor of 1 keeps the division defined for terms absent from negative tweets.
max_numerator = [max(1, negative_count) for negative_count in nij_aksen]
print(max_numerator)


[5, 7, 126, 30, 35, 69, 29, 78, 11, 25, 123, 37, 52, 24, 89, 20, 5, 6, 12, 1, 241, 2, 1, 1, 1, 12, 150, 13, 229, 46, 2, 2, 1, 4, 64, 1, 85, 9, 1, 9, 29, 63, 156, 1, 2, 4, 1074, 6, 50, 162, 118, 102, 6, 20, 37, 4, 6, 3, 9, 16, 26, 45, 65, 1, 15, 31, 26, 25, 3, 102, 323, 120, 1, 51, 16, 50, 189, 3, 1, 1, 79, 1, 1, 280, 1, 1, 1, 270, 24, 393, 1, 42, 48, 165, 16, 99, 1, 214, 32, 9, 435, 3, 2, 1, 1, 1, 2, 10, 2, 16, 47, 2, 23, 297, 1, 182, 19, 5, 48, 27, 70, 3, 69, 5, 3, 41, 68, 37, 46, 6, 88, 75, 126, 35, 8, 1, 10, 1, 8, 42, 4, 97, 7, 12, 1, 1, 200, 19, 2, 35, 17, 5, 129, 53, 13, 83, 1, 19, 22, 69, 34, 1, 138, 37, 104, 3, 12, 8, 9, 17, 515, 47, 40, 1, 41, 26, 287, 37, 28, 1, 15, 195, 1, 11, 33, 34, 11, 81, 5, 11, 2, 14, 83, 91, 8, 2, 89, 31, 48, 6, 1, 2, 31, 4, 50, 168, 4, 38, 11, 3, 7, 126, 31, 1, 1, 1, 2, 102, 8, 71, 56, 22, 1, 12, 19, 15, 35, 32, 5, 6, 122, 18, 81, 4, 7, 1, 21, 15, 6, 23, 3, 4, 18, 5, 37, 1, 19, 2, 10, 25, 6, 9, 11, 1, 8, 14, 19, 1, 2, 16, 1, 2, 3, 23, 1, 23, 1, 4, 5, 9

In [30]:
import math

# RF per term: log10(2 + nij / max(1, nij_aksen)), rounded to 5 decimal places.
# NOTE(review): the logarithm here is base 10; RF is often stated with base 2 —
# confirm that base 10 is intended.
rf_list = [
    round(math.log10(2 + (positive_count / max_numerator[idx])), 5)
    for idx, positive_count in enumerate(nij)
]
print(rf_list)


[0.30103, 0.63202, 0.89963, 0.65321, 0.48936, 0.52099, 0.78558, 0.79005, 0.88289, 0.68485, 0.58587, 0.73283, 0.84271, 1.10551, 0.85201, 0.75205, 0.38021, 0.39794, 0.35218, 0.30103, 0.3903, 0.54407, 0.30103, 0.47712, 0.60206, 0.30103, 0.62325, 0.49884, 0.37974, 0.59731, 0.30103, 0.30103, 0.30103, 0.69897, 0.61707, 0.30103, 0.38864, 0.34679, 0.69897, 0.46073, 0.76032, 0.55477, 0.5673, 0.30103, 0.30103, 0.39794, 0.61872, 0.30103, 0.51851, 0.63122, 0.54302, 0.37161, 0.30103, 0.40654, 0.4318, 0.57403, 0.36798, 0.30103, 0.32451, 0.74527, 0.73425, 0.38021, 0.37178, 0.47712, 0.5721, 0.45312, 0.62642, 0.60638, 0.8653, 0.37522, 0.41919, 0.38021, 0.30103, 0.31362, 0.35218, 0.35025, 0.3719, 0.36798, 0.30103, 0.30103, 0.55416, 0.30103, 0.30103, 0.39794, 0.47712, 0.30103, 0.30103, 0.35468, 0.36015, 0.37456, 0.30103, 0.60206, 0.48608, 0.37023, 0.62839, 0.45461, 0.47712, 0.58709, 0.34611, 0.36798, 0.40172, 0.36798, 0.39794, 0.30103, 0.30103, 0.30103, 0.30103, 0.30103, 0.47712, 0.33995, 0.63753, 0.3010

In [31]:
# Stack the per-class TF rows into one table: every positive tweet first,
# followed by every negative tweet.
tf_forRF = [*tf_positive_rf, *tf_negative_rf]

print(len(tf_forRF))


10000


In [32]:
# TF-RF: scale each term count by that term's RF; the trailing sentiment
# label of every row is carried over unchanged.
tf_rf = [
    [rf_list[idx] * count for idx, count in enumerate(tf_row[:-1])] + [tf_row[-1]]
    for tf_row in tf_forRF
]
print(len(tf_rf))


10000


### **_Save Dataframe To CSV_**


In [33]:
# Column labels for the output table: one per vocabulary term plus the label column.
# A new list is built here rather than aliasing `term`; appending to an alias would
# add 'sentiment' to the vocabulary itself and add it again on every re-run of this cell.
columnName = [*term, 'sentiment']

In [34]:
# Assemble the TF-RF rows into a labelled table and write it out as a
# tab-separated file without the row index.
data = pd.DataFrame(data=tf_rf, columns=columnName)
data.to_csv(
    "../dataset/INA_TweetsPPKM_TFRF.csv",
    sep="\t",
    index=False,
)


### **_Unstemmed Dataset_**


- _Calculate TF, RF & TF-RF for the unstemmed dataset so the two can be compared later_


In [1]:
import pandas as pd

In [2]:
# Load the preprocessed tweets that were not stemmed (tab-separated file)
df_noStem = pd.read_csv(
    filepath_or_buffer="../dataset/INA_TweetsPPKM_Preprocessed_NoStemming.csv",
    sep="\t",
)


In [3]:
# Preview the first five rows of the unstemmed data
df_noStem.head(n=5)

Unnamed: 0,tweet,sentiment
0,"['stimulasi', 'optimal', 'dilakukan', 'online'...",0
1,"['ye', 'ampun', 'mbh', 'seorang', 'wapres', 'l...",0
2,"['mulaik', 'kumat']",0
3,"['welcome', 'ppkm', 'bau', 'bau', 'naik', 'lev...",0
4,"['sesungguhnya', 'allah', 'menciptakan', 'hamb...",0
