# Import Libraries

In [1]:
import csv
import pandas as pd
import nltk
import re
import requests
import string

from io import StringIO
from IPython.display import clear_output, display, HTML
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tqdm import tqdm

In [2]:
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Load Dataset

In [3]:
dir_ = "dataset/"
file_path = dir_ + 'oshibe_spv_comments_2025-01-15.csv'
df = pd.read_csv(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22996 entries, 0 to 22995
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          22996 non-null  object 
 1   ParentID    11079 non-null  object 
 2   Timestamp   22996 non-null  object 
 3   Username    22996 non-null  object 
 4   Comment     22993 non-null  object 
 5   LikeCount   22996 non-null  int64  
 6   ReplyCount  11917 non-null  float64
 7   Date        22996 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ MB


# Data Preprocessing

## Data Types

In [4]:
# Convert data type of 'ReplyCount' to integer
df['ReplyCount'] = pd.to_numeric(df['ReplyCount'], errors='coerce').astype('Int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22996 entries, 0 to 22995
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          22996 non-null  object
 1   ParentID    11079 non-null  object
 2   Timestamp   22996 non-null  object
 3   Username    22996 non-null  object
 4   Comment     22993 non-null  object
 5   LikeCount   22996 non-null  int64 
 6   ReplyCount  11917 non-null  Int64 
 7   Date        22996 non-null  object
dtypes: Int64(1), int64(1), object(6)
memory usage: 1.4+ MB


## Missing Values

In [5]:
# Check Missing Values
df[df['Comment'].isnull()]

Unnamed: 0,ID,ParentID,Timestamp,Username,Comment,LikeCount,ReplyCount,Date
302,UgwUen0WTAZIqnC6hKJ4AaABAg,,2024-10-24T02:01:16Z,@ehasitijulaeha2522,,1,0,2024-10-24T02:01:16Z
584,UgwYhw0GzhZZSOTA8il4AaABAg,,2024-08-30T14:41:09Z,@stepme,,0,0,2024-08-30T14:41:09Z
18667,UgyQVu755DqSxX5PTIV4AaABAg,,2023-03-13T20:08:42Z,@ghandithesupremeleader9740,,4,0,2023-03-13T20:08:42Z


- Terdapat 3 baris dengan komentar kosong (kemungkinan hanya berisi karakter yang tidak berhasil di-encode di api call), ketiga baris ini bisa dihapus saja

In [6]:
df.dropna(subset=['Comment'], inplace=True)
df.shape

(22993, 8)

## Duplicated Data

In [7]:
# Drop Duplicates
df = df.drop_duplicates(subset=['Username', 'Comment']).reset_index(drop=True)
df.shape

(22160, 8)

- Komentar yang sama persis (duplikat) dari username yang sama juga cukup dipertahankan satu saja

## Top Level Comments

In [8]:
# Check Top Comments and retain the original 'ID' index
comments = df.copy()
comments = comments[comments['ParentID'].isnull()].sort_values(by=['LikeCount', 'ReplyCount'], ascending=False)
comments = comments[['ID', 'Username', 'Comment', 'LikeCount', 'ReplyCount', 'Date']]
comments

Unnamed: 0,ID,Username,Comment,LikeCount,ReplyCount,Date
13367,UgzWvu72I8m9-U8pq8F4AaABAg,@onthebluesky,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",19405,751,2023-03-14T11:13:13Z
12665,Ugxb2yPnhvOFAaF_b2d4AaABAg,@driezkh,Performance Videonya kaya memberitahu kita ten...,2319,70,2023-03-20T17:56:27Z
20990,UgzCYP-5eQOScO828UZ4AaABAg,@adanjir1923,Satu persatu member diberikan kesempatan buat...,1885,63,2023-03-13T13:16:02Z
16159,UgwQ0xdL1_z3bGf9UM94AaABAg,@Jkt48990,"fiks, kalau kedepan jkt48 release single MVnya...",1863,80,2023-03-14T05:24:15Z
2359,UgyO_jkZ191_KXd7EUR4AaABAg,@ahmadfikri5186,Malam ini rahasia ya\nKamu tak boleh bilang si...,967,13,2023-10-21T18:45:55Z
...,...,...,...,...,...,...
22086,Ugyv4YBfwgcaua5rGux4AaABAg,@rizalfahri6435,"Apakah Shani jadi center lagi, ataukah dipanta...",0,0,2023-03-13T09:45:54Z
22098,UgwoO7UeC3qIc7KhmVd4AaABAg,@isnanyusuf3575,infokan,0,0,2023-03-13T09:36:15Z
22142,UgwJpMpkqqf_ABdjzHp4AaABAg,@johanafandi11,Nitip,0,0,2023-03-13T08:57:49Z
22150,UgwusAPX-itWdsA-SKh4AaABAg,@fahmiaditakurnia3734,ninggalin jejak,0,0,2023-03-13T08:51:32Z


## Comment Replies

In [9]:
# Check Replies
replies = df.copy()
replies = replies[replies['ParentID'].notnull()]
replies = replies[['ID', 'ParentID', 'Comment', 'LikeCount', 'Date']]
replies

Unnamed: 0,ID,ParentID,Comment,LikeCount,Date
11,UgxCK8DSLpRl2ZWP6pp4AaABAg.ACvSDxWXifuAD8OszKbyrr,UgxCK8DSLpRl2ZWP6pp4AaABAg,"Ini bukan lgbt, ini menceritakan tentang salah...",1,2025-01-11T03:26:35Z
12,UgxCK8DSLpRl2ZWP6pp4AaABAg.ACvSDxWXifuAD8PrrtD5HV,UgxCK8DSLpRl2ZWP6pp4AaABAg,@Christyyyy-bt5ps guru gembul aja bilang ini ...,0,2025-01-11T03:35:10Z
15,UgyO59JOKmUo-5QjmLB4AaABAg.ACuX5XgWREnACuijG8AjqW,UgyO59JOKmUo-5QjmLB4AaABAg,Jelas sekali bapak nya ga ngerti lagunya artin...,0,2025-01-05T10:40:09Z
16,UgyO59JOKmUo-5QjmLB4AaABAg.ACuX5XgWREnACukZrgQyyQ,UgyO59JOKmUo-5QjmLB4AaABAg,"@Melvinbryanchiri iya si ya, emang liriknya aj...",1,2025-01-05T10:56:12Z
23,Ugwb_ySgHtkmV4N2rzt4AaABAg.ACuQo4KFnXKACuRacBYOFR,Ugwb_ySgHtkmV4N2rzt4AaABAg,Kali ini bakal dari komunitas atau pengikut da...,0,2025-01-05T08:01:41Z
...,...,...,...,...,...
22130,UgxCXJhXOWtorqLHqCZ4AaABAg.9nBq1cHMtM79nBrWchLTdX,UgxCXJhXOWtorqLHqCZ4AaABAg,New song dalam keterangannya di Twitter,0,2023-03-13T09:35:37Z
22138,UgxRImDNvvZLHAQpMdN4AaABAg.9nBoDtxPza09nCGKl_liwE,UgxRImDNvvZLHAQpMdN4AaABAg,Tebakan yang sangat akurat,0,2023-03-13T13:21:11Z
22144,UgwbfBy7tSP4XWpmQjx4AaABAg.9nBn3nSxK9l9nBqKItxfvS,UgwbfBy7tSP4XWpmQjx4AaABAg,@@AbdulSalam-xe2cq kenapa woy 😑😑,0,2023-03-13T09:25:12Z
22148,Ugwxd5VGdiMxfkC4Ck14AaABAg.9nBmZqUHwKH9nBqClC7kHb,Ugwxd5VGdiMxfkC4Ck14AaABAg,Dh lewat ngav kwkw,0,2023-03-13T09:24:10Z


- Sentiment analysis hanya dilakukan pada top_level_comments (tidak termasuk replies/balasan komentar), karena top_level_comments inilah yang ditujukan untuk videonya

## Japanese Characters

In [10]:
# Function to check for Japanese characters in a string
def contains_japanese(text):
    return bool(re.search(r"[\u3040-\u30FF\u4E00-\u9FFF]", text))

japanese_rows = comments[comments['Comment'].apply(contains_japanese)]
japanese_rows

Unnamed: 0,ID,Username,Comment,LikeCount,ReplyCount,Date
12371,Ugxh_GTZZhdvvVeI_514AaABAg,@triadamas,電影攝影非常好，聲音和視覺效果都是傑作,67,3,2023-03-14T15:13:58Z
5477,UgweRdTljtSvfhA7xfB4AaABAg,@UnikUmbra,Fakta nya Lagu ini mempunyai Versi aslinya asa...,15,0,2023-03-23T04:31:39Z
12019,Ugx0-rG-HWFNxwY09Id4AaABAg,@kiami26202,オリジナル曲だと思ったらおしべ🦋でした,13,0,2023-03-14T19:15:53Z
5305,Ugwp2gvUDMIEqpuQ1y94AaABAg,@angieminipin3536,インドネシア人はアラビアンナイト好きなの？,11,4,2023-03-24T12:10:43Z
6566,UgzLCcGNWBDIa2kdoPt4AaABAg,@nasikamin,おしべとめしべと夜の蝶々,8,1,2023-03-18T14:18:25Z
19280,UgzpphrivqoxRuua3_J4AaABAg,@menshiro777,nice video !! 最高でしたー。　やっぱりjkt48は最高です。,7,0,2023-03-13T14:48:47Z
3840,UgwZ4z6P1V5R-ygfmQd4AaABAg,@halleyhuang5700,Pertama denger malah di SNH48 夜蝶 （kupu malam)\...,5,0,2023-04-26T23:21:44Z
9774,UgxSVoAshXQ3S1L_KF94AaABAg,@Ginojhisusei,すごい,5,0,2023-03-15T11:45:32Z
18541,UgwZISpkQIgciPvs8_d4AaABAg,@can-cn9vs,マーシャ可愛い❤センターおめでとう㊗️,5,0,2023-03-13T16:36:07Z
2582,UgyXvpM7_QrF5WJubeF4AaABAg,@wataru541604,アラビアン風素敵です。,2,1,2023-09-14T00:40:06Z


In [11]:
len(japanese_rows)

15

- Terdapat 15 komentar yang mengandung huruf Jepang, bagian komentar berhuruf Jepang ini akan dihapus

In [12]:
# Function to clean text by removing non-ASCII characters and reducing multiple spaces
def clean_japanese_text(text):
    # Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", '', text)
    # Remove Japanese characters (Hiragana, Katakana, Kanji)
    text = re.sub(r"[\u3040-\u30FF\u4E00-\u9FFF]+", '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    return text.strip()

comments['Comment_clean'] = comments['Comment'].apply(clean_japanese_text)
comments[['Comment', 'Comment_clean']].head(15)

Unnamed: 0,Comment,Comment_clean
13367,"Guys, lagu ini bukan tentang LGBT, tapi tentan...","Guys, lagu ini bukan tentang LGBT, tapi tentan..."
12665,Performance Videonya kaya memberitahu kita ten...,Performance Videonya kaya memberitahu kita ten...
20990,Satu persatu member diberikan kesempatan buat...,Satu persatu member diberikan kesempatan buat ...
16159,"fiks, kalau kedepan jkt48 release single MVnya...","fiks, kalau kedepan jkt48 release single MVnya..."
2359,Malam ini rahasia ya\nKamu tak boleh bilang si...,Malam ini rahasia ya Kamu tak boleh bilang sia...
6792,"Terlepas dari kontroversi yang ada, sejujurnya...","Terlepas dari kontroversi yang ada, sejujurnya..."
21119,Terlepas dari hate comen 18+. Jujur ini suatu ...,Terlepas dari hate comen 18+. Jujur ini suatu ...
5083,"Gila konsep MV nya keren banget, good job JKT48","Gila konsep MV nya keren banget, good job JKT48"
20771,Congrats JKT48 NEW ERA atas mini albumnya. JKT...,Congrats JKT48 NEW ERA atas mini albumnya. JKT...
1882,"buay yg blg lesbi itu salah besar ya, ini tuh ...","buay yg blg lesbi itu salah besar ya, ini tuh ..."


In [13]:
# Display rows where the 'Comment_clean' column is empty (NaN or empty string)
empty_comments = comments[comments['Comment_clean'].isna() | (comments['Comment_clean'].str.strip() == '')]
empty_comments[['Comment', 'Comment_clean']]

Unnamed: 0,Comment,Comment_clean
12371,電影攝影非常好，聲音和視覺效果都是傑作,
12019,オリジナル曲だと思ったらおしべ🦋でした,
5305,インドネシア人はアラビアンナイト好きなの？,
7201,❤,
6566,おしべとめしべと夜の蝶々,
...,...,...
21748,😳,
21871,🔥🔥🔥,
21936,❤❤❤❤❤,
21944,🔥🔥,


- Komentar yang hanya menggunakan huruf Jepang atau emoticon akan dihapus

In [14]:
comments.shape

(11855, 7)

In [15]:
comments = comments[comments['Comment_clean'].notna() & (comments['Comment_clean'] != '')].copy()
comments.shape

(11650, 7)

## Cleaning, CaseFolding, Tokenizing, Remove Stopwords, toSentence

In [16]:
def cleaningText(text):
    # Step 1: Convert 'jkt48' to a temporary placeholder to protect it
    text = re.sub(r'\bjkt48\b', 'JKTSPECIAL', text, flags=re.IGNORECASE)
    
    # Step 2: Remove unwanted elements
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Remove hashtags
    text = re.sub(r"http\S+", '', text)  # Remove links
    text = re.sub(r'(?<!JKTSPECIAL)[0-9]+', '', text)  # Remove numbers except in our placeholder
    
    # Step 3: Remove punctuations and extra spaces
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove all punctuations
    text = text.replace('\n', ' ')  # Replace new lines with space
    
    # Step 4: Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()  # Remove spaces from both ends of the text
    
    # Step 5: Restore 'jkt48' from placeholder
    text = text.replace('JKTSPECIAL', 'jkt48')
    
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case
    text = text.lower()
    return text

def tokenizingText(text): # Tokenizing or splitting a string, text into a list of tokens
    text = word_tokenize(text)
    return text

def removeStopwords(text): # Remove stopwors in a text
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords_en = set(stopwords.words('english'))
    listStopwords.update(listStopwords_en)
    listStopwords.update(['iya', 'yaa', "ya", "gak", 'nya', 'na', 'sih', 'ku', "di", "ga", "gaa", "gak", "loh", "kah", "woi", "woii", "woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

In [17]:
text_test = '@arguto93 iya JKT48 makin keren cuy! Menit 1:03 Marsha jos banget'
text_clean = cleaningText(text_test)
text_clean

'iya jkt48 makin keren cuy Menit Marsha jos banget'

In [18]:
text_clean = casefoldingText(text_clean)
text_clean

'iya jkt48 makin keren cuy menit marsha jos banget'

In [19]:
text_clean = tokenizingText(text_clean)
text_clean

['iya', 'jkt48', 'makin', 'keren', 'cuy', 'menit', 'marsha', 'jos', 'banget']

In [20]:
text_clean = removeStopwords(text_clean)
text_clean

['jkt48', 'keren', 'cuy', 'menit', 'marsha', 'jos', 'banget']

In [21]:
text_clean = toSentence(text_clean)
text_clean

'jkt48 keren cuy menit marsha jos banget'

In [22]:
# Apply the cleaning, casefolding, tokenizing, stopword removal, sentence formatting
comments['Comment_clean_words'] = comments['Comment_clean'].apply(cleaningText)
comments['Comment_clean_words'] = comments['Comment_clean_words'].apply(casefoldingText)
comments['Comment_clean_words'] = comments['Comment_clean_words'].apply(tokenizingText)
comments['Comment_clean_words'] = comments['Comment_clean_words'].apply(removeStopwords)
comments['Comment_clean_words'] = comments['Comment_clean_words'].apply(toSentence)

comments[['Comment', 'Comment_clean', 'Comment_clean_words']].head(3)

Unnamed: 0,Comment,Comment_clean,Comment_clean_words
13367,"Guys, lagu ini bukan tentang LGBT, tapi tentan...","Guys, lagu ini bukan tentang LGBT, tapi tentan...",guys lagu lgbt gadis muda yg beranjak dewasa h...
12665,Performance Videonya kaya memberitahu kita ten...,Performance Videonya kaya memberitahu kita ten...,performance videonya kaya memberitahu dampak b...
20990,Satu persatu member diberikan kesempatan buat...,Satu persatu member diberikan kesempatan buat ...,persatu member kesempatan menunjukan potensiny...


## Stemming

In [23]:
def stemmingText(text):  # Reducing a word to its base form that attaches to suffixes and prefixes or to the roots of words
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    words = text.split()

    # Applying stemming to each word in the list
    stemmed_words = [stemmer.stem(word) for word in words]

    # Combining the stemmed words
    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

In [24]:
# comments.loc[:, 'Comment_clean_words'] = comments['Comment_clean_words'].apply(stemmingText)

# comments[['Comment', 'Comment_clean', 'Comment_clean_words']].head(3)

- Proses stemming memakan waktu terlalu lama

## Slangwords

In [25]:
slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal", "plisss": "tolong", "bgttt": "banget", "indo": "indonesia", "bgtt": "banget", "ad": "ada", "rv": "redvelvet", "plis": "tolong", "pls": "tolong", "cr": "sumber", "cod": "bayar ditempat", "adlh": "adalah", "afaik": "as far as i know", "ahaha": "haha", "aj": "saja", "ajep-ajep": "dunia gemerlap", "ak": "saya", "akika": "aku", "akkoh": "aku", "akuwh": "aku", "alay": "norak", "alow": "halo", "ambilin": "ambilkan", "ancur": "hancur", "anjrit": "anjing", "anter": "antar", "ap2": "apa-apa", "apasih": "apa sih", "apes": "sial", "aps": "apa", "aq": "saya", "aquwh": "aku", "asbun": "asal bunyi", "aseekk": "asyik", "asekk": "asyik", "asem": "asam", "aspal": "asli tetapi palsu", "astul": "asal tulis", "ato": "atau", "au ah": "tidak mau tahu", "awak": "saya", "ay": "sayang", "ayank": "sayang", "b4": "sebelum", "bakalan": "akan", "bandes": "bantuan desa", "bangedh": "banget", "banpol": "bantuan polisi", "banpur": "bantuan tempur", "basbang": "basi", "bcanda": "bercanda", "bdg": "bandung", "begajulan": "nakal", "beliin": "belikan", "bencong": "banci", "bentar": "sebentar", "ber3": "bertiga", "beresin": "membereskan", "bete": "bosan", "beud": "banget", "bg": "abang", "bgmn": "bagaimana", "bgt": "banget", "bijimane": "bagaimana", "bintal": "bimbingan mental", "bkl": "akan", "bknnya": "bukannya", "blegug": "bodoh", "blh": "boleh", "bln": "bulan", "blum": "belum", "bnci": "benci", "bnran": "yang benar", "bodor": "lucu", "bokap": "ayah", "boker": "buang air besar", "bokis": "bohong", "boljug": "boleh juga", "bonek": "bocah nekat", "boyeh": "boleh", "br": "baru", "brg": "bareng", "bro": "saudara laki-laki", "bru": "baru", "bs": "bisa", "bsen": "bosan", "bt": "buat", "btw": "ngomong-ngomong", "buaya": "tidak setia", "bubbu": "tidur", "bubu": "tidur", "bumil": "ibu hamil", "bw": "bawa", "bwt": "buat", "byk": "banyak", "byrin": "bayarkan", "cabal": "sabar", "cadas": "keren", "calo": "makelar", "can": "belum", "capcus": "pergi", "caper": "cari perhatian", "ce": "cewek", "cekal": "cegah tangkal", "cemen": "penakut", "cengengesan": "tertawa", "cepet": "cepat", "cew": "cewek", "chuyunk": "sayang", "cimeng": "ganja", "cipika cipiki": "cium pipi kanan cium pipi kiri", "ciyh": "sih", "ckepp": "cakep", "ckp": "cakep", "cmiiw": "correct me if i'm wrong", "cmpur": "campur", "cong": "banci", "conlok": "cinta lokasi", "cowwyy": "maaf", "cp": "siapa", "cpe": "capek", "cppe": "capek", "cucok": "cocok", "cuex": "cuek", "cumi": "Cuma miscall", "cups": "culun", "curanmor": "pencurian kendaraan bermotor", "curcol": "curahan hati colongan", "cwek": "cewek", "cyin": "cinta", "d": "di", "dah": "deh", "dapet": "dapat", "de": "adik", "dek": "adik", "demen": "suka", "deyh": "deh", "dgn": "dengan", "diancurin": "dihancurkan", "dimaafin": "dimaafkan", "dimintak": "diminta", "disono": "di sana", "dket": "dekat", "dkk": "dan kawan-kawan", "dll": "dan lain-lain", "dlu": "dulu", "dngn": "dengan", "dodol": "bodoh", "doku": "uang", "dongs": "dong", "dpt": "dapat", "dri": "dari", "drmn": "darimana", "drtd": "dari tadi", "dst": "dan seterusnya", "dtg": "datang", "duh": "aduh", "duren": "durian", "ed": "edisi", "egp": "emang gue pikirin", "eke": "aku", "elu": "kamu", "emangnya": "memangnya", "emng": "memang", "endak": "tidak", "enggak": "tidak", "envy": "iri", "ex": "mantan", "fax": "facsimile", "fifo": "first in first out", "folbek": "follow back", "fyi": "sebagai informasi", "gaada": "tidak ada uang", "gag": "tidak", "gaje": "tidak jelas", "gak papa": "tidak apa-apa", "gan": "juragan", "gaptek": "gagap teknologi", "gatek": "gagap teknologi", "gawe": "kerja", "gbs": "tidak bisa", "gebetan": "orang yang disuka", "geje": "tidak jelas", "gepeng": "gelandangan dan pengemis", "ghiy": "lagi", "gile": "gila", "gimana": "bagaimana", "gino": "gigi nongol", "githu": "gitu", "gj": "tidak jelas", "gmana": "bagaimana", "gn": "begini", "goblok": "bodoh", "golput": "golongan putih", "gowes": "mengayuh sepeda", "gpny": "tidak punya", "gr": "gede rasa", "gretongan": "gratisan", "gtau": "tidak tahu", "gua": "saya", "guoblok": "goblok", "gw": "saya", "ha": "tertawa", "haha": "tertawa", "hallow": "halo", "hankam": "pertahanan dan keamanan", "hehe": "he", "helo": "halo", "hey": "hai", "hlm": "halaman", "hny": "hanya", "hoax": "isu bohong", "hr": "hari", "hrus": "harus", "hubdar": "perhubungan darat", "huff": "mengeluh", "hum": "rumah", "humz": "rumah", "ilang": "hilang", "ilfil": "tidak suka", "imho": "in my humble opinion", "imoetz": "imut", "item": "hitam", "itungan": "hitungan", "iye": "iya", "ja": "saja", "jadiin": "jadi", "jaim": "jaga image", "jayus": "tidak lucu", "jdi": "jadi", "jem": "jam", "jga": "juga", "jgnkan": "jangankan", "jir": "anjing", "jln": "jalan", "jomblo": "tidak punya pacar", "jubir": "juru bicara", "jutek": "galak", "k": "ke", "kab": "kabupaten", "kabor": "kabur", "kacrut": "kacau", "kadiv": "kepala divisi", "kagak": "tidak", "kalo": "kalau", "kampret": "sialan", "kamtibmas": "keamanan dan ketertiban masyarakat", "kamuwh": "kamu", "kanwil": "kantor wilayah", "karna": "karena", "kasubbag": "kepala subbagian", "katrok": "kampungan", "kayanya": "kayaknya", "kbr": "kabar", "kdu": "harus", "kec": "kecamatan", "kejurnas": "kejuaraan nasional", "kekeuh": "keras kepala", "kel": "kelurahan", "kemaren": "kemarin", "kepengen": "mau", "kepingin": "mau", "kepsek": "kepala sekolah", "kesbang": "kesatuan bangsa", "kesra": "kesejahteraan rakyat", "ketrima": "diterima", "kgiatan": "kegiatan", "kibul": "bohong", "kimpoi": "kawin", "kl": "kalau", "klianz": "kalian", "kloter": "kelompok terbang", "klw": "kalau", "km": "kamu", "kmps": "kampus", "kmrn": "kemarin", "knal": "kenal", "knp": "kenapa", "kodya": "kota madya", "komdis": "komisi disiplin", "komsov": "komunis sovyet", "kongkow": "kumpul bareng teman-teman", "kopdar": "kopi darat", "korup": "korupsi", "kpn": "kapan", "krenz": "keren", "krm": "kirim", "kt": "kita", "ktmu": "ketemu", "ktr": "kantor", "kuper": "kurang pergaulan", "kw": "imitasi", "kyk": "seperti", "la": "lah", "lam": "salam", "lamp": "lampiran", "lanud": "landasan udara", "latgab": "latihan gabungan", "lebay": "berlebihan", "leh": "boleh", "lelet": "lambat", "lemot": "lambat", "lgi": "lagi", "lgsg": "langsung", "liat": "lihat", "litbang": "penelitian dan pengembangan", "lmyn": "lumayan", "lo": "kamu", "loe": "kamu", "lola": "lambat berfikir", "louph": "cinta", "low": "kalau", "lp": "lupa", "luber": "langsung, umum, bebas, dan rahasia", "luchuw": "lucu", "lum": "belum", "luthu": "lucu", "lwn": "lawan", "maacih": "terima kasih", "mabal": "bolos", "macem": "macam", "macih": "masih", "maem": "makan", "magabut": "makan gaji buta", "maho": "homo", "mak jang": "kaget", "maksain": "memaksa", "malem": "malam", "mam": "makan", "maneh": "kamu", "maniez": "manis", "mao": "mau", "masukin": "masukkan", "melu": "ikut", "mepet": "dekat sekali", "mgu": "minggu", "migas": "minyak dan gas bumi", "mikol": "minuman beralkohol", "miras": "minuman keras", "mlah": "malah", "mngkn": "mungkin", "mo": "mau", "mokad": "mati", "moso": "masa", "mpe": "sampai", "msk": "masuk", "mslh": "masalah", "mt": "makan teman", "mubes": "musyawarah besar", "mulu": "melulu", "mumpung": "selagi", "munas": "musyawarah nasional", "muntaber": "muntah dan berak", "musti": "mesti", "muupz": "maaf", "mw": "now watching", "n": "dan", "nanam": "menanam", "nanya": "bertanya", "napa": "kenapa", "napi": "narapidana", "napza": "narkotika, alkohol, psikotropika, dan zat adiktif ", "narkoba": "narkotika, psikotropika, dan obat terlarang", "nasgor": "nasi goreng", "nda": "tidak", "ndiri": "sendiri", "ne": "ini", "nekolin": "neokolonialisme", "nembak": "menyatakan cinta", "ngabuburit": "menunggu berbuka puasa", "ngaku": "mengaku", "ngambil": "mengambil", "nganggur": "tidak punya pekerjaan", "ngapah": "kenapa", "ngaret": "terlambat", "ngasih": "memberikan", "ngebandel": "berbuat bandel", "ngegosip": "bergosip", "ngeklaim": "mengklaim", "ngeksis": "menjadi eksis", "ngeles": "berkilah", "ngelidur": "menggigau", "ngerampok": "merampok", "ngga": "tidak", "ngibul": "berbohong", "ngiler": "mau", "ngiri": "iri", "ngisiin": "mengisikan", "ngmng": "bicara", "ngomong": "bicara", "ngubek2": "mencari-cari", "ngurus": "mengurus", "nie": "ini", "nih": "ini", "niyh": "nih", "nmr": "nomor", "nntn": "nonton", "nobar": "nonton bareng", "np": "now playing", "ntar": "nanti", "ntn": "nonton", "numpuk": "bertumpuk", "nutupin": "menutupi", "nyari": "mencari", "nyekar": "menyekar", "nyicil": "mencicil", "nyoblos": "mencoblos", "nyokap": "ibu", "ogah": "tidak mau", "ol": "online", "ongkir": "ongkos kirim", "oot": "out of topic", "org2": "orang-orang", "ortu": "orang tua", "otda": "otonomi daerah", "otw": "on the way, sedang di jalan", "pacal": "pacar", "pake": "pakai", "pala": "kepala", "pansus": "panitia khusus", "parpol": "partai politik", "pasutri": "pasangan suami istri", "pd": "pada", "pede": "percaya diri", "pelatnas": "pemusatan latihan nasional", "pemda": "pemerintah daerah", "pemkot": "pemerintah kota", "pemred": "pemimpin redaksi", "penjas": "pendidikan jasmani", "perda": "peraturan daerah", "perhatiin": "perhatikan", "pesenan": "pesanan", "pgang": "pegang", "pi": "tapi", "pilkada": "pemilihan kepala daerah", "pisan": "sangat", "pk": "penjahat kelamin", "plg": "paling", "pmrnth": "pemerintah", "polantas": "polisi lalu lintas", "ponpes": "pondok pesantren", "pp": "pulang pergi", "prg": "pergi", "prnh": "pernah", "psen": "pesan", "pst": "pasti", "pswt": "pesawat", "pw": "posisi nyaman", "qmu": "kamu", "rakor": "rapat koordinasi", "ranmor": "kendaraan bermotor", "re": "reply", "ref": "referensi", "rehab": "rehabilitasi", "rempong": "sulit", "repp": "balas", "restik": "reserse narkotika", "rhs": "rahasia", "rmh": "rumah", "ru": "baru", "ruko": "rumah toko", "rusunawa": "rumah susun sewa", "ruz": "terus", "saia": "saya", "salting": "salah tingkah", "sampe": "sampai", "samsek": "sama sekali", "sapose": "siapa", "satpam": "satuan pengamanan", "sbb": "sebagai berikut", "sbh": "sebuah", "sbnrny": "sebenarnya", "scr": "secara", "sdgkn": "sedangkan", "sdkt": "sedikit", "se7": "setuju", "sebelas dua belas": "mirip", "sembako": "sembilan bahan pokok", "sempet": "sempat", "sendratari": "seni drama tari", "sgt": "sangat", "shg": "sehingga", "siech": "sih", "sikon": "situasi dan kondisi", "sinetron": "sinema elektronik", "siramin": "siramkan", "sj": "saja", "skalian": "sekalian", "sklh": "sekolah", "skt": "sakit", "slesai": "selesai", "sll": "selalu", "slma": "selama", "slsai": "selesai", "smpt": "sempat", "smw": "semua", "sndiri": "sendiri", "soljum": "sholat jumat", "songong": "sombong", "sory": "maaf", "sosek": "sosial-ekonomi", "sotoy": "sok tahu", "spa": "siapa", "sppa": "siapa", "spt": "seperti", "srtfkt": "sertifikat", "stiap": "setiap", "stlh": "setelah", "suk": "masuk", "sumpek": "sempit", "syg": "sayang", "t4": "tempat", "tajir": "kaya", "tau": "tahu", "taw": "tahu", "td": "tadi", "tdk": "tidak", "teh": "kakak perempuan", "telat": "terlambat", "telmi": "telat berpikir", "temen": "teman", "tengil": "menyebalkan", "tepar": "terkapar", "tggu": "tunggu", "tgu": "tunggu", "thankz": "terima kasih", "thn": "tahun", "tilang": "bukti pelanggaran", "tipiwan": "TvOne", "tks": "terima kasih", "tlp": "telepon", "tls": "tulis", "tmbah": "tambah", "tmen2": "teman-teman", "tmpah": "tumpah", "tmpt": "tempat", "tngu": "tunggu", "tnyta": "ternyata", "tokai": "tai", "toserba": "toko serba ada", "tpi": "tapi", "trdhulu": "terdahulu", "trima": "terima kasih", "trm": "terima", "trs": "terus", "trutama": "terutama", "ts": "penulis", "tst": "tahu sama tahu", "ttg": "tentang", "tuch": "tuh", "tuir": "tua", "tw": "tahu", "u": "kamu", "ud": "sudah", "udah": "sudah", "ujg": "ujung", "ul": "ulangan", "unyu": "lucu", "uplot": "unggah", "urang": "saya", "usah": "perlu", "utk": "untuk", "valas": "valuta asing", "w/": "dengan", "wadir": "wakil direktur", "wamil": "wajib militer", "warkop": "warung kopi", "warteg": "warung tegal", "wat": "buat", "wkt": "waktu", "wtf": "what the fuck", "xixixi": "tertawa", "ya": "iya", "yap": "iya", "yaudah": "ya sudah", "yawdah": "ya sudah", "yg": "yang", "yl": "yang lain", "yo": "iya", "yowes": "ya sudah", "yup": "iya", "7an": "tujuan", "ababil": "abg labil", "acc": "accord", "adlah": "adalah", "adoh": "aduh", "aha": "tertawa", "aing": "saya", "aja": "saja", "ajj": "saja", "aka": "dikenal juga sebagai", "akko": "aku", "akku": "aku", "akyu": "aku", "aljasa": "asal jadi saja", "ama": "sama", "ambl": "ambil", "anjir": "anjing", "ank": "anak", "ap": "apa", "apaan": "apa", "ape": "apa", "aplot": "unggah", "apva": "apa", "aqu": "aku", "asap": "sesegera mungkin", "aseek": "asyik", "asek": "asyik", "aseknya": "asyiknya", "asoy": "asyik", "astrojim": "astagfirullahaladzim", "ath": "kalau begitu", "atuh": "kalau begitu", "ava": "avatar", "aws": "awas", "ayang": "sayang", "ayok": "ayo", "bacot": "banyak bicara", "bales": "balas", "bangdes": "pembangunan desa", "bangkotan": "tua", "banpres": "bantuan presiden", "bansarkas": "bantuan sarana kesehatan", "bazis": "badan amal, zakat, infak, dan sedekah", "bcoz": "karena", "beb": "sayang", "bejibun": "banyak", "belom": "belum", "bener": "benar", "ber2": "berdua", "berdikari": "berdiri di atas kaki sendiri", "bet": "banget", "beti": "beda tipis", "beut": "banget", "bgd": "banget", "bgs": "bagus", "bhubu": "tidur", "bimbuluh": "bimbingan dan penyuluhan", "bisi": "kalau-kalau", "bkn": "bukan", "bl": "beli", "blg": "bilang", "blm": "belum", "bls": "balas", "bnchi": "benci", "bngung": "bingung", "bnyk": "banyak", "bohay": "badan aduhai", "bokep": "porno", "bokin": "pacar", "bole": "boleh", "bolot": "bodoh", "bonyok": "ayah ibu", "bpk": "bapak", "brb": "segera kembali", "brngkt": "berangkat", "brp": "berapa", "brur": "saudara laki-laki", "bsa": "bisa", "bsk": "besok", "bu_bu": "tidur", "bubarin": "bubarkan", "buber": "buka bersama", "bujubune": "luar biasa", "buser": "buru sergap", "bwhn": "bawahan", "byar": "bayar", "byr": "bayar", "c8": "chat", "cabut": "pergi", "caem": "cakep", "cama-cama": "sama-sama", "cangcut": "celana dalam", "cape": "capek", "caur": "jelek", "cekak": "tidak ada uang", "cekidot": "coba lihat", "cemplungin": "cemplungkan", "ceper": "pendek", "ceu": "kakak perempuan", "cewe": "cewek", "cibuk": "sibuk", "cin": "cinta", "ciye": "cie", "ckck": "ck", "clbk": "cinta lama bersemi kembali", "cmpr": "campur", "cnenk": "senang", "congor": "mulut", "cow": "cowok", "coz": "karena", "cpa": "siapa", "gokil": "gila", "gombal": "suka merayu", "gpl": "tidak pakai lama", "gpp": "tidak apa-apa", "gretong": "gratis", "gt": "begitu", "gtw": "tidak tahu", "gue": "saya", "guys": "teman-teman", "gws": "cepat sembuh", "haghaghag": "tertawa", "hakhak": "tertawa", "handak": "bahan peledak", "hansip": "pertahanan sipil", "hellow": "halo", "helow": "halo", "hi": "hai", "hlng": "hilang", "hnya": "hanya", "houm": "rumah", "hrs": "harus", "hubad": "hubungan angkatan darat", "hubla": "perhubungan laut", "huft": "mengeluh", "humas": "hubungan masyarakat", "idk": "saya tidak tahu", "ilfeel": "tidak suka", "imba": "jago sekali", "imoet": "imut", "info": "informasi", "itung": "hitung", "isengin": "bercanda", "iyala": "iya lah", "iyo": "iya", "jablay": "jarang dibelai", "jadul": "jaman dulu", "jancuk": "anjing", "jd": "jadi", "jdikan": "jadikan", "jg": "juga", "jgn": "jangan", "jijay": "jijik", "jnj": "janji", "jth": "jatuh", "jurdil": "jujur adil", "jwb": "jawab", "ka": "kakak", "kabag": "kepala bagian", "kacian": "kasihan", "kadit": "kepala direktorat", "kaga": "tidak", "kaka": "kakak", "kamtib": "keamanan dan ketertiban", "kamuh": "kamu", "kamyu": "kamu", "kapt": "kapten", "kasat": "kepala satuan", "kasubbid": "kepala subbidang", "kau": "kamu", "kbar": "kabar", "kcian": "kasihan", "keburu": "terlanjur", "kedubes": "kedutaan besar", "kek": "seperti", "keknya": "kayaknya", "keliatan": "kelihatan", "keneh": "masih", "kepikiran": "terpikirkan", "kepo": "mau tahu urusan orang", "kere": "tidak punya uang", "kesian": "kasihan", "ketauan": "ketahuan", "keukeuh": "keras kepala", "khan": "kan", "kibus": "kaki busuk", "kk": "kakak", "klian": "kalian", "klo": "kalau", "kluarga": "keluarga", "klwrga": "keluarga", "kmari": "kemari", "kmpus": "kampus", "kn": "kan", "knl": "kenal", "knpa": "kenapa", "kog": "kok", "kompi": "komputer", "komtiong": "komunis Tiongkok", "konjen": "konsulat jenderal", "koq": "kok", "kpd": "kepada", "kptsan": "keputusan", "krik": "garing", "krn": "karena", "ktauan": "ketahuan", "ktny": "katanya", "kudu": "harus", "kuq": "kok", "ky": "seperti", "kykny": "kayanya", "laka": "kecelakaan", "lambreta": "lambat", "lansia": "lanjut usia", "lapas": "lembaga pemasyarakatan", "lbur": "libur", "lekong": "laki-laki", "lg": "lagi", "lgkp": "lengkap", "lht": "lihat", "linmas": "perlindungan masyarakat", "lmyan": "lumayan", "lngkp": "lengkap", "loch": "loh", "lol": "tertawa", "lom": "belum", "loupz": "cinta", "lowh": "kamu", "lu": "kamu", "luchu": "lucu", "luff": "cinta", "luph": "cinta", "lw": "kamu", "lwt": "lewat", "maaciw": "terima kasih", "mabes": "markas besar", "macem-macem": "macam-macam", "madesu": "masa depan suram", "maen": "main", "mahatma": "maju sehat bersama", "mak": "ibu", "makasih": "terima kasih", "malah": "bahkan", "malu2in": "memalukan", "mamz": "makan", "manies": "manis", "mantep": "mantap", "markus": "makelar kasus", "mba": "mbak", "mending": "lebih baik", "mgkn": "mungkin", "mhn": "mohon", "miker": "minuman keras", "milis": "mailing list", "mksd": "maksud", "mls": "malas", "mnt": "minta", "moge": "motor gede", "mokat": "mati", "mosok": "masa", "msh": "masih", "mskpn": "meskipun", "msng2": "masing-masing", "muahal": "mahal", "muker": "musyawarah kerja", "mumet": "pusing", "muna": "munafik", "munaslub": "musyawarah nasional luar biasa", "musda": "musyawarah daerah", "muup": "maaf", "muuv": "maaf", "nal": "kenal", "nangis": "menangis", "naon": "apa", "napol": "narapidana politik", "naq": "anak", "narsis": "bangga pada diri sendiri", "nax": "anak", "ndak": "tidak", "ndut": "gendut", "nekolim": "neokolonialisme", "nelfon": "menelepon", "ngabis2in": "menghabiskan", "ngakak": "tertawa", "ngambek": "marah", "ngampus": "pergi ke kampus", "ngantri": "mengantri", "ngapain": "sedang apa", "ngaruh": "berpengaruh", "ngawur": "berbicara sembarangan", "ngeceng": "kumpul bareng-bareng", "ngeh": "sadar", "ngekos": "tinggal di kos", "ngelamar": "melamar", "ngeliat": "melihat", "ngemeng": "bicara terus-terusan", "ngerti": "mengerti", "nggak": "tidak", "ngikut": "ikut", "nginep": "menginap", "ngisi": "mengisi", "ngmg": "bicara", "ngocol": "lucu", "ngomongin": "membicarakan", "ngumpul": "berkumpul", "ni": "ini", "nyasar": "tersesat", "nyariin": "mencari", "nyiapin": "mempersiapkan", "nyiram": "menyiram", "nyok": "ayo", "o/": "oleh", "ok": "ok", "priksa": "periksa", "pro": "profesional", "psn": "pesan", "psti": "pasti", "puanas": "panas", "qmo": "kamu", "qt": "kita", "rame": "ramai", "raskin": "rakyat miskin", "red": "redaksi", "reg": "register", "rejeki": "rezeki", "renstra": "rencana strategis", "reskrim": "reserse kriminal", "sni": "sini", "somse": "sombong sekali", "sorry": "maaf", "sosbud": "sosial-budaya", "sospol": "sosial-politik", "sowry": "maaf", "spd": "sepeda", "sprti": "seperti", "spy": "supaya", "stelah": "setelah", "subbag": "subbagian", "sumbangin": "sumbangkan", "sy": "saya", "syp": "siapa", "tabanas": "tabungan pembangunan nasional", "tar": "nanti", "taun": "tahun", "tawh": "tahu", "tdi": "tadi", "te2p": "tetap", "tekor": "rugi", "telkom": "telekomunikasi", "telp": "telepon", "temen2": "teman-teman", "tengok": "menjenguk", "terbitin": "terbitkan", "tgl": "tanggal", "thanks": "terima kasih", "thd": "terhadap", "thx": "terima kasih", "tipi": "TV", "tkg": "tukang", "tll": "terlalu", "tlpn": "telepon", "tman": "teman", "tmbh": "tambah", "tmn2": "teman-teman", "tmph": "tumpah", "tnda": "tanda", "tnh": "tanah", "togel": "toto gelap", "tp": "tapi", "tq": "terima kasih", "trgntg": "tergantung", "trims": "terima kasih", "cb": "coba", "y": "ya", "munfik": "munafik", "reklamuk": "reklamasi", "sma": "sama", "tren": "trend", "ngehe": "kesal", "mz": "mas", "analisise": "analisis", "sadaar": "sadar", "sept": "september", "nmenarik": "menarik", "zonk": "bodoh", "rights": "benar", "simiskin": "miskin", "ngumpet": "sembunyi", "hardcore": "keras", "akhirx": "akhirnya", "solve": "solusi", "watuk": "batuk", "ngebully": "intimidasi", "masy": "masyarakat", "still": "masih", "tauk": "tahu", "mbual": "bual", "tioghoa": "tionghoa", "ngentotin": "senggama", "kentot": "senggama", "faktakta": "fakta", "sohib": "teman", "rubahnn": "rubah", "trlalu": "terlalu", "nyela": "cela", "heters": "pembenci", "nyembah": "sembah", "most": "paling", "ikon": "lambang", "light": "terang", "pndukung": "pendukung", "setting": "atur", "seting": "akting", "next": "lanjut", "waspadalah": "waspada", "gantengsaya": "ganteng", "parte": "partai", "nyerang": "serang", "nipu": "tipu", "ktipu": "tipu", "jentelmen": "berani", "buangbuang": "buang", "tsangka": "tersangka", "kurng": "kurang", "ista": "nista", "less": "kurang", "koar": "teriak", "paranoid": "takut", "problem": "masalah", "tahi": "kotoran", "tirani": "tiran", "tilep": "tilap", "happy": "bahagia", "tak": "tidak", "penertiban": "tertib", "uasai": "kuasa", "mnolak": "tolak", "trending": "trend", "taik": "tahi", "wkwk": "tertawa", "wkwkwk": "tertawa", "wkwkwkwk": "tertawa", "mgkin": "mungkin"}

slang_df = pd.DataFrame(list(slangwords.items()), columns=["slang", "fix"])
file_slang_fix = "indonesian_slangwords_fix.csv"
slang_df.to_csv(file_slang_fix, index=False)

print(f"Slang words saved to {file_slang_fix}")

Slang words saved to indonesian_slangwords_fix.csv


## Fix Slangwords

In [26]:
def fixSlangwords(text):
    slang_df = pd.read_csv(file_slang_fix)
    slangwords = dict(zip(slang_df['slang'], slang_df['fix']))
    
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

In [27]:
# Test usage
text_with_slang = "wkwk mgkin akku memang tajir bgt sampe bs ngefans jkt48 10 tahun"
fixed_text = fixSlangwords(text_with_slang)
print(f"{text_with_slang} -> {fixed_text}")

wkwk mgkin akku memang tajir bgt sampe bs ngefans jkt48 10 tahun -> tertawa mungkin aku memang kaya banget sampai bisa ngefans jkt48 10 tahun


In [28]:
comments['Comment_clean_words'] = comments['Comment_clean_words'].apply(fixSlangwords)

comments[['Comment', 'Comment_clean', 'Comment_clean_words']].head(3)

Unnamed: 0,Comment,Comment_clean,Comment_clean_words
13367,"Guys, lagu ini bukan tentang LGBT, tapi tentan...","Guys, lagu ini bukan tentang LGBT, tapi tentan...",teman-teman lagu lgbt gadis muda yang beranjak...
12665,Performance Videonya kaya memberitahu kita ten...,Performance Videonya kaya memberitahu kita ten...,performance videonya kaya memberitahu dampak b...
20990,Satu persatu member diberikan kesempatan buat...,Satu persatu member diberikan kesempatan buat ...,persatu member kesempatan menunjukan potensiny...


## Fix Common Typos

In [29]:
def fixCommonTypos(text):
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # Reduces three or more of the same character to one
    return text

In [30]:
print(fixCommonTypos("bagussss bangeeetttttt"))
print(fixCommonTypos("kereeeennn"))
print(fixCommonTypos("kerenn"))

bagus banget
keren
kerenn


- ya, fungsi ini tidak sempurna, but at least we tried to minimize typos

In [31]:
comments['Comment_clean_words'] = comments['Comment_clean_words'].apply(fixCommonTypos)

# Data Labelling

## Labelling with Lexicon

In [32]:
# Fetch ID-OpinionWords lexicon
def fetch_word_list(url):
    word_list = set()
    response = requests.get(url)
    if response.status_code == 200:
        reader = csv.reader(StringIO(response.text))
        for row in reader:
            word_list.add(row[0])  # Add the word (row[0]) to the set
    else:
        print(f"Failed to fetch word list from {url}")
    return word_list

# URLs for ID-OpinionWords lexicons
positive_url = "https://raw.githubusercontent.com/masdevid/ID-OpinionWords/refs/heads/master/positive.txt"
negative_url = "https://raw.githubusercontent.com/masdevid/ID-OpinionWords/refs/heads/master/negative.txt"

# Load lexicons
lexicon_positive = fetch_word_list(positive_url)
lexicon_negative = fetch_word_list(negative_url)

# Create lexicon positive in English
lexicon_positive_en = set([
    'admire',
    'amazing',
    'appreciate',
    'awesome',
    'best',
    'brave',
    'congratulations',
    'congrats',
    'enjoy',
    'energic',
    'excellent',
    'fantastic',
    'fun',
    'good',
    'great',
    'happy',
    'hot',
    'love',
    'new',
    'nice',
    'satisfied',
    'sexy',
    'success',
    'superb',
    'wonderful',
])

# Create lexicon negative in English
lexicon_negative_en = set([
    'angry',
    'awful',
    'bad',
    'disappointing',
    'failed',
    'hate',
    'horrible',
    'lies',
    'lost',
    'nasty',
    'pathetic',
    'regret',
    'sad',
    'sin',
    'shame',
    'stupid',
    'terrible',
    'ugly',
    'worst',
])

lexicon_positive.update(lexicon_positive_en)
lexicon_negative.update(lexicon_negative_en)

In [33]:
sorted(lexicon_positive)

['Dukung',
 'Kualitas terbaik',
 'Lebih memilih',
 'Penghargaan',
 'Plus',
 'Selamat',
 'TOP',
 'WOW',
 'Whooooa',
 'Wow',
 'a+',
 'acungan jempol',
 'adaptif',
 'adil',
 'admire',
 'afinitas',
 'afirmasi',
 'agilely',
 'agung',
 'ahli',
 'ahlinya',
 'ajaib',
 'aklamasi',
 'akomodatif',
 'akurat',
 'alam mimpi',
 'alhamdulillah',
 'allahu akbar',
 'altruistis',
 'aman',
 'amanah',
 'amat',
 'amazing',
 'ambisius',
 'andal',
 'aneh',
 'anggun',
 'angin sepoi-sepoi',
 'angkat',
 'antusias',
 'antusiasme',
 'apik',
 'appreciate',
 'apresiasi',
 'asli',
 'aspirasi',
 'asyik',
 'awesome',
 'bagos',
 'bagus',
 'bahagia',
 'baik',
 'baik diposisikan',
 'baik sekali',
 'baik-baik',
 'bakat',
 'bangga',
 'bantuan',
 'banyak',
 'banyak akal',
 'barang baru',
 'batu permata',
 'bebas',
 'bebas masalah',
 'bebas pulsa',
 'bebas rasa sakit',
 'bebas resiko',
 'bekerja',
 'bekerja keras',
 'belas kasihan',
 'benar',
 'benar-benar',
 'bengal',
 'beradaptasi',
 'beralasan',
 'berani',
 'berapi',
 'ber

In [34]:
sorted(lexicon_negative)

['Bingung',
 'Gejala',
 'Iluminati',
 'Iritasi',
 'KEBINGUNGAN',
 'Keheranan',
 'Keluhan',
 'Kerugian',
 'Ketidaktelitian',
 'Maaf',
 'Membantah',
 'Mengutuk',
 'PHK',
 'Pembobolan',
 'Radikal',
 'Salah',
 'Serangan',
 'Tuduhan',
 'Tukang onar',
 'Tumbang',
 'Tunggul',
 'Yahudi',
 'abnormal',
 'absurd',
 'acak',
 'acak-acakan',
 'acuh',
 'acuh tak acuh',
 'adiktif',
 'adil',
 'agresi',
 'agresif',
 'agresor',
 'aib',
 'air terjun',
 'akurat',
 'alarm',
 'alasan',
 'alat permainan',
 'alergi',
 'alergik',
 'amat ketakutan',
 'amat panas',
 'ambigu',
 'ambivalen',
 'ambivalensi',
 'amoral',
 'amoralitas',
 'ampun',
 'amuk',
 'anak nakal',
 'anak yatim',
 'anarki',
 'anarkis',
 'anarkisme',
 'ancaman',
 'aneh',
 'aneh lagi',
 'anehnya',
 'angkuh',
 'angriness',
 'angry',
 'anjing',
 'anjlok',
 'anomali',
 'antagonis',
 'antagonisme',
 'antek',
 'anti-',
 'anti-Amerika',
 'anti-Israel',
 'anti-Semit',
 'anti-kita',
 'anti-pendudukan',
 'anti-proliferasi',
 'anti-putih',
 'antipati',
 'anti

In [35]:
# Function for sentiment analysis using ID-OpinionWords
def analyze_sentiment(text):
    words = text.split()
    score = 0

    for word in words:
        if word in lexicon_positive:
            score += 1  # Increment count for positive words
        elif word in lexicon_negative:
            score -= 1  # Increment count for negative words

    # Determine sentiment based on counts
    if score > 0:
        sentiment = "positive"
    elif score < 0:
        sentiment = "negative"
    else:
        sentiment = "neutral"  # Both counts are zero

    return {
        "Sentiment_score": score,  # Simple score based on counts
        "Sentiment": sentiment
    }

In [36]:
example_text = "Gila konsep MV nya keren banget, good job JKT48"
result = analyze_sentiment(example_text)
print(result)

{'Sentiment_score': 2, 'Sentiment': 'positive'}


In [37]:
example_text = "jelek banget videonya"
result = analyze_sentiment(example_text)
print(result)

{'Sentiment_score': -1, 'Sentiment': 'negative'}


In [38]:
example_text = "astaghfirullah mantab"
result = analyze_sentiment(example_text)
print(result)

{'Sentiment_score': 0, 'Sentiment': 'neutral'}


In [39]:
# Apply sentiment analysis function to the 'Comment_clean_words' column with progress bar
tqdm.pandas(desc="Analyzing Sentiment")
comments[['Sentiment_score', 'Sentiment']] = comments['Comment_clean_words'].progress_apply(analyze_sentiment).apply(pd.Series)

Analyzing Sentiment: 100%|██████████| 11650/11650 [00:00<00:00, 322869.01it/s]


In [40]:
comments[['Comment', 'Sentiment_score', 'Sentiment']].head(10)

Unnamed: 0,Comment,Sentiment_score,Sentiment
13367,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",-3,negative
12665,Performance Videonya kaya memberitahu kita ten...,4,positive
20990,Satu persatu member diberikan kesempatan buat...,3,positive
16159,"fiks, kalau kedepan jkt48 release single MVnya...",2,positive
2359,Malam ini rahasia ya\nKamu tak boleh bilang si...,-5,negative
6792,"Terlepas dari kontroversi yang ada, sejujurnya...",2,positive
21119,Terlepas dari hate comen 18+. Jujur ini suatu ...,2,positive
5083,"Gila konsep MV nya keren banget, good job JKT48",1,positive
20771,Congrats JKT48 NEW ERA atas mini albumnya. JKT...,2,positive
1882,"buay yg blg lesbi itu salah besar ya, ini tuh ...",4,positive


## Check Results

In [41]:
# Check Comments with Positive Sentiment
positive_comments = comments[comments['Sentiment'] == 'positive']
positive_comments[['Comment', 'Comment_clean_words', 'Sentiment_score', 'Sentiment']].head(10)

Unnamed: 0,Comment,Comment_clean_words,Sentiment_score,Sentiment
12665,Performance Videonya kaya memberitahu kita ten...,performance videonya kaya memberitahu dampak b...,4,positive
20990,Satu persatu member diberikan kesempatan buat...,persatu member kesempatan menunjukan potensiny...,3,positive
16159,"fiks, kalau kedepan jkt48 release single MVnya...",fiks kedepan jkt48 release single mvnya kostum...,2,positive
6792,"Terlepas dari kontroversi yang ada, sejujurnya...",terlepas kontroversi sejujurnya lagu represent...,2,positive
21119,Terlepas dari hate comen 18+. Jujur ini suatu ...,terlepas hate comen jujur kemajuan banget jkt4...,2,positive
5083,"Gila konsep MV nya keren banget, good job JKT48",gila konsep mv keren banget good job jkt48,1,positive
20771,Congrats JKT48 NEW ERA atas mini albumnya. JKT...,congrats jkt48 new era mini albumnya jkt48 jay...,2,positive
1882,"buay yg blg lesbi itu salah besar ya, ini tuh ...",buay yang bilang lesbi salah tuh makna lumayan...,4,positive
3122,Gila sih ini konsepnya keren banget. 😮💕\n\nMaj...,gila konsepnya keren banget maju jkt48,1,positive
19760,Kekuatan JKT48 memang ada di performance. Kali...,kekuatan jkt48 performance keren banget umur j...,1,positive


In [42]:
# Check Comments with Neutral Sentiment
neutral_comments = comments[comments['Sentiment'] == 'neutral']
neutral_comments[['Comment', 'Comment_clean_words', 'Sentiment_score', 'Sentiment']].head(10)

Unnamed: 0,Comment,Comment_clean_words,Sentiment_score,Sentiment
4759,"Kathirna, Freya, Marsha, Muthe, Ashel.. bersin...",kathirna freya marsha muthe ashel bersinarlah ...,0,neutral
11890,Wow rekor performa Youtube JKT48 pecah semua s...,wow rekor performa youtube jkt48 pecah perform...,0,neutral
17037,"parah sih ini, aransemen musiknya cakep banget...",parah aransemen musiknya cakep banget mvnya me...,0,neutral
5917,"Ayo guys naikin lagi views nyaa, ini ga kalah ...",ayo teman-teman naikin views nyaa kalah bagus ...,0,neutral
12745,Beauty Shoot Moment\n\n2:47 Muthe\n2:50 Kathri...,beauty shoot moment muthe kathrina freya ashel...,0,neutral
17427,Musiknya broo kelass banget... Konsepnya keree...,musiknya broo kelass banget konsepnya kereen b...,0,neutral
9689,"bangga sama kerja keras member dan staff juga,...",bangga kerja keras member staff congrats sudah...,0,neutral
16396,Gila keren bgt om JOT idenya Semoga JKT48 jaya...,gila keren banget om jot idenya semoga jkt48 j...,0,neutral
12790,Jkt48 is getting more and more extraordinary e...,jkt48 getting extraordinary every day,0,neutral
16854,Terlepas dari pro kontra yg ada lagunya emang ...,terlepas profesional kontra yang lagunya emang...,0,neutral


In [43]:
# Check Comments with Negative Sentiment
negative_comments = comments[comments['Sentiment'] == 'negative']
negative_comments[['Comment', 'Comment_clean_words', 'Sentiment_score', 'Sentiment']].head(10)

Unnamed: 0,Comment,Comment_clean_words,Sentiment_score,Sentiment
13367,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",teman-teman lagu lgbt gadis muda yang beranjak...,-3,negative
2359,Malam ini rahasia ya\nKamu tak boleh bilang si...,malam rahasia bilang siapasiapa rahasia ah cah...,-5,negative
11341,"Gila keren parah udah trending 1 aja, dan tren...",gila keren parah sudah trend saja trend dikate...,-1,negative
2061,Malam ini rahasia ya\nKamu tak boleh bilang si...,malam rahasia bilang siapasiapa rahasia ah cah...,-3,negative
18350,Waw!! Lagunya bener-bener plot twist. Gokil ba...,waw lagunya benerbener plot twist gila banget ...,-2,negative
17953,"Diluar semua kontroversi, big applause untuk a...",diluar kontroversi big applause acting member,-1,negative
5935,Malam ini rahasia ya\nKamu tak boleh bilang si...,malam rahasia bilang siapasiapa rahasia ah cah...,-3,negative
5422,"Perbanyak mv masterpiece seperti ini , komplek...",perbanyak mv masterpiece kompleks banget mv,-1,negative
19263,"Congratulations JKT48, bner bner di luar dugaa...",congratulations jkt48 bner bner dugaan banget ...,-1,negative
8125,"Selalu merinding, bangga banget sama kerja ker...",merinding bangga banget kerja keras member sta...,-2,negative


- Secara konteks mungkin terlihat ada beberapa yang miss karena metode lexicon hanya mencocokkan dan menghitung jumlah kata positif dan negatif

In [44]:
comments['Sentiment'].value_counts()

Sentiment
neutral     5872
positive    4498
negative    1280
Name: count, dtype: int64

## Save to CSV

In [45]:
output_path = file_path.replace(".csv", "_labeled_lexicon.csv")
comments.to_csv(output_path, index=False)

print(f"Sentiment-labeled dataset saved to {output_path}")

Sentiment-labeled dataset saved to dataset/oshibe_spv_comments_2025-01-15_labeled_lexicon.csv
