# Import Libraries

In [1]:
import pandas as pd
import re
from IPython.display import clear_output
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Load Dataset

In [2]:
dir_ = "dataset/"
file_path = dir_ + 'oshibe_spv_comments_2025-01-15.csv'
df = pd.read_csv(file_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23008 entries, 0 to 23007
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          23008 non-null  object 
 1   ParentID    11091 non-null  object 
 2   Timestamp   23008 non-null  object 
 3   Username    23008 non-null  object 
 4   Comment     23005 non-null  object 
 5   LikeCount   23008 non-null  int64  
 6   ReplyCount  11917 non-null  float64
 7   Date        23008 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ MB


In [4]:
# Convert data type of 'ReplyCount' to integer
df['ReplyCount'] = pd.to_numeric(df['ReplyCount'], errors='coerce').astype('Int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23008 entries, 0 to 23007
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          23008 non-null  object
 1   ParentID    11091 non-null  object
 2   Timestamp   23008 non-null  object
 3   Username    23008 non-null  object
 4   Comment     23005 non-null  object
 5   LikeCount   23008 non-null  int64 
 6   ReplyCount  11917 non-null  Int64 
 7   Date        23008 non-null  object
dtypes: Int64(1), int64(1), object(6)
memory usage: 1.4+ MB


In [5]:
df[df['Comment'].isnull()]

Unnamed: 0,ID,ParentID,Timestamp,Username,Comment,LikeCount,ReplyCount,Date
301,UgwUen0WTAZIqnC6hKJ4AaABAg,,2024-10-24T02:01:16Z,@ehasitijulaeha2522,,1,0,2024-10-24T02:01:16Z
584,UgwYhw0GzhZZSOTA8il4AaABAg,,2024-08-30T14:41:09Z,@stepme,,0,0,2024-08-30T14:41:09Z
18679,UgyQVu755DqSxX5PTIV4AaABAg,,2023-03-13T20:08:42Z,@ghandithesupremeleader9740,,4,0,2023-03-13T20:08:42Z


- Terdapat 3 baris dengan komentar kosong (kemungkinan hanya berisi karakter yang tidak berhasil di-encode di api call), ketiga baris ini bisa dihapus saja

In [6]:
df.dropna(subset=['Comment'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23005 entries, 0 to 23007
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          23005 non-null  object
 1   ParentID    11091 non-null  object
 2   Timestamp   23005 non-null  object
 3   Username    23005 non-null  object
 4   Comment     23005 non-null  object
 5   LikeCount   23005 non-null  int64 
 6   ReplyCount  11914 non-null  Int64 
 7   Date        23005 non-null  object
dtypes: Int64(1), int64(1), object(6)
memory usage: 1.6+ MB


In [7]:
# Check Top Comments
top_level_comments = df[df['ParentID'].isnull()].sort_values(by=['LikeCount', 'ReplyCount'], ascending=False)
top_level_comments[['Username', 'Comment', 'LikeCount', 'ReplyCount', 'Date']]

Unnamed: 0,Username,Comment,LikeCount,ReplyCount,Date
14134,@onthebluesky,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",19401,751,2023-03-14T11:13:13Z
13411,@driezkh,Performance Videonya kaya memberitahu kita ten...,2319,70,2023-03-20T17:56:27Z
21823,@adanjir1923,Satu persatu member diberikan kesempatan buat...,1885,63,2023-03-13T13:16:02Z
16959,@Jkt48990,"fiks, kalau kedepan jkt48 release single MVnya...",1863,80,2023-03-14T05:24:15Z
2393,@ahmadfikri5186,Malam ini rahasia ya\nKamu tak boleh bilang si...,967,13,2023-10-21T18:45:55Z
...,...,...,...,...,...
22930,@rizalfahri6435,"Apakah Shani jadi center lagi, ataukah dipanta...",0,0,2023-03-13T09:45:54Z
22942,@isnanyusuf3575,infokan,0,0,2023-03-13T09:36:15Z
22988,@johanafandi11,Nitip,0,0,2023-03-13T08:57:49Z
22997,@fahmiaditakurnia3734,ninggalin jejak,0,0,2023-03-13T08:51:32Z


In [8]:
# Check Replies
replies = df[df['ParentID'].notnull()]
replies[['ParentID', 'Comment', 'LikeCount']]

Unnamed: 0,ParentID,Comment,LikeCount
10,UgxCK8DSLpRl2ZWP6pp4AaABAg,"Ini bukan lgbt, ini menceritakan tentang salah...",1
11,UgxCK8DSLpRl2ZWP6pp4AaABAg,@Christyyyy-bt5ps guru gembul aja bilang ini ...,0
14,UgyO59JOKmUo-5QjmLB4AaABAg,Jelas sekali bapak nya ga ngerti lagunya artin...,0
15,UgyO59JOKmUo-5QjmLB4AaABAg,"@Melvinbryanchiri iya si ya, emang liriknya aj...",1
22,Ugwb_ySgHtkmV4N2rzt4AaABAg,Kali ini bakal dari komunitas atau pengikut da...,0
...,...,...,...
22990,UgwbfBy7tSP4XWpmQjx4AaABAg,Tolong hapus emot biar trending,0
22991,UgwbfBy7tSP4XWpmQjx4AaABAg,@@AbdulSalam-xe2cq kenapa woy 😑😑,0
22995,Ugwxd5VGdiMxfkC4Ck14AaABAg,Dh lewat ngav kwkw,0
23002,UgxO7uT6vEj69cqUJNd4AaABAg,Kita sama banget,0


- Sentiment analysis hanya dilakukan pada top_level_comments (tidak termasuk replies/balasan komentar), karena top_level_comments inilah yang ditujukan untuk videonya

In [10]:
# Function to clean text by removing non-ASCII characters and reducing multiple spaces
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    return text.strip()

# Apply the function to the 'Comment' column
top_level_comments['Comment_clean'] = top_level_comments['Comment'].apply(clean_text)
top_level_comments[['Comment', 'Comment_clean']].head(15)

Unnamed: 0,Comment,Comment_clean
14134,"Guys, lagu ini bukan tentang LGBT, tapi tentan...","Guys, lagu ini bukan tentang LGBT, tapi tentan..."
13411,Performance Videonya kaya memberitahu kita ten...,Performance Videonya kaya memberitahu kita ten...
21823,Satu persatu member diberikan kesempatan buat...,Satu persatu member diberikan kesempatan buat ...
16959,"fiks, kalau kedepan jkt48 release single MVnya...","fiks, kalau kedepan jkt48 release single MVnya..."
2393,Malam ini rahasia ya\nKamu tak boleh bilang si...,Malam ini rahasia ya Kamu tak boleh bilang sia...
7362,"Terlepas dari kontroversi yang ada, sejujurnya...","Terlepas dari kontroversi yang ada, sejujurnya..."
21953,Terlepas dari hate comen 18+. Jujur ini suatu ...,Terlepas dari hate comen 18+. Jujur ini suatu ...
5608,"Gila konsep MV nya keren banget, good job JKT48","Gila konsep MV nya keren banget, good job JKT48"
21601,Congrats JKT48 NEW ERA atas mini albumnya. JKT...,Congrats JKT48 NEW ERA atas mini albumnya. JKT...
1897,"buay yg blg lesbi itu salah besar ya, ini tuh ...","buay yg blg lesbi itu salah besar ya, ini tuh ..."


In [11]:
# Calculate the size of each comment and create a new column 'Comment_size'
top_level_comments['Comment_size'] = top_level_comments['Comment'].apply(len)
top_level_comments['Comment_size'].describe()

count    11914.000000
mean        64.357059
std        136.256981
min          1.000000
25%         17.000000
50%         31.000000
75%         65.000000
max       4545.000000
Name: Comment_size, dtype: float64

In [12]:
# Filter for rows where Comment_size is greater than 512
long_comments = top_level_comments[top_level_comments['Comment_size'] > 512]
len(long_comments)

122

- Terdapat 122 komentar yang akan dilabeli secara manual atau dengan bantuan genAI karena size nya melebihi limit yang bisa dihandle pre-trained model yang akan digunakan dalam data labelling

# Data Labelling with pre-trained BERT Model

In [13]:
# Load the model and tokenizer
pretrained = "mdhugol/indonesia-bert-sentiment-classification"
model = AutoModelForSequenceClassification.from_pretrained(pretrained)
tokenizer = AutoTokenizer.from_pretrained(pretrained)
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Label mapping
label_index = {'LABEL_0': 'positive', 'LABEL_1': 'neutral', 'LABEL_2': 'negative'}

# Function to analyze sentiment
def analyze_sentiment(text):
    # Skip analysis if text length is greater than 512 characters
    if len(text) > 512:
        return "", 0

    # Perform sentiment analysis for comments within the limit
    result = sentiment_analysis(text)
    label = label_index[result[0]['label']]
    score = result[0]['score']
    
    return label, score

# Apply sentiment analysis with progress bar
tqdm.pandas(desc="Analyzing Sentiment")
top_level_comments[['Sentiment', 'Confidence']] = top_level_comments['Comment_clean'].progress_apply(analyze_sentiment).apply(pd.Series)


Device set to use cpu
Analyzing Sentiment: 100%|██████████| 11914/11914 [30:35<00:00,  6.49it/s] 


In [14]:
top_level_comments[['Comment', 'Sentiment', 'Confidence']].head(10)

Unnamed: 0,Comment,Sentiment,Confidence
14134,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",,0.0
13411,Performance Videonya kaya memberitahu kita ten...,,0.0
21823,Satu persatu member diberikan kesempatan buat...,positive,0.972226
16959,"fiks, kalau kedepan jkt48 release single MVnya...",positive,0.996596
2393,Malam ini rahasia ya\nKamu tak boleh bilang si...,,0.0
7362,"Terlepas dari kontroversi yang ada, sejujurnya...",,0.0
21953,Terlepas dari hate comen 18+. Jujur ini suatu ...,positive,0.994971
5608,"Gila konsep MV nya keren banget, good job JKT48",positive,0.996474
21601,Congrats JKT48 NEW ERA atas mini albumnya. JKT...,positive,0.574264
1897,"buay yg blg lesbi itu salah besar ya, ini tuh ...",,0.0


- Masih terdapat komentar dengan sentiment yang belum terisi (sengaja dilewati karena melebihi batas panjang karakter untuk pre-trained model yang digunakan)
- Sentiment yang masih kosong ini akan diisi secara manual

In [15]:
# Check Comments with Neutral Sentiment
neutral_comments = top_level_comments[top_level_comments['Sentiment'] == 'neutral']
neutral_comments[['Comment', 'Sentiment', 'Confidence']].head(10)

Unnamed: 0,Comment,Sentiment,Confidence
16392,Ini lagu ttg pelajaran IPA. Benang sari (janta...,neutral,0.434662
20156,Three words for this new special performance i...,neutral,0.982986
21908,BRAVO JKT48 NEW ERA!!!,neutral,0.968161
19226,Akhirnya ada mini album lagi setelah sekian la...,neutral,0.773497
13491,Beauty Shoot Moment\n\n2:47 Muthe\n2:50 Kathri...,neutral,0.988937
20346,JKT48 new era\nNew konsep\nNew aransemen\nCent...,neutral,0.530215
7532,Saatnya bahas vokal: di sini harmonisasi merek...,neutral,0.629304
13538,Jkt48 is getting more and more extraordinary e...,neutral,0.695949
17181,Alhamdulillah trending ... Yuk bisa lebih lagi...,neutral,0.736706
18038,진짜 멋있다 JKT48 미니앨범 축하해,neutral,0.660809


In [16]:
# Check Comments with Negative Sentiment
negative_comments = top_level_comments[top_level_comments['Sentiment'] == 'negative']
negative_comments[['Comment', 'Sentiment', 'Confidence']].head(10)

Unnamed: 0,Comment,Sentiment,Confidence
15824,"Ini jujur, TERLALU INDAH. bagus banget JOT! Da...",negative,0.765288
12060,"Gila keren parah udah trending 1 aja, dan tren...",negative,0.981162
7764,Kalo dari judul dan lirik sendiri sebenarnya i...,negative,0.939781
18771,"Diluar semua kontroversi, big applause untuk a...",negative,0.775857
5143,Media bilang ini single baru padahal cuman spe...,negative,0.988137
5959,"Perbanyak mv masterpiece seperti ini , komplek...",negative,0.508472
7730,Makin lama makin di dengar makin bgus lebih masuk,negative,0.564185
19458,"Woaaahh kaget dong, keren banget ini mah. Sema...",negative,0.732071
6589,JKT48 KOK JADI KEREN GINI WOY???! DARI MANA SA...,negative,0.989798
10234,Ampunn JKT48 SEKARANG SEBAGUS INI 😍,negative,0.995161


In [17]:
# Function to label sentiment manually for long comments
def label_sentiment(long_comments):
    total_comments = len(long_comments)
    i = 1
    for index, row in long_comments.iterrows():
        text = row['Comment_clean']
        while True:
            # Clear the previous outputs
            clear_output(wait=False)
            
            # Display the current progress
            print(f"Labeling comment {i}/{total_comments}:")
            print(f"Comment: \n{text}")
            sentiment = input("Please enter 1 for positive, 0 for neutral, -1 for negative, then hit ENTER: ")
            print(f"=========================================")
            
            # Convert input to integer and handle possible errors
            try:
                sentiment = int(sentiment)
                # Map input to label
                if sentiment == 1:
                    top_level_comments.at[index, 'Sentiment'] = "positive"
                    top_level_comments.at[index, 'Confidence'] = 0
                    i += 1
                    break
                elif sentiment == 0:
                    top_level_comments.at[index, 'Sentiment'] = "neutral"
                    top_level_comments.at[index, 'Confidence'] = 0
                    i += 1
                    break
                elif sentiment == -1:
                    top_level_comments.at[index, 'Sentiment'] = "negative"
                    top_level_comments.at[index, 'Confidence'] = 0
                    i += 1
                    break
                else:
                    print("Invalid input. Please enter 1, 0, or -1.")
            except ValueError:
                print("Invalid input. Please enter a numeric value (1, 0, or -1).")

# Call the labeling function for long comments
label_sentiment(long_comments)

Labeling comment 122/122:
Comment: 
Malam ini rahasia ya Kamu tak boleh bilang siapa-siapa Datang ke sini juga rahasia Terus sekarang mau apa? Ah, di cahaya bulan (Misterius) Ah, awan menghiasi (Mengundang) Ayo ke sini Apakah yang kau mau? Ke taman bunga para gadis Rasa madu adalah rahasianya Ya (Ya) Ini janji yang terlarang Ketakutan (Juga was-was) Terasa, 'kan? (Ya, terasa) Permainan berbahaya Tiba-tiba saling memandang Kenapa? (Kenapa?) Hanya terdiam saja, kah? Tangan yang (Diulurkan) Bersentuhan (Kita bagai) Sari bunga dan kupu-kupu malam Aku tak biasa punya rahasia Nanti pasti 'kan dimarahi mama Rahasia bukanlah masalah Nikmati sajalah saat ini Ah, bunga yang tak pernah (Aku lihat) Ah, aromanya itu (Manisnya) Jangan menggodaku Akan aku ajari Di taman bunga para gadis Cinta itu 'kan selalu immoral Hei (Hei) Bibir yang mulai mendekat Tidak boleh (Ah, boleh lah) Jangan ah (Ih, lucunya) Kita telah melewati Batasnya dan saling mencinta Sudah (Sudah) Nafas pun menjadi panjang Panas seka

In [18]:
top_level_comments[['Comment', 'Sentiment', 'Confidence']].head()

Unnamed: 0,Comment,Sentiment,Confidence
14134,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",positive,0.0
13411,Performance Videonya kaya memberitahu kita ten...,positive,0.0
21823,Satu persatu member diberikan kesempatan buat...,positive,0.972226
16959,"fiks, kalau kedepan jkt48 release single MVnya...",positive,0.996596
2393,Malam ini rahasia ya\nKamu tak boleh bilang si...,positive,0.0


In [19]:
top_level_comments[top_level_comments['Sentiment'].isnull()]

Unnamed: 0,ID,ParentID,Timestamp,Username,Comment,LikeCount,ReplyCount,Date,Comment_clean,Comment_size,Sentiment,Confidence


- Sudah tidak ada Sentiment yang kosong

In [20]:
# Save the labeled dataset
output_path = file_path.replace(".csv", "_labeled.csv")
top_level_comments.to_csv(output_path)

print(f"Sentiment-labeled dataset saved to {output_path}")

Sentiment-labeled dataset saved to dataset/oshibe_spv_comments_2025-01-15_labeled.csv


# Sentiment Analysis

# Feature Extraction

# Model Building