# LSA Topic Modelling

## Crawling YouTube

In [None]:
!pip install selenium --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.9/384.9 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

def get_youtube_comments(video_id, api_key):
    """Fetch the text of every top-level comment on a YouTube video.

    Pages through the YouTube Data API v3 ``commentThreads.list`` endpoint,
    following ``nextPageToken`` until the API stops returning one.

    Args:
        video_id: ID of the video whose comment threads are listed.
        api_key: Developer key handed to the API client.

    Returns:
        A list with the ``textDisplay`` string of each top-level comment.
        If the API raises ``HttpError`` the error is printed and the comments
        collected up to that point (possibly an empty list) are returned.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    comments = []
    page_token = None

    try:
        while True:
            request_args = {
                'part': 'snippet',
                'videoId': video_id,
                # The API accepts at most 100 results per page.
                'maxResults': 100,
                'textFormat': 'plainText',
            }
            # Only send a page token once the API has handed one out.
            if page_token:
                request_args['pageToken'] = page_token

            response = youtube.commentThreads().list(**request_args).execute()

            comments.extend(
                item['snippet']['topLevelComment']['snippet']['textDisplay']
                for item in response.get('items', [])
            )

            # No token means this was the last page.
            page_token = response.get('nextPageToken')
            if not page_token:
                break

    except HttpError as e:
        # Keep whatever was fetched before the failure instead of losing it.
        print(f'Error: {e}')

    return comments

def save_comments_to_excel(comments, file_path):
    """Write the comments to ``file_path`` as a one-column CSV file.

    Despite the function name, the output is CSV (``DataFrame.to_csv``),
    not an Excel workbook. The single column is named ``Comments`` and the
    DataFrame index is not written.

    Args:
        comments: Sequence of comment strings, one per output row.
        file_path: Destination path of the CSV file.
    """
    frame = pd.DataFrame(data=comments, columns=["Comments"])
    frame.to_csv(file_path, index=False)

# Set the API key and the video ID.
# The key is read from the environment so that the secret is never stored in
# the notebook itself; export YOUTUBE_API_KEY before running this cell.
import os

api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        'Set the YOUTUBE_API_KEY environment variable before running this cell.'
    )
video_id = 'KtntKGlmuZw'

# Fetch the comments through the YouTube Data API
comments = get_youtube_comments(video_id, api_key)

# Save the comments to a CSV file
file_path = 'youtube_comment.csv'
save_comments_to_excel(comments, file_path)


In [None]:
import re, string
import numpy as np

# Text Cleaning
def cleaning(text):
    """Normalise a raw comment into lowercase, space-separated words.

    Steps, in order: drop HTML tags and HTML entities, lowercase, replace
    ASCII punctuation with spaces, drop any remaining non-word symbols,
    replace digits with spaces, collapse runs of whitespace and trim.

    Args:
        text: The raw comment. Non-string values are converted with ``str``.
            ``None``, a float NaN, or text that is exactly ``"nan"`` after
            cleaning is treated as a missing value.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # HTML tag and entity removal
    text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(text))

    # Case folding and trimming
    text = text.lower().strip()

    # ASCII punctuation becomes a space so neighbouring words stay separated
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Remaining non-word symbols (emoji, typographic quotes, ...) are dropped
    text = re.sub(r'[^\w\s]', '', text)

    # Number removal
    text = re.sub(r'\d', ' ', text)

    # Collapse repeated whitespace and trim what the substitutions left behind
    text = re.sub(r'\s+', ' ', text).strip()

    # A stringified NaN is a missing value. Only the whole text is checked,
    # so words that merely contain "nan" (e.g. "pimpinan") are left intact.
    if text == 'nan':
        return ''

    return text

## LSA Topic Modelling

### Read Data

In [None]:
# Suppress every warning raised from this point on so cell output stays clean
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the comment dataset (a CSV file fetched from GitHub) into a DataFrame
import pandas as pd
df=pd.read_csv('https://raw.githubusercontent.com/billymorgen/dataset/main/youtube_comment.csv')
# Bare expression: the notebook renders the DataFrame as the cell output
df

Unnamed: 0,Comments
0,Apakah diantara mereka ada yg berpidato tentan...
1,"ANTARA ANIS ATAU PRABOWO AJA DEH, BUKAN DARI Y..."
2,"Kalo bener 4 capres bakal maju, rakyat indones..."
3,"selama mak banteng masih mentereng, susah buat..."
4,Presiden harus orang Jawa asli ganjar pranowo ...
...,...
1382,Mending balikin lagi ke dpr buat pilih preside...
1383,Capret calon kampret
1384,"Cuma ada 2 paslon, Ganjar (PDI-P, PPP, HANURA,..."
1385,Gua nyapres juga ah.\nCarany gimana sih gan-ag...


### Modeling

In [None]:
# Download the NLTK stop-word corpus; quiet=True suppresses the progress output
import nltk
nltk.download('stopwords', quiet=True)

True

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the corpus reader under an alias: binding the word list to the name
# `stopwords` would otherwise overwrite the reader and make this cell fail
# with an AttributeError when it is run a second time.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('indonesian')

# Build the document x term TF-IDF matrix
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                        stop_words=stopwords,
                        tokenizer = tokenizer.tokenize)

# Empty CSV cells are read as NaN, which the vectorizer rejects; treat them as
# empty documents so every row of `df` keeps its position in the matrix.
tfidf_matrix = vectorizer.fit_transform(df['Comments'].fillna('').astype(str))

# Decompose the matrix with truncated SVD (LSA) into 4 topics.
# The default solver is randomized, so the seed is fixed to make the topics
# reproducible between runs.
svd_model = TruncatedSVD(n_components=4, random_state=42)
lsa_matrix = svd_model.fit_transform(tfidf_matrix)

### Bobot Kata Terhadap Topik

In [None]:
# bobot kata terhadap masing masing topik
terms = vectorizer.get_feature_names_out()

for index, component in enumerate(svd_model.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:2]
    print("Topic "+str(index)+": ",top_terms_key)

Topic 0:  [('prabowo', 0.5326979432934852), ('pilih', 0.31263863056508123)]
Topic 1:  [('prabowo', 0.7781893102076874), ('mahfud', 0.06957067219156735)]
Topic 2:  [('anies', 0.703828102448258), ('baswedan', 0.31330451135932075)]
Topic 3:  [('anis', 0.677602350099083), ('ganjar', 0.28789774870791524)]


### Bobot Setiap Topik Terhadap Dokumen

In [None]:
# Weight of every topic for each document
# Derive the column names from the matrix width so this cell keeps working
# when the number of SVD components is changed.
topic_columns = [f"Topik {i}" for i in range(lsa_matrix.shape[1])]

# Reuse df's index so the concat below pairs each comment with its own row
# even if df no longer has a default 0..n-1 index.
df_lsa = pd.DataFrame(lsa_matrix, columns=topic_columns, index=df.index)
df_lsa = pd.concat([df["Comments"], df_lsa], axis=1)

# Dominant topic = position of the largest weight in each row
df_lsa['Topik'] = lsa_matrix.argmax(axis=1)

df_lsa

Unnamed: 0,Comments,Topik 0,Topik 1,Topik 2,Topik 3,Topik
0,Apakah diantara mereka ada yg berpidato tentan...,0.055630,-0.032881,-0.007068,0.014140,0
1,"ANTARA ANIS ATAU PRABOWO AJA DEH, BUKAN DARI Y...",0.367183,0.134961,0.077692,0.286763,0
2,"Kalo bener 4 capres bakal maju, rakyat indones...",0.116113,-0.072824,0.001360,0.024628,0
3,"selama mak banteng masih mentereng, susah buat...",0.020705,-0.013969,-0.004637,-0.003093,0
4,Presiden harus orang Jawa asli ganjar pranowo ...,0.113704,-0.031043,0.025905,0.069285,0
...,...,...,...,...,...,...
1382,Mending balikin lagi ke dpr buat pilih preside...,0.184176,-0.090824,-0.002779,0.026413,0
1383,Capret calon kampret,0.021041,-0.009816,0.007532,0.006664,0
1384,"Cuma ada 2 paslon, Ganjar (PDI-P, PPP, HANURA,...",0.174693,0.047662,0.089633,-0.023258,0
1385,Gua nyapres juga ah.\nCarany gimana sih gan-ag...,0.016710,-0.005046,0.004034,0.009179,0


In [None]:
# Count how many documents were assigned to each dominant topic
df_lsa['Topik'].value_counts()

0    1021
3     155
2     130
1      81
Name: Topik, dtype: int64