In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime

In [52]:
def custom_strptime(date_str):
    """Parse a date such as ``'28 Sep 2023 18:00 WIB'`` into a ``datetime``.

    The string is split on whitespace and only the first four tokens are
    used: day, abbreviated month name, four-digit year and ``HH:MM`` time.
    Any further tokens (for example a ``WIB`` suffix) are ignored, so the
    returned ``datetime`` is naive.

    Month abbreviations are matched case-insensitively. The Indonesian
    abbreviations are supported (``Agu``, ``Ags`` and ``Agt`` all mean
    August), as well as the English ones that differ from them
    (``May``, ``Aug``, ``Oct``, ``Dec``).

    Args:
        date_str: Text of the form ``'<day> <month> <year> <HH:MM> [...]'``.

    Returns:
        datetime: The parsed date and time.

    Raises:
        IndexError: If ``date_str`` has fewer than four tokens.
        KeyError: If the month token is not a known abbreviation.
        ValueError: If day, year or time do not match ``%d``, ``%Y``, ``%H:%M``.
    """
    # Keys are lower-case; the month token is lower-cased before the lookup.
    months = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'mei': '05', 'may': '05', 'jun': '06', 'jul': '07',
        'agu': '08', 'ags': '08', 'agt': '08', 'aug': '08',
        'sep': '09', 'okt': '10', 'oct': '10', 'nov': '11',
        'des': '12', 'dec': '12',
    }
    date_parts = date_str.split()
    # Rebuild the date with a numeric month so strptime does not depend on
    # the process locale for month names.
    numeric_date = (
        f"{date_parts[0]} {months[date_parts[1].lower()]} "
        f"{date_parts[2]} {date_parts[3]}"
    )
    return datetime.strptime(numeric_date, '%d %m %Y %H:%M')

In [53]:
def scrape_detik_headlines(query, total_pages=4, timeout=30, save_csv=True):
    """Scrape detik.com search results into a DataFrame of headlines.

    Pages ``1..total_pages`` of the search listing (sorted by time) are
    fetched and every ``<article>`` element is turned into one row.

    Args:
        query: Search term. It is inserted into the URL as-is, so it must
            already be URL-safe (e.g. ``"kota+tasikmalaya"``). It is also used
            verbatim in the CSV file name.
        total_pages: Number of result pages to fetch, starting at page 1.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error, instead of blocking indefinitely.
        save_csv: When true (the default), the result is also written to
            ``"<query>_headlines.csv"`` in the current working directory.

    Returns:
        pandas.DataFrame: Columns ``url``, ``title``, ``date`` and ``summary``.
        The columns are present even when no article was found. ``date`` is
        ``pd.NaT`` when the date element is missing or cannot be parsed, and
        ``summary`` is ``""`` when the article has no ``<p>`` element.
        Articles without a link or a title are skipped.
    """
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    columns = ["url", "title", "date", "summary"]

    # Rows collected across all pages; converted to a DataFrame once at the end.
    data = []

    # "{{}}" leaves a literal "{}" placeholder for the page number.
    base_url = f"https://www.detik.com/search/searchall?query={query}&sortby=time&page={{}}"

    # One session for all pages so the underlying connection can be reused
    # and is released when the block exits.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        for page in range(1, total_pages + 1):
            webpage = session.get(base_url.format(page), timeout=timeout)
            soup = BeautifulSoup(webpage.content, "lxml")

            for article in soup.find_all('article'):
                link_tag = article.find('a')
                title_tag = article.find('h2', attrs={'class': 'title'})
                link = link_tag.get('href') if link_tag is not None else None
                if not link or title_tag is None:
                    # Not a regular headline entry; skip it instead of
                    # aborting the whole scrape.
                    continue

                # get_text() also works when the tag has several children,
                # where `.string` would be None.
                judul = title_tag.get_text(" ", strip=True)

                date_obj = pd.NaT
                date_tag = article.find('span', attrs={'class': 'date'})
                if date_tag is not None:
                    # Keep the part after the first comma (the text before it
                    # is dropped); fall back to the whole text if no comma.
                    pieces = date_tag.get_text(strip=True).split(",")
                    date_text = (pieces[1] if len(pieces) > 1 else pieces[0]).strip()
                    try:
                        # Custom parser that maps month abbreviations to numbers.
                        date_obj = custom_strptime(date_text)
                    except (KeyError, IndexError, ValueError):
                        date_obj = pd.NaT

                summary_tag = article.find('p')
                summary = summary_tag.get_text(" ", strip=True) if summary_tag is not None else ""

                data.append({"url": link, "title": judul, "date": date_obj, "summary": summary})

    # Explicit columns keep the schema stable even when `data` is empty.
    df = pd.DataFrame(data, columns=columns)

    if save_csv:
        df.to_csv(f"{query}_headlines.csv", index=False, encoding='utf-8')

    return df

In [54]:
# Example usage of the function: scrape 20 result pages for the query below
# and preview the first rows of the resulting DataFrame.
query = "kota+tasikmalaya"
total_pages = 20
df = scrape_detik_headlines(query, total_pages)
df.head()

Unnamed: 0,url,title,date,summary
0,https://www.detik.com/jabar/berita/d-6954906/v...,Viral Foto Siswa SMP di Tasik Injak Kepala Tem...,2023-09-28 18:00:00,Warga Tasikmalaya dihebohkan dengan beredarnya...
1,https://www.detik.com/jabar/hukum-dan-kriminal...,Polisi Ungkap Aplikasi Walla Pemicu Pelajar Ba...,2023-09-28 14:30:00,Kasus pelajar Bandung jadi korban sodomi 2 pri...
2,https://www.detik.com/jabar/hukum-dan-kriminal...,Kocar-kacir Pejudi Adu Muncang di Tasikmalaya ...,2023-09-28 11:41:00,Polisi menggerebek arena permainan adu kemiri ...
3,https://www.detik.com/jabar/hukum-dan-kriminal...,"Pura-pura Beli Test Pack, Maling Gondol 2 Kota...",2023-09-28 01:30:00,Dua buah kotak amal di apotek Padayungan Kota ...
4,https://www.detik.com/jabar/hukum-dan-kriminal...,Jejak Kriminal Bule Amerika Tusuk Mati Mertua ...,2023-09-27 17:45:00,Bule asal Amerika Serikat Arthur Leigh Welohr ...


In [56]:
# Export the scraped headlines to an Excel workbook without the index column.
# NOTE(review): presumably the "data" directory must already exist — confirm.
df.to_excel("data/berita_detik.xlsx", index=False)

In [63]:
# Display the scraped headlines DataFrame.
df

Unnamed: 0,url,title,date,summary
0,https://www.detik.com/jabar/berita/d-6954906/v...,Viral Foto Siswa SMP di Tasik Injak Kepala Tem...,2023-09-28 18:00:00,Warga Tasikmalaya dihebohkan dengan beredarnya...
1,https://www.detik.com/jabar/hukum-dan-kriminal...,Polisi Ungkap Aplikasi Walla Pemicu Pelajar Ba...,2023-09-28 14:30:00,Kasus pelajar Bandung jadi korban sodomi 2 pri...
2,https://www.detik.com/jabar/hukum-dan-kriminal...,Kocar-kacir Pejudi Adu Muncang di Tasikmalaya ...,2023-09-28 11:41:00,Polisi menggerebek arena permainan adu kemiri ...
3,https://www.detik.com/jabar/hukum-dan-kriminal...,"Pura-pura Beli Test Pack, Maling Gondol 2 Kota...",2023-09-28 01:30:00,Dua buah kotak amal di apotek Padayungan Kota ...
4,https://www.detik.com/jabar/hukum-dan-kriminal...,Jejak Kriminal Bule Amerika Tusuk Mati Mertua ...,2023-09-27 17:45:00,Bule asal Amerika Serikat Arthur Leigh Welohr ...
...,...,...,...,...
175,https://www.detik.com/jabar/berita/d-6851824/1...,16 Kepala Daerah di Jabar Bakal Lengser Akhir ...,2023-08-01 06:45:00,Sebanyak 16 kepala daerah di Jabar akan habis ...
176,https://www.detik.com/jabar/berita/d-6851655/j...,Jabar Hari Ini: Viral Ritual Syiah di Bandung,2023-07-31 22:10:00,Beragam peristiwa terjadi di Jawa Barat hari i...
177,https://www.detik.com/jabar/berita/d-6851249/h...,Hasil Evaluasi Disdik Tasik Soal Pungutan Sisw...,2023-07-31 20:00:00,Dinas Pendidikan Kota Tasikmalaya telah selesa...
178,https://www.detik.com/jabar/berita/d-6850758/r...,Ridwan Kamil Usulkan Nama PJ Walkot Bandung ke...,2023-07-31 17:00:00,Gubernur Jabar Ridwan Kamil telah mengusulkan ...


Topic Modelling

In [59]:
# Download the NLTK stop-word corpus read by removeStopword.
# NOTE(review): word_tokenize is also used in this notebook and presumably
# needs NLTK tokenizer data (e.g. 'punkt') that is not downloaded here — confirm.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/superapp-
[nltk_data]     research/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
import re, string, unicodedata  #modul regular expression
import nltk
from nltk import word_tokenize, sent_tokenize  #Paket ini membagi teks input menjadi kata-kata.,                                  
from nltk.corpus import stopwords

In [62]:
#preprocessing
# Cache of stop-word sets keyed by language, so the set is built only once
# instead of on every call (the function is applied once per row).
_STOPWORDS_BY_LANGUAGE = {}


def removeStopword(str):
    """Remove Indonesian stop words from a text.

    The text is tokenized with ``word_tokenize``; tokens that appear in
    NLTK's ``'indonesian'`` stop-word list are dropped and the remaining
    tokens are joined with single spaces. Matching is exact, so the caller
    is expected to lower-case the text first if that is desired.

    Args:
        str: Text to filter. The name shadows the built-in ``str`` inside
            this function; it is kept so existing keyword callers still work.

    Returns:
        The remaining tokens joined by single spaces (``''`` if none remain).
    """
    stop_words = _STOPWORDS_BY_LANGUAGE.get('indonesian')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('indonesian'))
        _STOPWORDS_BY_LANGUAGE['indonesian'] = stop_words
    return ' '.join(w for w in word_tokenize(str) if w not in stop_words)
def removeSentence(str):
    """Blank out texts that contain at most one word.

    Words are counted by splitting on whitespace. A text with two or more
    words is returned unchanged; anything shorter (including an empty or
    whitespace-only text) becomes the empty string.
    """
    has_several_words = len(str.split()) > 1
    return str if has_several_words else ''
def cleaning(str):
    """Normalize a raw text for later tokenization.

    Steps, in order:
      1. Decompose Unicode (NFKD) and drop every non-ASCII character.
      2. Remove URLs.
      3. Replace punctuation (any non-word character, and ``_``) with a space.
      4. Remove every whitespace-delimited token that contains a digit.
      5. Lower-case the text.
      6. Collapse runs of whitespace into a single space.

    Args:
        str: Text to clean. The name shadows the built-in ``str`` inside this
            function; it is kept so existing keyword callers still work.

    Returns:
        The cleaned, lower-cased text without leading or trailing whitespace.
    """
    # Remove non-ASCII characters (accents are split off by NFKD first, so
    # the base letter survives).
    str = unicodedata.normalize('NFKD', str).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Remove URLs (pattern kept verbatim).
    str = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', str)
    # Replace punctuation and underscores with spaces.
    str = re.sub(r'[^\w]|_', ' ', str)
    # Remove every token that contains a digit. Raw string: "\S" and "\d" are
    # invalid escape sequences in a normal string literal.
    str = re.sub(r"\S*\d\S*", "", str).strip()
    # Remove stand-alone numbers (kept from the original pipeline; the
    # previous step has already removed all tokens containing digits).
    str = re.sub(r"\b\d+\b", " ", str)
    # Lower-case.
    str = str.lower()
    # Collapse consecutive whitespace into one space (raw string, as above).
    str = re.sub(r'[\s]+', ' ', str)

    return str
def preprocessing(str):
    """Run the full text pipeline on one text.

    The text goes through ``removeSentence``, then ``cleaning``, then
    ``removeStopword``, and the final result is returned.
    """
    return removeStopword(cleaning(removeSentence(str)))

In [65]:
# Smoke-test the preprocessing pipeline on a few sample sentences.
# The last sample has a single word, so removeSentence blanks it out and an
# empty line is printed for it.
sentences = ["dimana lokasi kuliner tasikmalaya yang murah","alamat tasik dimana sih, yang enak","s"]
for st in sentences:
    r = preprocessing(st)
    print(r)

dimana lokasi kuliner murah
alamat dimana sih enak



In [68]:
import pandas as pd

# Apply the preprocessing pipeline to every value of the 'summary' column
txt = df['summary'].apply(preprocessing)

# Save the preprocessed summaries to a new CSV file (index column omitted)
txt.to_csv('clean-data.csv', index=False)

In [69]:
# Display the preprocessed summaries.
txt

0      warga dihebohkan beredarnya foto siswa smp men...
1      pelajar bandung korban sodomi pria percakapan ...
2      polisi menggerebek arena permainan adu kemiri ...
3      buah kotak amal apotek padayungan kota digondo...
4      bule amerika serikat arthur leigh welohr ditah...
                             ...                        
175          kepala daerah jabar habis jabatannya daerah
176    beragam peristiwa jawa barat senin viral ritua...
177    dinas pendidikan kota selesai evaluasi punguta...
178    gubernur jabar ridwan kamil mengusulkan nama p...
179    massa guru madrasah kota menggelar aksi demons...
Name: summary, Length: 180, dtype: object