In [1]:
import csv
import os
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Function to write data to CSV file

def write_to_csv(data, filename):
    """Append rows to a CSV file, emitting the header row only once.

    The file is opened in append mode so repeated calls accumulate rows.
    The header is written only when the file does not exist yet or is
    empty; otherwise a second call (or a re-run of the calling cell) would
    inject another header line into the middle of the data.

    Args:
        data: Iterable of rows; each row is an iterable of values in the
            order title, link, date, content, is_fake.
        filename: Path of the CSV file to create or append to.
    """
    # Decide before opening: opening in 'a' mode creates the file.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            # Write header row
            writer.writerow(['title', 'link', 'date', 'content', 'is_fake'])
        # Write data rows
        writer.writerows(data)

In [None]:
# KOMPAS.COM News Scraping for HOAX News
#
# Walks index pages 251..294 of the "hoaks atau fakta" listing, keeps the
# entries whose bracketed title prefix is KLARIFIKASI, downloads each article
# body and appends the collected rows to a CSV file.

url = 'https://www.kompas.com/cekfakta/hoaks-atau-fakta'

# Seconds to wait for a response; without a timeout a stalled connection
# would block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Initialize list to store data
data = []

for i in range(251, 295):
  url_page = url+'/{}'.format(i)
  page = requests.get(url_page, timeout=REQUEST_TIMEOUT)
  if not page.ok:
    # Do not parse an error page as if it were a listing.
    print(f"Skipping page {i}: HTTP {page.status_code}")
    continue
  soup = BeautifulSoup(page.text, 'html')

  print(i)

  # Find all news articles
  news = soup.find_all('div', 'col-bs9-3')

  for n in news:
    title_tag = n.find('h1', 'cekfakta-list-title')
    link_tag = n.find('a', 'cekfakta-list-link')
    date_tag = n.find('p', 'cekfakta-text-date')
    if title_tag is None or link_tag is None or date_tag is None:
      # Card without the expected markup; nothing to extract.
      continue

    title_category = title_tag.get_text()
    # .get() returns None for a missing href instead of raising, which makes
    # the `link is None` check below meaningful.
    link = link_tag.get('href')
    # The text before the first comma is parsed as a dd/mm/YYYY date.
    date_str = date_tag.get_text().split(',')
    date = datetime.strptime(date_str[0].strip(), '%d/%m/%Y').date()

    # Split title and category
    part_title = title_category.split(']')
    if len(part_title) != 2:
      continue
    category = part_title[0].strip('[').strip()
    title = part_title[1].strip()

    if category != "KLARIFIKASI" or link is None:
      continue

    # Get news contents
    get_content = requests.get(link, timeout=REQUEST_TIMEOUT)
    if not get_content.ok:
      print(f"Skipping article {link}: HTTP {get_content.status_code}")
      continue
    soup_content = BeautifulSoup(get_content.text, 'html')
    content_div = soup_content.find('div', 'read__content')
    if content_div is None:
      # Article page without the expected body container.
      continue
    content = content_div.find_all('p')
    # Paragraph texts are concatenated with no separator.
    paragraph = ''.join(p.get_text() for p in content)

    # News information source
    is_fake = 0

    # Append data to list
    data.append([title, link, date, paragraph, is_fake])

# File name for the CSV
filename = 'dataset_kompas_klarifikasi_5.csv'

# Write data to CSV file
write_to_csv(data, filename)

print(f"Data has been written to {filename}")

In [22]:
# Read CSV file
# Uses the same relative path the scraping cell writes to, so the cell does
# not depend on the working directory being '/content'.
df = pd.read_csv('dataset_kompas_klarifikasi_5.csv')
df.head()

Unnamed: 0,title,link,date,content,is_fake
0,Ferdy Sambo Mengaku di Balik Penembakan Brigad...,https://www.kompas.com/cekfakta/read/2022/07/2...,2022-07-21,"Berdasarkan verifikasi Kompas.com sejauh ini, ...",0
1,Jemaah Haji Dilarang Membawa Air Zamzam,https://www.kompas.com/cekfakta/read/2022/07/2...,2022-07-21,"Berdasarkan verifikasi Kompas.com sejauh ini, ...",0
2,Irjen Ferdy Sambo Belum Dicopot dari Jabatan K...,https://www.kompas.com/cekfakta/read/2022/07/1...,2022-07-18,"Berdasarkan verifikasi Kompas.com sejauh ini, ...",0
3,Video Jemaah Haji Tewas karena Berdesakan di T...,https://www.kompas.com/cekfakta/read/2022/07/1...,2022-07-14,"Berdasarkan verifikasi Kompas.com sejauh ini, ...",0
4,Timnas U19 Indonesia Lolos ke Semifinal Piala AFF,https://www.kompas.com/cekfakta/read/2022/07/1...,2022-07-13,"Berdasarkan verifikasi Kompas.com sejauh ini, ...",0


In [23]:
# Print the column dtypes and non-null counts of the frame loaded above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    67 non-null     object
 1   link     67 non-null     object
 2   date     67 non-null     object
 3   content  67 non-null     object
 4   is_fake  67 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 2.7+ KB


In [55]:
# Merge the per-batch scrape outputs into a single dataset.
# Note: df1..df5 hold CSV file paths, not DataFrames.
df1 = "dataset_kompas_klarifikasi_1.csv"
df2 = "dataset_kompas_klarifikasi_2.csv"
df3 = "dataset_kompas_klarifikasi_3.csv"
df4 = "dataset_kompas_klarifikasi_4.csv"
df5 = "dataset_kompas_klarifikasi_5.csv"

csv_files = [df1, df2, df3, df4, df5]
data_frames = [pd.read_csv(file) for file in csv_files]
# ignore_index=True renumbers the rows 0..n-1 across all batches.
df = pd.concat(data_frames, axis=0, ignore_index=True)

# index=False keeps the RangeIndex out of the file; otherwise it is saved as
# an extra unnamed first column.
df.to_csv("dataset_kompas_klarifikasi_full.csv", index=False)

In [56]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    527 non-null    object
 1   link     527 non-null    object
 2   date     527 non-null    object
 3   content  527 non-null    object
 4   is_fake  527 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 20.7+ KB


In [76]:
import re

def extract_content(text):
    """Return the part of ``text`` that follows the first "KOMPAS.com - " marker.

    Everything after the first occurrence of the marker, up to the end of
    the string (newlines included), is returned with surrounding whitespace
    stripped. When the marker is absent, a single space is returned.
    """
    marker = "KOMPAS.com - "

    # DOTALL lets the capture group run across line breaks to the end of text.
    found = re.search(re.escape(marker) + r"(.*)", text, flags=re.DOTALL)
    if found is None:
        return ' '

    remainder = found.group(1)
    return remainder.strip()

# Strip everything up to and including the "KOMPAS.com - " marker from each
# article's content and save the result as a new CSV.

# Create a list to store the modified data
modified_data = []

# Open the CSV file in read mode.
# encoding='utf-8' avoids depending on the platform default encoding;
# newline='' is what the csv module expects so that line breaks inside
# quoted fields are handled by the reader.
with open('dataset_kompas_klarifikasi_full.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)

    # Iterate over each row in the CSV file
    for row in reader:
        title = row['title']
        link = row['link']
        date = row['date']
        content = row['content']
        is_fake = row['is_fake']

        extracted_content = extract_content(content)

        # Append data to list
        modified_data.append([title, link, date, extracted_content, is_fake])

# File name for the CSV
filename = 'dataset_kompas_klarifikasi.csv'

# Write data to CSV file (done after the input file has been closed).
# NOTE: write_to_csv opens the target in append mode, so re-running this cell
# adds the rows to the existing file again.
write_to_csv(modified_data, filename)

print(f"Data has been written to {filename}")

Data has been written to dataset_kompas_klarifikasi.csv


In [77]:
# Load the cleaned dataset back for a sanity check.
dfc = pd.read_csv("dataset_kompas_klarifikasi.csv")
# Print column dtypes and non-null counts.
dfc.info()
# Display the last rows of the frame as the cell output.
dfc.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    527 non-null    object
 1   link     527 non-null    object
 2   date     527 non-null    object
 3   content  527 non-null    object
 4   is_fake  527 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 20.7+ KB


Unnamed: 0,title,link,date,content,is_fake
522,Video 1 Juta Dosis Vaksin Dihancurkan di Nigeria,https://www.kompas.com/cekfakta/read/2022/01/2...,2022-01-26,"Beredar video di media sosial, yang menunjukka...",0
523,"Singapura Tolak Perjanjian MLA, Pasokan Gas Al...",https://www.kompas.com/cekfakta/read/2022/01/2...,2022-01-24,Beredar sebuah video di media sosial Facebook ...,0
524,Harga Minyak Goreng Tembus Rp 718.000 per Liter,https://www.kompas.com/cekfakta/read/2022/01/2...,2022-01-22,Beredar informasi di Facebook yang menyebutkan...,0
525,Delegasi Indonesia ke Israel Bahas Penanganan ...,https://www.kompas.com/cekfakta/read/2022/01/1...,2022-01-19,Beredar kabar bahwa delegasi Indonesia berkunj...,0
526,Pria Hamil dan Melahirkan Anak,https://www.kompas.com/cekfakta/read/2022/01/1...,2022-01-17,Beredar informasi keliru tentang seorang pria ...,0
