# Step 1 - Scraping All Headlines

## Target HTML Elements

Start = https://turnbackhoax.id/page/39

End = https://turnbackhoax.id/page/226


Div = `<div class="mh-loop-content mh-clearfix">`

Title = `<h3 class="entry-title mh-loop-title">`

URL = `<a href>`

Meta = `<div class="mh-meta mh-loop-meta">` (container for Date and Author)

Date = `<span class="mh-meta-date updated"> <i class="far fa-clock">`

Author = `<span class="mh-meta-author author vcard"> <a class="fn">`


Preview = `<div class="mh-excerpt"> <p>`

Image = `<figure class="mh-loop-thumb"> <a> <img src>`

## Headlines Scraper

### Basic Imports

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import re

### Scrape Headlines

In [None]:
def _parse_headline(article):
    """
    Build one headline record from a single listing ``<article>`` tag.

    Every field falls back to '' when its element (or attribute) is missing,
    so one malformed article cannot abort the rest of the page.

    Args:
        article: A BeautifulSoup tag for one ``<article class="mh-loop-item">``.

    Returns:
        dict: Keys 'title', 'url', 'preview', 'image_url', 'date', 'author'.
    """
    record = {
        'title': '',
        'url': '',
        'preview': '',
        'image_url': '',
        'date': '',
        'author': ''
    }

    # Title and URL both come from the <a> inside the <h3> heading
    title_element = article.find('h3', class_='entry-title')
    if title_element and title_element.a:
        record['title'] = title_element.a.text.strip()
        # .get() rather than [] so an <a> without href yields '' instead of KeyError
        record['url'] = title_element.a.get('href', '')

    # Preview text: keep only what precedes the "[…]" read-more marker
    preview_element = article.find('div', class_='mh-excerpt')
    if preview_element and preview_element.p:
        record['preview'] = preview_element.p.text.split('[…]')[0].strip()

    # Thumbnail URL lives on the <img> nested inside <figure><a>
    figure_element = article.find('figure', class_='mh-loop-thumb')
    if figure_element and figure_element.a and figure_element.a.img:
        record['image_url'] = figure_element.a.img.get('src', '')

    # Publication date, kept as the raw text shown on the page
    date_element = article.find('span', class_='mh-meta-date')
    if date_element:
        record['date'] = date_element.text.strip()

    # Author name is the text of the link inside the author span
    author_element = article.find('span', class_='mh-meta-author')
    if author_element and author_element.a:
        record['author'] = author_element.a.text.strip()

    return record


def scrape_turnbackhoax_page(page_number, headers, timeout=30):
    """
    Scrape the headline listing at https://turnbackhoax.id/page/<page_number>/.

    Args:
        page_number: Listing page number interpolated into the URL.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        pandas.DataFrame with columns title, url, preview, image_url, date,
        author (one row per article; empty but with the same columns when the
        page lists no articles), or None when the request or parsing fails.
        Failures are printed, not raised.
    """
    url = f"https://turnbackhoax.id/page/{page_number}/"
    columns = ['title', 'url', 'preview', 'image_url', 'date', 'author']

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', class_='mh-loop-item')

        records = [_parse_headline(article) for article in articles]

        # Passing columns keeps the schema stable even when records is empty
        return pd.DataFrame(records, columns=columns)

    except requests.RequestException as e:
        print(f"\nError on page {page_number}: {e}")
        return None
    except Exception as e:
        print(f"\nUnexpected error on page {page_number}: {e}")
        return None

In [None]:
def scrape_all_pages(start_page=39, end_page=226, delay=1, headers=None):
    """
    Scrape a range of listing pages and save the combined headlines to CSV.

    Args:
        start_page: First listing page to scrape (inclusive).
        end_page: Last listing page to scrape (inclusive).
        delay: Seconds to pause between consecutive requests.
        headers: HTTP headers for every request; defaults to a desktop Chrome
            User-Agent when omitted.

    Returns:
        pandas.DataFrame of all scraped articles with an extra 'page_number'
        column, or None when no page produced any data. On success the frame
        is also written to 'turnbackhoax_data_<timestamp>.csv'.
    """
    if headers is None:
        # Headers to mimic browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    all_data = []      # one DataFrame per successfully scraped page
    failed_pages = []  # pages whose scrape returned None, reported at the end

    pbar = tqdm(range(start_page, end_page + 1), desc="Scraping pages")

    for page_num in pbar:
        pbar.set_description(f"Scraping page {page_num}")

        df = scrape_turnbackhoax_page(page_num, headers)

        if df is None:
            failed_pages.append(page_num)
        elif not df.empty:
            # Remember which listing page each article came from
            df['page_number'] = page_num
            all_data.append(df)

        # Be polite to the server; no need to wait after the final page
        if page_num != end_page:
            time.sleep(delay)

    if failed_pages:
        # Surface skipped pages so they can be retried instead of silently lost
        print(f"\nWarning: {len(failed_pages)} page(s) failed and were skipped: {failed_pages}")

    if not all_data:
        print("\nNo data was scraped successfully.")
        return None

    final_df = pd.concat(all_data, ignore_index=True)

    # Timestamped filename so repeated runs never overwrite each other
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f'turnbackhoax_data_{timestamp}.csv'
    final_df.to_csv(filename, index=False)

    print("\nScraping completed successfully!")
    print(f"Total articles scraped: {len(final_df)}")
    print(f"Data saved to: {filename}")

    return final_df


In [None]:
if __name__ == "__main__":
    # Run the full headline crawl; a None result means nothing was scraped.
    print("Starting the scraping process...")
    df = scrape_all_pages()
    if df is not None:
        # Single print call: heading line, then the first five rows
        print("\nSample of scraped data:", df.head(), sep="\n")

Starting the scraping process...


Scraping page 226: 100%|██████████| 188/188 [06:58<00:00,  2.23s/it]


Scraping completed successfully!
Total articles scraped: 3760
Data saved to: turnbackhoax_data_20250725_065758.csv

Sample of scraped data:
                                               title  \
0     [SALAH] Sidang Jokowi Dituntut Ratusan Triliun   
1  [SALAH] Hakim Kasus Harvey Moeis Terima Uang S...   
2  [SALAH] Mahfud MD Resmi Dilantik sebagai Jaksa...   
3  [SALAH] Bela Jokowi, Kader PDIP Caci Maki Hast...   
4  [PENIPUAN] Akun Facebook “mr.terimakasih berba...   

                                                 url  \
0  https://turnbackhoax.id/2025/01/07/salah-sidan...   
1  https://turnbackhoax.id/2025/01/07/salah-hakim...   
2  https://turnbackhoax.id/2025/01/07/salah-mahfu...   
3  https://turnbackhoax.id/2025/01/07/salah-bela-...   
4  https://turnbackhoax.id/2025/01/06/penipuan-ak...   

                                             preview  \
0  Tidak ditemukan informasi atau pemberitaan kre...   
1  Tidak ditemukan artikel atau informasi yang me...   
2  Mahfud MD dala




# Step 2 - Scraping Full Content

## Clean Up HTML

In [None]:
def clean_html_content(html_content):
    """
    Flatten a parsed HTML fragment into a single line of plain text.

    The earlier implementation tried to rebuild paragraphs by splitting on
    blank lines, but only after every whitespace run (newlines included) had
    already been collapsed to a single space, so that step could never match.
    The dead code is removed here; the returned text is unchanged.

    Args:
        html_content: A BeautifulSoup tag (any object exposing ``find_all``
            and ``get_text``). Its <script> and <style> descendants are
            removed from the tree in place.

    Returns:
        str: The visible text with each whitespace run collapsed to one space
        and leading/trailing whitespace stripped; '' when there is no text.
    """
    # Drop non-visible content before extracting text
    for element in html_content.find_all(['script', 'style']):
        element.decompose()

    # A space separator keeps words from adjacent tags from running together
    text = html_content.get_text(separator=' ')

    return re.sub(r'\s+', ' ', text).strip()


## Scrape Articles

In [None]:
def scrape_article(url, headers=None, timeout=30):
    """
    Scrape the category and body text of one turnbackhoax article.

    Args:
        url: Article URL to fetch.
        headers: HTTP headers for the request; defaults to a desktop Chrome
            User-Agent when omitted.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot hang a long crawl.

    Returns:
        dict with keys 'url', 'category' and 'content'. This never raises: on
        any failure 'category' is '' and 'content' holds 'Error: <message>'.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Category is the text of the first link in the categories span
        category_element = soup.find('span', class_='entry-meta-categories')
        if category_element and category_element.a:
            category = category_element.a.text.strip()
        else:
            category = ''

        # Article body; '' when the content container is not on the page
        content_div = soup.find('div', class_='entry-content mh-clearfix')
        content = clean_html_content(content_div) if content_div else ''

        return {
            'url': url,
            'category': category,
            'content': content
        }

    except Exception as e:
        # Deliberately broad: one bad article must not stop the batch
        print(f"\nError scraping {url}: {e}")
        return {
            'url': url,
            'category': '',
            'content': f'Error: {str(e)}'
        }


In [None]:
def process_google_sheet(sheet_id="", sheet_name="", delay=1):
    """
    Read article URLs from a Google Sheet, scrape each one, and save to CSV.

    Rows are selected where column H (index 7) is True; the URL is read from
    column C (index 2).

    Args:
        sheet_id: Spreadsheet id taken from the Google Sheets URL.
        sheet_name: The sheet's ``gid`` value from the URL (despite the name,
            it is used as the ``gid`` query parameter).
        delay: Seconds to pause after each article request.

    Returns:
        pandas.DataFrame with one row per scraped URL (columns url, category,
        content), or None when the sheet cannot be read or any other error
        occurs. On success the frame is also written to
        'turnbackhoax_articles_<timestamp>.csv'.
    """
    if not sheet_id:
        # Fail fast with a clear message instead of an opaque HTTP error
        print("Error processing Google Sheet: sheet_id is empty")
        return None

    csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={sheet_name}"

    try:
        print("Reading Google Sheet...")
        df = pd.read_csv(csv_url)

        # Explicit == True (not a bare mask) so blank cells in column H
        # compare False instead of breaking boolean indexing
        df_filtered = df[df.iloc[:, 7] == True]  # Column H is index 7
        urls = df_filtered.iloc[:, 2].tolist()  # Column C is index 2

        print(f"Found {len(urls)} URLs to process")

        results = []

        for url in tqdm(urls, desc="Scraping articles"):
            results.append(scrape_article(url))
            time.sleep(delay)  # Be polite to the server

        results_df = pd.DataFrame(results)

        # Timestamped filename so repeated runs never overwrite each other
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f'turnbackhoax_articles_{timestamp}.csv'
        results_df.to_csv(filename, index=False)

        print("\nScraping completed successfully!")
        print(f"Total articles scraped: {len(results_df)}")
        print(f"Data saved to: {filename}")

        return results_df

    except Exception as e:
        print(f"Error processing Google Sheet: {e}")
        return None


In [None]:
if __name__ == "__main__":
    # Run the article crawl; a None result means the sheet could not be processed.
    print("Starting the scraping process...")
    df = process_google_sheet()
    if df is not None:
        # Single print call: heading line, then the first five rows
        print("\nSample of scraped data:", df.head(), sep="\n")

Starting the scraping process...
Reading Google Sheet...
Found 3760 URLs to process


Scraping articles: 100%|██████████| 3760/3760 [2:23:42<00:00,  2.29s/it]



Scraping completed successfully!
Total articles scraped: 3760
Data saved to: turnbackhoax_articles_20250725_122137.csv

Sample of scraped data:
                                                 url               category  \
0  https://turnbackhoax.id/2025/01/07/salah-sidan...  Fitnah / Hasut / Hoax   
1  https://turnbackhoax.id/2025/01/07/salah-hakim...  Fitnah / Hasut / Hoax   
2  https://turnbackhoax.id/2025/01/07/salah-mahfu...  Fitnah / Hasut / Hoax   
3  https://turnbackhoax.id/2025/01/07/salah-bela-...  Fitnah / Hasut / Hoax   
4  https://turnbackhoax.id/2025/01/06/penipuan-ak...  Fitnah / Hasut / Hoax   

                                             content  
0  Tidak ditemukan informasi atau pemberitaan kre...  
1  Tidak ditemukan artikel atau informasi yang me...  
2  Mahfud MD dalam akun Instagram resminya menjel...  
3  Tidak ditemukan informasi atau pemberitaan kre...  
4  Akun resmi Sergei Domogatski atau yang dikenal...  


# Step 3 - Cleansing Article Content

## Data Cleaning

In [None]:
# Google Sheet coordinates (fill in before running); the sheet is fetched
# through its CSV export endpoint.
sheet_id = ""
sheet_name = ""  # gid from the URL
csv_url = "https://docs.google.com/spreadsheets/d/{}/export?format=csv&gid={}".format(
    sheet_id, sheet_name
)

# Load the exported sheet into a DataFrame
df = pd.read_csv(csv_url)

def process_text(text):
    """
    Put runs of '=' separator characters on their own line.

    Two passes are applied in order: first contiguous runs of two or more '='
    (e.g. "====="), then whitespace-spaced runs of three or more '='
    (e.g. "= = ="). Each match is wrapped in a leading and trailing newline.
    Non-string values (such as NaN from empty cells) are returned unchanged.
    """
    if isinstance(text, str):
        wrap_in_newlines = r'\n\g<0>\n'
        for separator_pattern in (r'={2,}', r'(?:=\s+){2,}='):
            text = re.sub(separator_pattern, wrap_in_newlines, text)
    return text

# Process column G (index 6) from the second data row onward.
# NOTE(review): read_csv already consumes the header row, so iloc[1:] skips
# the first data row. If every data row should be processed this should be
# iloc[:, 6] -- confirm against the sheet layout.
df.iloc[1:, 6] = df.iloc[1:, 6].apply(process_text)

# Save to CSV
df.to_csv('processed_sheet.csv', index=False)

# Save to text file
with open('processed_column_g.txt', 'w', encoding='utf-8') as f:
    for text in df.iloc[1:, 6]:
        # process_text passes non-strings (e.g. NaN from blank cells) through
        # unchanged; skip them, since text + '\n\n' would raise TypeError
        if isinstance(text, str):
            f.write(text + '\n\n')

print("Sample of processed text:")
# Guard the sample lookup so a sheet with fewer than two rows does not raise
if len(df) > 1:
    print(df.iloc[1, 6])

Sample of processed text:
Tim Kalimasada
