In [None]:
pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.12.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [None]:
#%% Import Libraries
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
import unicodedata
import requests
from bs4 import BeautifulSoup
from googletrans import Translator

In [None]:
#%% Mount Google Drive
# Mounts Google Drive at /content/drive through the google.colab helper.
# The input CSV, the processed-batch folder and the checkpoint file configured
# in this notebook all live under this mount point, so this cell has to run
# before any cell that reads or writes those paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#%% Translator Initialization
# Single module-level googletrans Translator. translate_content() reads this
# global for both language detection and translation.
# NOTE(review): process_row runs in a ThreadPoolExecutor, so this one instance
# is used from several worker threads at once - confirm that is safe.
translator = Translator()

In [None]:
#%% File Paths
# All paths point into the mounted Google Drive (/content/drive), so the Drive
# mount cell must have run first.
input_file = '/content/drive/MyDrive/Colab Notebooks/epiwatch-latest.csv'  # Input CSV in Google Drive
output_folder = '/content/drive/MyDrive/Colab Notebooks/processed_batches'  # Folder to save processed files
# Create the output folder on first use; exist_ok keeps re-runs of this cell harmless.
os.makedirs(output_folder, exist_ok=True)
# Holds a single integer (a batch index) written by save_checkpoint and read by load_checkpoint.
checkpoint_file = '/content/drive/MyDrive/Colab Notebooks/checkpoint_china.txt'  # File to store the last processed batch

In [None]:
#%% Function to load the checkpoint
def load_checkpoint():
    """Return the batch index stored in the checkpoint file.

    Returns:
        int: The saved batch index. Falls back to 0 (start from the first
        batch) when the checkpoint file does not exist, is empty, or does not
        contain a valid integer, so a damaged checkpoint never prevents the
        notebook from resuming.
    """
    if not os.path.exists(checkpoint_file):
        return 0  # No checkpoint yet: start from the beginning

    with open(checkpoint_file, 'r') as file:
        raw_value = file.read().strip()

    try:
        return int(raw_value)
    except ValueError:
        # save_checkpoint opens the file in 'w' mode, which truncates it first;
        # an interrupted write can therefore leave it empty or partial.
        print(f"Warning: ignoring unreadable checkpoint value {raw_value!r}; starting from batch 0.")
        return 0

#%% Function to save the checkpoint
def save_checkpoint(batch_index):
    """Persist ``batch_index`` as text, replacing any previous checkpoint.

    Args:
        batch_index: Batch index to record; it is written via ``str()``.
    """
    serialized = str(batch_index)
    # Mode 'w' truncates the file, so only the latest index is ever kept.
    with open(checkpoint_file, 'w') as handle:
        handle.write(serialized)

In [None]:
#%% Function to clean text
def clean_content(content):
    """Normalize text and flatten it onto a single line.

    Args:
        content: Text to clean.

    Returns:
        The NFKD-normalized text with curly double/single quotes replaced by
        their straight ASCII forms, newlines replaced by spaces, and leading
        and trailing whitespace removed.
    """
    # One translation table covers every single-character substitution.
    substitutions = str.maketrans({
        '“': '"',
        '”': '"',
        "‘": "'",
        "’": "'",
        "\n": " ",
    })
    normalized = unicodedata.normalize('NFKD', content)
    return normalized.translate(substitutions).strip()

#%% Function to fetch content with timeout
def fetch_full_content(url, timeout=3):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to ``requests.get(timeout=...)``. It limits how
            long requests waits for the connection and for the server to send
            data; it is not a cap on the total download time.

    Returns:
        str: The paragraph texts joined with single spaces and stripped.
        Failures are reported as strings starting with ``"Error"`` instead of
        raising:

        * ``"Error: No content found"`` - the page has no ``<p>`` elements or
          they contain no text.
        * ``"Error <status>: Unable to fetch content"`` - non-200 response.
        * ``"Error: <message>"`` - a ``requests`` exception occurred.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return f"Error {response.status_code}: Unable to fetch content"

        # The parser is given the raw bytes, so it determines the document
        # encoding itself. Setting response.encoding would only affect
        # response.text, which is not used here.
        soup = BeautifulSoup(response.content, 'html.parser')
        text = ' '.join(p.get_text() for p in soup.find_all('p')).strip()
        # Whitespace-only paragraphs count as "no content" as well.
        return text if text else "Error: No content found"
    except requests.exceptions.RequestException as e:
        return f"Error: {str(e)}"

#%% Function to translate content
def translate_content(content):
    """Return ``content`` in English, translating it when necessary.

    Empty values and strings starting with ``"Error"`` are passed through
    untouched. Otherwise the language is detected with the module-level
    ``translator``; English text is returned as is and anything else is
    translated to English. Any exception is reported as a
    ``"Translation Error: ..."`` string rather than raised.
    """
    try:
        # Nothing to translate: empty input or an upstream error message.
        if not content:
            return content
        if content.startswith("Error"):
            return content

        source_lang = translator.detect(content).lang
        if source_lang == 'en':
            return content  # Already English

        result = translator.translate(content, src=source_lang, dest='en')
        return result.text
    except Exception as err:
        return "Translation Error: " + str(err)

#%% Combined function to process a row
def process_row(row):
    """Produce the translated page text for one record.

    Args:
        row: Mapping-like record with a ``'url'`` entry.

    Returns:
        str: The result of ``translate_content(fetch_full_content(url))``,
        ``"Error: Invalid or empty URL"`` when the URL is not a non-blank
        string, or ``"Error: <message>"`` if fetching/translating raised.
    """
    url = row['url']
    has_usable_url = isinstance(url, str) and bool(url.strip())
    if not has_usable_url:
        return "Error: Invalid or empty URL"

    try:
        return translate_content(fetch_full_content(url))
    except Exception as err:
        return "Error: " + str(err)

In [None]:
#%% Process Data in Batches with Checkpoints
# Read the input CSV, keep only the rows of the selected country, and group by
# country so the batch loop can iterate one country at a time.
df = pd.read_csv(input_file).loc[lambda frame: frame['country'].isin(['China'])]
grouped = df.groupby('country')

# Batch index recorded by the previous run (0 when there is no checkpoint)
start_batch_index = load_checkpoint()
print(f"Resuming from batch index {start_batch_index}...")

# File names currently present in the output folder, as of when this cell runs
processed_batches = {*os.listdir(output_folder)}

Resuming from batch index 55...


In [None]:
# Number of rows fetched and translated per output file.
batch_size = 100

# Once a batch fails, the checkpoint has to stay on that batch. If later
# successful batches kept overwriting it, the next run would start past the
# failed batch and its output would never be produced.
# NOTE(review): the checkpoint is a single index with no country attached, so
# resuming is only well-defined while one country is selected.
checkpoint_frozen = False

for country, group in grouped:
    print(f"Processing country: {country}")
    group = group.reset_index(drop=True)  # Reset index for clean batching

    total_records = len(group)
    # Ceiling division: a final partial batch still counts as a batch.
    total_batches = (total_records + batch_size - 1) // batch_size

    for batch_index, i in enumerate(range(0, total_records, batch_size)):
        # Everything before the checkpoint was handled by an earlier run.
        if batch_index < start_batch_index:
            continue

        try:
            # Output files are numbered from 1, batch indices from 0.
            batch_file = os.path.join(output_folder, f"{country}_batch_{batch_index + 1}.csv")

            # Check the file system directly rather than a listing taken in an
            # earlier cell, which may be out of date when this cell is re-run.
            if os.path.exists(batch_file):
                print(f"Skipping already processed batch: {batch_file}")
                continue

            # Copy the slice so the new column is not written into `group`.
            batch = group.iloc[i:i + batch_size].copy()

            # Fetch and translate rows concurrently; executor.map returns the
            # results in input order, so they line up with the batch rows.
            with ThreadPoolExecutor(max_workers=5) as executor:
                batch['Translated_Content'] = list(executor.map(process_row, [row for _, row in batch.iterrows()]))

            # Save the processed batch to an individual CSV file
            batch.to_csv(batch_file, index=False)
            print(f"Batch {batch_index + 1}/{total_batches} for country {country} processed and saved to {batch_file}.")

            # Advance the checkpoint only while no earlier batch has failed.
            if not checkpoint_frozen:
                save_checkpoint(batch_index)

        except Exception as e:
            print(f"Error processing batch {batch_index + 1}/{total_batches} for country {country}: {str(e)}")
            # Pin the checkpoint to the first failed batch so the next run
            # retries it; batches saved after it are skipped by the file check.
            if not checkpoint_frozen:
                save_checkpoint(batch_index)
                checkpoint_frozen = True

Processing country: China
Skipping already processed batch: /content/drive/MyDrive/Colab Notebooks/processed_batches/China_batch_56.csv
Batch 57/61 for country China processed and saved to /content/drive/MyDrive/Colab Notebooks/processed_batches/China_batch_57.csv.
Batch 58/61 for country China processed and saved to /content/drive/MyDrive/Colab Notebooks/processed_batches/China_batch_58.csv.
Batch 59/61 for country China processed and saved to /content/drive/MyDrive/Colab Notebooks/processed_batches/China_batch_59.csv.
Batch 60/61 for country China processed and saved to /content/drive/MyDrive/Colab Notebooks/processed_batches/China_batch_60.csv.
Batch 61/61 for country China processed and saved to /content/drive/MyDrive/Colab Notebooks/processed_batches/China_batch_61.csv.
