In [4]:
import csv
import glob
import json
import os
import random
import re
import time
import zipfile
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web Scraping
- Scraping articles from the English-language websites of two popular news outlets, Geo News and Samaa TV

<h3> Geo News: </h3>

In [None]:
class Geo_Scraper:
    """Scrape geo.tv category listings into a labelled table of articles.

    Every scraped article receives the next value of ``self.id``; the counter
    keeps increasing across calls made on the same instance.
    """

    def __init__(self, id_=0):
        # Id that will be assigned to the next scraped article.
        self.id = id_

    @staticmethod
    def _parse_listing_item(article):
        """Return ``(title, link)`` for one listing entry, or ``None`` if it has no usable link."""
        title_tag = article.find("a", class_="open-section")
        if title_tag is None:
            return None
        link = title_tag.get("href")
        if not link:
            return None
        return title_tag.get("title", "Title not found"), link

    def get_geo_articles(self, max_articles_per_category=100, timeout=30):
        """Scrape articles from each hard-coded geo.tv category page.

        Args:
            max_articles_per_category: Upper bound on articles kept per
                category; ``0`` (or less) keeps none.
            timeout: Seconds passed to ``requests.get`` for every request.

        Returns:
            pandas.DataFrame with columns ``id``, ``title``, ``link``,
            ``content`` and ``gold_label`` (the category key).

        A category page or article that cannot be fetched is reported on
        stdout and skipped, so articles collected so far are not lost.
        """
        geo_df = {
            "id": [],
            "title": [],
            "link": [],
            "content": [],
            "gold_label": [],
        }
        categories = {
            "sports": "https://www.geo.tv/category/sports",
            "science": "https://www.geo.tv/category/sci-tech",
            "business": "https://www.geo.tv/category/business",
            "world": "https://www.geo.tv/category/world",
            "entertainment": "https://www.geo.tv/category/entertainment"
        }
        for category, url in categories.items():
            article_count = 0
            print(f"Scraping articles for category '{category}'...")
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping category '{category}': {exc}")
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            articles = soup.find_all("li", class_="border-box")
            if not articles:
                print(f"No articles found in category '{category}'")
                continue
            for article in articles:
                # Check the cap before doing any work so a cap of 0 fetches nothing.
                if article_count >= max_articles_per_category:
                    break
                parsed = self._parse_listing_item(article)
                if parsed is None:
                    # Listing entry without the expected anchor/href.
                    continue
                title, link = parsed
                try:
                    article_response = requests.get(link, timeout=timeout)
                    article_response.raise_for_status()
                except requests.RequestException as exc:
                    print(f"\t--> Skipping article '{link}': {exc}")
                    continue
                article_soup = BeautifulSoup(article_response.text, "html.parser")
                content_div = article_soup.find("div", class_="content-area")
                paragraphs = content_div.find_all("p") if content_div else []
                content = " ".join(p.get_text(strip=True) for p in paragraphs)
                geo_df["id"].append(self.id)
                geo_df["title"].append(title)
                geo_df["link"].append(link)
                geo_df["content"].append(content)
                geo_df["gold_label"].append(category)
                self.id += 1
                article_count += 1
                print(f"\t--> Scraped article {article_count} in category '{category}'.")
            print(f"Completed scraping {article_count} articles from category '{category}'.")
        return pd.DataFrame(geo_df)

    def save_to_csv(self, df, filename="geo_articles.csv"):
        """Write ``df`` to ``filename`` as a fully quoted UTF-8 CSV.

        Args:
            df: DataFrame with columns ``id``, ``title``, ``link``,
                ``content`` and ``gold_label``.
            filename: Destination path; an existing file is overwritten.

        The header row is ``ID, Title, Link, Content, Gold Label``.
        """
        source_columns = ["id", "title", "link", "content", "gold_label"]
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file, quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow(["ID", "Title", "Link", "Content", "Gold Label"])
            writer.writerows(df[source_columns].itertuples(index=False, name=None))
        print(f"Articles saved to {filename}")


Scraping articles for category 'sports'...
	--> Scraped article 1 in category 'sports'.
	--> Scraped article 2 in category 'sports'.
	--> Scraped article 3 in category 'sports'.
	--> Scraped article 4 in category 'sports'.
	--> Scraped article 5 in category 'sports'.
	--> Scraped article 6 in category 'sports'.
	--> Scraped article 7 in category 'sports'.
	--> Scraped article 8 in category 'sports'.
	--> Scraped article 9 in category 'sports'.
	--> Scraped article 10 in category 'sports'.
	--> Scraped article 11 in category 'sports'.
	--> Scraped article 12 in category 'sports'.
	--> Scraped article 13 in category 'sports'.
	--> Scraped article 14 in category 'sports'.
	--> Scraped article 15 in category 'sports'.
	--> Scraped article 16 in category 'sports'.
	--> Scraped article 17 in category 'sports'.
	--> Scraped article 18 in category 'sports'.
	--> Scraped article 19 in category 'sports'.
	--> Scraped article 20 in category 'sports'.
	--> Scraped article 21 in category 'sports'.


<h3> Samaa TV: </h3>

In [None]:
class SamaaScraper:
    """Scrape paginated Samaa TV category pages into ``self.articles``.

    ``self.articles`` is a list of dicts with keys ``title``, ``link``,
    ``content``, ``category`` and ``id``; it accumulates across calls.
    """

    def __init__(self):
        # Kept for backward compatibility; relative links are resolved against
        # the page they were found on rather than this fixed host.
        self.base_url = "https://www.samaa.tv/"
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }
        self.articles = []

    @staticmethod
    def _parse_listing_item(article, page_url):
        """Return ``(title, absolute_link)`` for a listing entry, or ``None`` if it has no link."""
        heading = article.h3
        anchor = heading.a if heading is not None else None
        if anchor is None:
            return None
        href = anchor.get("href")
        if not href:
            return None
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page they appeared on (no doubled slashes, right host).
        return anchor.text.strip(), urljoin(page_url, href)

    def fetch_category_articles(self, category_url, category_name, max_articles=200, timeout=30):
        """
        Fetches up to ``max_articles`` articles from a category page with pagination.

        Args:
            category_url: Category landing page; ``?page=N`` is appended.
            category_name: Label stored in each article's ``category`` field.
            max_articles: Cap on articles stored for this category, counting
                any already present in ``self.articles``.
            timeout: Seconds passed to ``requests.get`` for every request.

        Pagination stops when the cap is reached, a page cannot be fetched or
        returns a non-200 status, or a page yields no new articles.
        """
        page = 1
        existing = [a for a in self.articles if a['category'] == category_name]
        collected = len(existing)
        # Links already stored for this category, so repeated pages add nothing.
        seen_links = {a['link'] for a in existing}
        while collected < max_articles:
            url = f"{category_url}?page={page}"
            print(f"Fetching page: {url}")
            try:
                response = requests.get(url, headers=self.headers, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Error fetching page {page} of {category_name}: {exc}")
                break

            print("response: ", response)
            if response.status_code == 403:
                print(f"Access forbidden (403) for {url}. Check headers or other access restrictions.")
                break
            elif response.status_code != 200:
                print(f"Error fetching page {page} of {category_name}. Status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')

            articles_in_category = []
            for article in soup.select('article.story-article'):
                parsed = self._parse_listing_item(article, url)
                if parsed is None:
                    continue
                title, link = parsed
                if link in seen_links:
                    continue
                seen_links.add(link)
                full_content = self.fetch_article_content(link, timeout=timeout)
                articles_in_category.append({
                    'title': title,
                    'link': link,
                    'content': full_content,
                    'category': category_name,
                    # Computed before the append, so ids continue from len(self.articles).
                    'id': len(self.articles) + len(articles_in_category)
                })
                if collected + len(articles_in_category) >= max_articles:
                    break

            if not articles_in_category:
                break

            self.articles.extend(articles_in_category)
            collected += len(articles_in_category)
            page += 1

    def fetch_article_content(self, url, timeout=30):
        """
        Fetches the full content of an article from its URL.

        Returns the text of all ``<p>`` elements inside
        ``div.article-content`` joined by spaces, or ``""`` (with a warning on
        stdout) when the request fails or that container is missing.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Warning: Could not fetch {url}: {exc}")
            return ""
        article_soup = BeautifulSoup(response.text, 'html.parser')
        content_div = article_soup.find('div', class_='article-content')

        if content_div:
            paragraphs = content_div.find_all('p')
            full_content = ' '.join(paragraph.text.strip() for paragraph in paragraphs)
            return full_content
        else:
            print(f"Warning: No content found for URL {url}")
            return ""

    def scrape(self, categories, max_articles_per_category=200):
        """
        Main scraping function to fetch articles from multiple categories.

        Args:
            categories: Mapping of category name to category URL.
            max_articles_per_category: Cap forwarded to
                ``fetch_category_articles`` for each category.
        """
        for category_name, category_url in categories.items():
            print(f"Scraping category: {category_name}")
            self.fetch_category_articles(category_url, category_name, max_articles=max_articles_per_category)
            print(f"Found {len([a for a in self.articles if a['category'] == category_name])} articles in {category_name}.")

    def save_to_csv(self, filename="articles.csv"):
        """
        Saves the scraped articles to a CSV file with quotes around every field.

        The header row is ``ID, Category, Title, Link, Content``; an existing
        file at ``filename`` is overwritten.
        """
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file, quoting=csv.QUOTE_ALL)
            writer.writerow(["ID", "Category", "Title", "Link", "Content"])
            for article in self.articles:
                writer.writerow([article['id'], article['category'], article['title'], article['link'], article['content']])
        print(f"Articles saved to {filename}")

Scraping category: Business
Fetching page: https://www.samaa.tv/money?page=1
response:  <Response [200]>
Fetching page: https://www.samaa.tv/money?page=2
response:  <Response [200]>
Fetching page: https://www.samaa.tv/money?page=3
response:  <Response [200]>
Fetching page: https://www.samaa.tv/money?page=4
response:  <Response [200]>
Fetching page: https://www.samaa.tv/money?page=5
response:  <Response [200]>
Found 200 articles in Business.
Scraping category: Science-Technology
Fetching page: https://www.samaa.tv/tech?page=1
response:  <Response [200]>
Fetching page: https://www.samaa.tv/tech?page=2
response:  <Response [200]>
Fetching page: https://www.samaa.tv/tech?page=3
response:  <Response [200]>
Fetching page: https://www.samaa.tv/tech?page=4
response:  <Response [200]>
Found 200 articles in Science-Technology.
Scraping category: International
Fetching page: https://www.samaa.tv/global?page=1
response:  <Response [200]>
Fetching page: https://www.samaa.tv/global?page=2
response: 

<h4> Store data scraped from the scrapers into CSV files: </h4>

In [None]:
# Geo News: scrape at most 60 articles per category, then persist them.
geo_scraper = Geo_Scraper()
df = geo_scraper.get_geo_articles(60)
geo_scraper.save_to_csv(df, "geo_articles.csv")


# Samaa TV: label -> category landing page. The labels are the same five keys
# the Geo scraper uses for its categories.
# NOTE(review): these URLs point at the urdu.samaa.tv subdomain — confirm this
# is the intended site.
categories = dict(
    business="https://urdu.samaa.tv/money",
    science="https://urdu.samaa.tv/tech",
    world="https://urdu.samaa.tv/global",
    sports="https://urdu.samaa.tv/sports",
    entertainment="https://urdu.samaa.tv/lifestyle",
)

scraper = SamaaScraper()
scraper.scrape(categories)
scraper.save_to_csv(filename="samaa_articles.csv")

<h4> Generate a combined CSV file: </h4>

In [10]:
# Target schema shared by both sources, in output column order.
desired_columns = ["ID", "Title", "Link", "Content", "Gold Label"]

# The Geo CSV is selected straight into the target column order.
geo_df = pd.read_csv("geo_articles.csv").loc[:, desired_columns]

# The Samaa CSV names its label column "Category"; rename it before selecting.
samaa_df = (
    pd.read_csv("samaa_articles.csv")
    .rename(columns={"Category": "Gold Label"})
    .loc[:, desired_columns]
)

# Stack both sources, then renumber ID from 0 so it is unique across the merge.
merged_df = pd.concat([geo_df, samaa_df], ignore_index=True)
merged_df["ID"] = range(len(merged_df))

merged_df.to_csv("combined_articles.csv", index=False)

print("Merged file saved as 'combined_articles.csv'")

Merged file saved as 'combined_articles.csv'
