In [1]:

!pip install requests beautifulsoup4 langdetect transformers googletrans==4.0.0-rc1




# Import the required library

In [2]:
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from googletrans import Translator
from transformers import pipeline


In [3]:
summarizer = pipeline("summarization", model="facebook/mbart-large-cc25")
results = []

Device set to use mps:0


# Create a function for web scrapping the articles from multiple sites

In [4]:
def get_news_articles(base_url, article_path_substring, full_prefix, limit=2):
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    articles = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if article_path_substring in href:
            if not href.startswith('http'):
                href = full_prefix + href
            articles.append(href)
    return list(set(articles))[:limit]

# Function to extract text from article

In [5]:
def extract_article_text(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    main = soup.find('main') or soup
    paragraphs = main.find_all('p')
    content = ' '.join([p.text for p in paragraphs])
    if len(content.split()) < 50:
        raise ValueError("Article content too short.")
    return content

# Detect language using langdetect

In [6]:
import sys
!{sys.executable} -m pip install langdetect



In [7]:
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"


# Translate to English using google translate

In [8]:
def translate_to_english(text):
    translator = Translator()
    try:
        translated = translator.translate(text, dest='en')
        return translated.text if translated and translated.text else ""
    except Exception as e:
        print("Translation failed:", e)
        return ""

# Summarize using Hugging Face Transformers

In [9]:
def summarize_text(text):
    max_len = min(150, max(50, len(text.split()) // 2))
    return summarizer(text, max_length=max_len, min_length=30, do_sample=False)[0]['summary_text']


# Run for Multiple languages

In [10]:
news_sources = {
    "Hindi":    ("https://www.bbc.com/hindi", "/hindi/articles/", "https://www.bbc.com"),
    "Arabic":   ("https://www.bbc.com/arabic", "/arabic/articles/", "https://www.bbc.com"),
    "Spanish":  ("https://www.bbc.com/mundo", "/mundo/articles/", "https://www.bbc.com"),
    "Russian":  ("https://www.bbc.com/russian", "/russian/articles/", "https://www.bbc.com"),
    "French":   ("https://www.bbc.com/afrique", "/afrique/articles/", "https://www.bbc.com"),
    "German":   ("https://www.dw.com/de/top-thema/s-9012", "/de/", "https://www.dw.com")
}

for lang, (url, substring, prefix) in news_sources.items():
    print(f"\nLanguage: {lang}")
    article_urls = get_news_articles(url, substring, prefix)
    for a_url in article_urls:
        print(f"\nURL: {a_url}")
        try:
            text = extract_article_text(a_url)
            detected = detect_language(text)
            print("Detected:", detected)
            if detected != 'en':
                text = translate_to_english(text)
            if not text:
                print("Skipping due to empty translation.")
                continue
            summary = summarize_text(text)
            print("Summary:\n", summary)
            results.append({
                "source_language": lang,
                "url": a_url,
                "detected_language": detected,
                "summary": summary
            })
        except Exception as e:
            print("Error:", e)


🌐 Language: Hindi

🔗 URL: https://www.bbc.com/hindi/articles/cjrl472zz8eo
🔤 Detected: hi
Translation failed: the JSON object must be str, bytes or bytearray, not NoneType
❌ Skipping due to empty translation.

🔗 URL: https://www.bbc.com/hindi/articles/c4g8x33z6xlo
🔤 Detected: hi
📝 Summary:
 for more information about the discussion about the Sejil missile has intensified.After the discussion about the discussion about the invasion."

🌐 Language: Arabic

🔗 URL: https://www.bbc.com/arabic/articles/cn4gegejyj3o
🔤 Detected: ar
📝 Summary:
 for the first patients of course of course of the United Kingdom.Photo chest, but if you see these drugs are within these are not considered to be less than to be more than a lighter that to be more dangerous to be more dangerous to treating to disrupted to disrupted to treating to treat for a long term recovery, a period of life.

🔗 URL: https://www.bbc.com/arabic/articles/cp8e9g7j975o
🔤 Detected: ar
Translation failed: the JSON object must be str, bytes

# Store the summary in a csv file

In [12]:
import pandas as pd
if results:
    df = pd.DataFrame(results)
    df.to_csv("multilingual_summaries.csv", index=False)
    print("✅ Results saved to multilingual_summaries.csv")
else:
    print("⚠️ No results to save.")

✅ Results saved to multilingual_summaries.csv


In [16]:
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,source_language,url,detected_language,summary
0,Hindi,https://www.bbc.com/hindi/articles/c4g8x33z6xlo,hi,"for more information about the discussion about the Sejil missile has intensified.After the discussion about the discussion about the invasion."""
1,Arabic,https://www.bbc.com/arabic/articles/cn4gegejyj3o,ar,"for the first patients of course of course of the United Kingdom.Photo chest, but if you see these drugs are within these are not considered to be less than to be more than a lighter that to be more dangerous to be more dangerous to treating to disrupted to disrupted to treating to treat for a long term recovery, a period of life."
2,Spanish,https://www.bbc.com/mundo/articles/cwyne84kmdxo,es,"you can also denounced the methods, which included armored, by the terror, by, by, by, by, by, by, by, by, by, by, by, by, by, by, by, by, by, by, terror, by, by, by, by, by, by, by, by, by by, terror in terror in terror in our streets to detainment, terror in our streets to protest against immigration measures ordered by the Donald Trump government.And remember that you can receive notifications in our app.Download the.Download the latest version"
3,German,https://www.dw.com/de/themen/s-9077,de,"the attacks attack war brings suffering and destruction, affects global stability and requires international reactions, affects, affects, affects, affects, affects, affects, affects and requires international reactions"
