<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/news_scraper_gui_full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import feedparser
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from datetime import datetime
from docx import Document
from newspaper import Article  # ← NEW

# Display name -> RSS feed URL for every news source the app polls.
# Iteration order of this mapping is the order in which feeds are fetched.
FEEDS = {
    "BBC World": "http://feeds.bbci.co.uk/news/world/rss.xml",
    "CNN World": "http://rss.cnn.com/rss/edition_world.rss",
    "Reuters World": "http://feeds.reuters.com/Reuters/worldNews",
    "Al Jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
}

# NEW: Function to extract full article content
def get_full_article_content(url):
    """Return the stripped body text of the article at *url*.

    The page is downloaded and parsed with ``newspaper.Article``. This is a
    best-effort helper: any exception raised along the way is reported on
    stdout and an empty string is returned, so callers can fall back to
    another text source.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        body_text = page.text.strip()
    except Exception as err:
        # Deliberately broad: one bad article must not abort a whole fetch.
        print(f"Error downloading article: {url}\n{err}")
        body_text = ""
    return body_text

# GUI App
class NewsScraperApp:
    """Tkinter front end that lists RSS entries and exports the selected ones.

    Entries are read from every feed in ``FEEDS``, optionally filtered by a
    title keyword and a publication date, shown in a tree view, and can then
    be written to a plain-text or Word (.docx) file.

    Attributes:
        root: The Tk root window passed to the constructor.
        keyword_var: ``tk.StringVar`` bound to the keyword entry.
        date_var: ``tk.StringVar`` bound to the date entry (YYYY-MM-DD).
        tree: ``ttk.Treeview`` holding one row per fetched article.
        articles: List of article dicts, kept in the same order as the tree
            rows so a row index can be used to look up its article.
    """

    # str.translate table that removes C0 control characters other than tab,
    # newline and carriage return.
    # NOTE(review): python-docx is expected to raise ValueError (via lxml) for
    # text containing these characters -- confirm against the installed version.
    _CONTROL_CHARS = {code: None for code in range(32) if code not in (9, 10, 13)}

    def __init__(self, root):
        """Build the widgets inside *root* and start with an empty article list."""
        self.root = root
        self.root.title("News Scraper")
        self.root.geometry("850x600")

        # Controls
        self.keyword_var = tk.StringVar()
        self.date_var = tk.StringVar()

        tk.Label(root, text="Filter by keyword:").pack()
        tk.Entry(root, textvariable=self.keyword_var, width=50).pack()

        tk.Label(root, text="Filter by date (YYYY-MM-DD):").pack()
        tk.Entry(root, textvariable=self.date_var, width=20).pack()

        tk.Button(root, text="Fetch Articles", command=self.fetch_articles).pack(pady=5)

        self.tree = ttk.Treeview(root, columns=("Source", "Title"), show="headings", selectmode="extended")
        self.tree.heading("Source", text="Source")
        self.tree.heading("Title", text="Title")
        self.tree.pack(expand=True, fill='both')

        tk.Button(root, text="Save Selected to TXT", command=lambda: self.save_articles("txt")).pack(pady=2)
        tk.Button(root, text="Save Selected to DOCX", command=lambda: self.save_articles("docx")).pack(pady=2)

        self.articles = []

    @staticmethod
    def _entry_date(entry):
        """Return the entry's publication date, or None when it is unavailable.

        ``published_parsed`` may be absent or present-but-None; both cases,
        and values that cannot be turned into a date, yield None instead of
        raising.
        """
        parsed = getattr(entry, "published_parsed", None)
        if not parsed:
            return None
        try:
            return datetime(*parsed[:6]).date()
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _matches(title, entry_date, keyword, filter_date):
        """Tell whether an entry passes the date filter and the keyword filter.

        *keyword* must already be lower-cased; an empty keyword or a None
        *filter_date* disables the corresponding filter.
        """
        if filter_date and entry_date != filter_date:
            return False
        return not keyword or keyword in title.lower()

    @classmethod
    def _xml_safe(cls, text):
        """Return *text* without the control characters in ``_CONTROL_CHARS``."""
        return text.translate(cls._CONTROL_CHARS)

    @staticmethod
    def _write_txt(path, articles):
        """Write *articles* to *path* as UTF-8 text, one blank line between them."""
        with open(path, "w", encoding="utf-8") as f:
            for a in articles:
                f.write(f"{a['title']} ({a['source']}, {a['date']})\n{a['link']}\n{a['summary']}\n\n")

    @classmethod
    def _write_docx(cls, path, articles):
        """Write *articles* to *path* as a Word document, one heading per article."""
        doc = Document()
        for a in articles:
            # Title and body come from the network, so sanitise them first.
            doc.add_heading(cls._xml_safe(a['title']), level=2)
            doc.add_paragraph(f"Source: {a['source']}, Date: {a['date']}")
            doc.add_paragraph(f"Link: {a['link']}")
            doc.add_paragraph(cls._xml_safe(a['summary']))
            doc.add_paragraph("")
        doc.save(path)

    def fetch_articles(self):
        """Reload the article list from every feed in ``FEEDS``.

        Reads the keyword and date filters from the entry fields, clears the
        current list and tree, then adds every feed entry that passes both
        filters. The full text of each accepted entry is downloaded
        synchronously via ``get_full_article_content``; the feed's own
        summary is used when that returns nothing.

        Shows an error dialog and leaves the current list untouched when the
        date field is not empty and not in YYYY-MM-DD format.
        """
        # Strip so stray whitespace neither breaks date parsing nor defeats
        # the keyword match.
        keyword = self.keyword_var.get().strip().lower()
        date_str = self.date_var.get().strip()
        try:
            filter_date = datetime.strptime(date_str, "%Y-%m-%d").date() if date_str else None
        except ValueError:
            messagebox.showerror("Error", "Date must be in YYYY-MM-DD format.")
            return

        self.articles.clear()
        for i in self.tree.get_children():
            self.tree.delete(i)

        for source, url in FEEDS.items():
            feed = feedparser.parse(url)
            for entry in feed.entries:
                # Entries are not guaranteed to carry every field.
                title = getattr(entry, "title", "") or ""
                link = getattr(entry, "link", "") or ""
                entry_date = self._entry_date(entry)

                if not self._matches(title, entry_date, keyword, filter_date):
                    continue

                # Filter first, download second: only accepted entries cost
                # a network round trip.
                full_text = get_full_article_content(link) if link else ""
                summary = full_text or getattr(entry, "summary", "") or ""

                # Append and insert together so list index == tree row index.
                self.articles.append({
                    "source": source,
                    "title": title,
                    "link": link,
                    "summary": summary,
                    "date": str(entry_date) if entry_date else "Unknown"
                })
                self.tree.insert('', 'end', values=(source, title))

    def save_articles(self, filetype):
        """Export the rows selected in the tree to a file chosen by the user.

        Args:
            filetype: ``"txt"`` for a UTF-8 text file; any other value
                produces a Word (.docx) document.

        Shows a warning when nothing is selected, returns silently when the
        save dialog is cancelled, and shows an error dialog when the file
        cannot be written.
        """
        selected_items = self.tree.selection()
        if not selected_items:
            messagebox.showwarning("No Selection", "Please select at least one article.")
            return

        if filetype == "txt":
            ext, filetypes = "txt", [("Text files", "*.txt")]
        else:
            ext, filetypes = "docx", [("Word Document", "*.docx")]
        file = filedialog.asksaveasfilename(defaultextension=f".{ext}", filetypes=filetypes)
        if not file:
            return

        # Tree rows and self.articles are filled in lockstep by
        # fetch_articles, so a row's index is also its article's index.
        selected_articles = [self.articles[self.tree.index(i)] for i in selected_items]

        try:
            if filetype == "txt":
                self._write_txt(file, selected_articles)
            else:
                self._write_docx(file, selected_articles)
        except OSError as err:
            # e.g. permission denied or the target is open in another program.
            messagebox.showerror("Error", f"Could not save {file}:\n{err}")
            return

        messagebox.showinfo("Saved", f"{len(selected_articles)} articles saved to {file}.")

# Run the app
if __name__ == "__main__":
    # Create the top-level window, attach the scraper UI to it, and hand
    # control to Tk's event loop until the window is closed.
    main_window = tk.Tk()
    app = NewsScraperApp(main_window)
    main_window.mainloop()
