# Download news article links from RSS feeds into MongoDB

In [1]:
import feedparser
from datetime import date
import pymongo

# Open a connection to the local MongoDB server, then pick the database and
# collection that the scraped links are written to.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.mydatabase
collection = db.articles_by_source

# News source name -> list of RSS feed URLs published by that source.
# A source may expose more than one feed; every URL listed here is fetched.
news_feeds = {
    "France 24": [
        "https://www.france24.com/en/rss",
        "https://www.france24.com/en/europe/rss",
    ],
    "DW": [
        "https://rss.dw.com/rdf/rss-en-all",
    ],
    "SVT": [
        "https://www.svt.se/nyheter/rss.xml",
    ],
    "SvD": [
        "https://www.svd.se/?service=rss",
    ],
    "Technode": [
        "https://technode.com/feed/",
    ],
    "PingWest": [
        "https://en.pingwest.com/feed",
    ],
    "China Daily": [
        "http://www.chinadaily.com.cn/rss/china_rss.xml",
    ],
    "Xinhua News Agency": [
        "http://www.xinhuanet.com/english/rss/worldrss.xml",
        "http://www.xinhuanet.com/english/rss/chinarss.xml",
    ],
    "TechCrunch": [
        "https://techcrunch.com/feed/",
    ],
    "The Robot Report": [
        "https://www.therobotreport.com/feed/",
    ],
    "CNN": [
        "http://rss.cnn.com/rss/edition.rss",
    ],
    "VOA News": [
        "https://www.voanews.com/api/zqbomekvi_",
    ],
    "NPR": [
        "https://feeds.npr.org/500005/podcast.xml",
    ],
    "The Times of Israel": [
        "https://www.timesofisrael.com/feed/",
    ],
}

def get_articles_from_rss(rss_url):
    """Parse one RSS feed and return its entries as a list of plain dicts.

    Parameters
    ----------
    rss_url
        Feed location, passed unchanged to ``feedparser.parse``.

    Returns
    -------
    list of dict
        One dict per feed entry, in feed order, holding the keys
        ``"title"``, ``"link"`` and ``"published"``. Each value is whatever
        ``entry.get(<key>)`` yields for that entry.
    """
    wanted_fields = ("title", "link", "published")
    parsed = feedparser.parse(rss_url)
    return [
        {field: item.get(field) for field in wanted_fields}
        for item in parsed.entries
    ]

# Gather today's article links for every source and store them as a single
# MongoDB document shaped like {source: {"YYYY-MM-DD": [link, ...]}}.
articles_by_source = {}
current_date = str(date.today())

for source, feeds in news_feeds.items():
    articles = []
    for feed in feeds:
        feed_articles = get_articles_from_rss(feed)
        articles.extend(feed_articles)
    # Some sources list several feeds, and the same article can show up in
    # more than one of them. dict.fromkeys drops the repeats while keeping
    # the first-seen order; entries without a link are skipped.
    article_links = list(dict.fromkeys(
        article['link'] for article in articles if article['link'] is not None
    ))
    articles_by_source[source] = {current_date: article_links}

# insert the dictionary of articles into MongoDB
collection.insert_one(articles_by_source)

# print every document currently in the collection (this includes the one
# that was just inserted, plus anything stored by earlier runs)
documents = collection.find()
for document in documents:
    print(document)

{'_id': ObjectId('64278e35a5d578a35f5dc660'), 'source': 'France24', 'date': '2023-04-01', 'links': ['https://www.france24.com/en/europe/20230331-western-arms-supplies-no-guarantee-of-a-decisive-victory-for-ukraine', 'https://www.france24.com/en/americas/20230331-now-that-donald-trump-has-been-indicted-what-happens-next', 'https://www.france24.com/en/europe/20230331-%F0%9F%94%B4-live-one-year-on-ukraine-s-bucha-remembers-its-victims-and-tries-to-rebuild', 'https://www.france24.com/en/africa/20230331-south-africa-s-oscar-pistorius-denied-parole-a-decade-after-killing-girlfriend', 'https://www.france24.com/en/europe/20230331-italy-blocks-openai-s-chatgpt-opens-probe-over-privacy-failings', 'https://www.france24.com/en/sport/20230331-wimbledon-lifts-ban-on-russian-belarusian-players-to-compete-as-neutrals', 'https://www.france24.com/en/europe/20230330-turkey-s-parliament-ratifies-finland-s-application-to-join-nato', 'https://www.france24.com/en/americas/20230330-trump-indicted-on-criminal-

# Show the contents of the MongoDB collection

In [2]:
import pymongo
from pprint import pprint

# Open the local MongoDB instance and grab the collection that holds the
# scraped links.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.mydatabase
collection = db.articles_by_source

# An unfiltered find() gives a cursor over every stored document.
documents = collection.find()

# pprint lays each document out over several lines, which keeps the long
# link lists readable.
for document in documents:
    pprint(document)

{'_id': ObjectId('64278e35a5d578a35f5dc660'),
 'date': '2023-04-01',
 'links': ['https://www.france24.com/en/europe/20230331-western-arms-supplies-no-guarantee-of-a-decisive-victory-for-ukraine',
           'https://www.france24.com/en/americas/20230331-now-that-donald-trump-has-been-indicted-what-happens-next',
           'https://www.france24.com/en/europe/20230331-%F0%9F%94%B4-live-one-year-on-ukraine-s-bucha-remembers-its-victims-and-tries-to-rebuild',
           'https://www.france24.com/en/africa/20230331-south-africa-s-oscar-pistorius-denied-parole-a-decade-after-killing-girlfriend',
           'https://www.france24.com/en/europe/20230331-italy-blocks-openai-s-chatgpt-opens-probe-over-privacy-failings',
           'https://www.france24.com/en/sport/20230331-wimbledon-lifts-ban-on-russian-belarusian-players-to-compete-as-neutrals',
           'https://www.france24.com/en/europe/20230330-turkey-s-parliament-ratifies-finland-s-application-to-join-nato',
           'https://www.fran

# Serve the MongoDB collection as a Flask API

In [3]:
from flask import Flask, jsonify
from flask_pymongo import PyMongo

# Flask application bound to the local "mydatabase" MongoDB database through
# the Flask-PyMongo extension.
app = Flask(__name__)
app.config.update(MONGO_URI='mongodb://localhost:27017/mydatabase')
mongo = PyMongo(app)

@app.route('/articles')
def get_articles():
    """Return every document of the ``articles_by_source`` collection as JSON.

    Each document's ``_id`` (an ObjectId) is replaced by its string form so
    the resulting list of dictionaries can be handed to ``jsonify``.
    """
    stored = mongo.db.articles_by_source.find()
    # Re-using the '_id' key keeps it in its original position in each dict.
    payload = [{**doc, '_id': str(doc['_id'])} for doc in stored]
    return jsonify(payload)

# Start the server only when this code is executed directly. Debug mode is
# on and the auto-reloader is explicitly switched off.
if __name__ == '__main__':
    app.run(
        use_reloader=False,
        debug=True,
    )

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [01/Apr/2023 04:39:36] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [01/Apr/2023 04:39:39] "GET /articles HTTP/1.1" 200 -


# Delete documents that have an "articles" key

In [None]:
import pymongo

# Open the local MongoDB instance and pick the collection to clean up.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.mydatabase
collection = db.articles_by_source

# Remove every document that contains an "articles" field, then report how
# many documents were removed.
delete_result = collection.delete_many(
    {"articles": {"$exists": True}}
)
print(f"Deleted {delete_result.deleted_count} documents.")

Deleted 1 documents.


# Get all the keys of a document in the MongoDB collection

In [None]:
from pymongo import MongoClient

# Connect to the local MongoDB instance and select the collection to inspect.
client = MongoClient('mongodb://localhost:27017/')
db = client.mydatabase
collection = db.articles_by_source

# find_one() returns None when the collection holds no documents (for example
# after the delete or drop cells have been run), so check before reading keys.
document = collection.find_one()
if document is None:
    print("No documents found in the collection.")
else:
    print(document.keys())

dict_keys(['_id', 'France 24', 'DW', 'SVT', 'SvD', 'TASS', 'RT', 'Technode', 'PingWest', 'China Daily', 'Xinhua News Agency', 'TechCrunch', 'The Robot Report', 'CNN', 'VOA News', 'NPR', 'The Times of Israel'])


# Drop database

In [None]:
from pymongo import MongoClient

# Drops the whole "mydatabase" database on the local MongoDB server; this is
# the database that contains the articles_by_source collection.
client = MongoClient("mongodb://localhost:27017/")
client.drop_database("mydatabase")