In [12]:
import feedparser
from feedparser.util import FeedParserDict
import os
import requests
from urllib.parse import urlparse
import zipfile
import pandas as pd

## **EU CORDIS DATALOADER**
This notebook is designed to connect to specified RSS URLs from Cordis EU data. It enables the download of all CSV.zip files and the subsequent extraction of these files for further analysis.

## **TABLE OF CONTENTS**:
- [HOW TO USE](#1)
- [EXAMPLE OF RRS FEED CONTENTS](#2)
- [IMPORT DATA](#3)

## **HOW TO USE**
1. Run the entire notebook to automatically produce 3 folders with all the data from the 3 Cordis EU datasets:
   - eu_cordis_2007_2013
   - eu_cordis_2014_2020
   - eu_cordis_2021_2027


In [13]:
def get_rss_feed(rss_url: str) -> FeedParserDict:
    """
    Fetches and parses the RSS feed from the given URL.
    """
    print('Getting RSS feed...')
    feed = feedparser.parse(rss_url)
    title = feed['feed'].get('title', 'N/A')
    print(f'Feed title = {title}')
    return feed


def extract_enclosure_urls(feed):
    """
    Extracts and returns enclosure URLs from the RSS feed.
    """
    urls = []
    for entry in feed.get('entries', []):
        enclosures = entry.get('enclosures', [])
        for enclosure in enclosures:
            url = enclosure.get('href', 'N/A')
            if url != 'N/A':
                urls.append(url)
    return urls


def print_rss_feed(feed):
    """
    Prints the channel information and items from the RSS feed.
    """
    channel = feed['feed']
    title = channel.get('title', 'N/A')
    link = channel.get('link', 'N/A')
    description = channel.get('description', 'N/A')
    language = channel.get('language', 'N/A')
    pub_date = channel.get('published', 'N/A')

    print("Channel Information:")
    print(f"Title: {title}")
    print(f"Link: {link}")
    print(f"Description: {description}")
    print(f"Language: {language}")
    print(f"Publication Date: {pub_date}")

    print("\nItems:")

    # Extract and print items
    for entry in feed['entries']:
        item_title = entry.get('title', 'N/A')
        item_link = entry.get('link', 'N/A')
        item_description = entry.get('description', 'N/A')
        item_pub_date = entry.get('published', 'N/A')

        print(f"\nTitle: {item_title}")
        print(f"Link: {item_link}")
        print(f"Description: {item_description}")
        print(f"Publication Date: {item_pub_date}")

        # Extract enclosures
        enclosures = entry.get('enclosures', [])
        if enclosures:
            print("Enclosures:")
            for enclosure in enclosures:
                url = enclosure.get('href', 'N/A')
                file_type = enclosure.get('type', 'N/A')
                print(f" - URL: {url}, Type: {file_type}")


def download_csv_zip_files(urls, save_dir):
    """
    Downloads CSV ZIP files from the given URLs to the specified directory.
    """
    os.makedirs(save_dir, exist_ok=True)

    downloaded_files = []
    for url in urls:
        if url.endswith('csv.zip'):
            filename = os.path.basename(urlparse(url).path)
            save_path = os.path.join(save_dir, filename)

            print(f"Downloading {url} to {save_path}")
            response = requests.get(url)
            if response.status_code == 200:
                with open(save_path, 'wb') as f:
                    f.write(response.content)
                downloaded_files.append(save_path)
            else:
                print(f"Failed to download {url}")
    return downloaded_files


def extract_zip_files(zip_files, extract_to):
    """
    Extracts the contents of ZIP files to the specified directory.
    """
    os.makedirs(extract_to, exist_ok=True)

    for zip_file in zip_files:
        print(f"Extracting {zip_file} to {extract_to}")
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_to)


def download_dataset_from_rss(SAVE_DIRECTORY, RSS_URL):
    """
    Downloads and extracts dataset files from an RSS feed to the specified directory.
    """
    feed = get_rss_feed(RSS_URL)
    enclosure_urls = extract_enclosure_urls(feed)
    downloaded_files = download_csv_zip_files(enclosure_urls, SAVE_DIRECTORY)
    extract_directory = os.path.join(SAVE_DIRECTORY, 'files')
    extract_zip_files(downloaded_files, extract_directory)
    print(f'Succesfully downloaded and extracted files from RSS_URL: {RSS_URL}')

## **EXAMPLE OF RRS FEED CONTENTS** <a class="anchor" id="2"></a>

In [14]:
# Test RSS_URL by printing the information in the RSS feed
RSS_URL = 'https://data.europa.eu/api/hub/search/en/feeds/datasets/cordis-eu-research-projects-under-horizon-europe-2021-2027.rss'
feed = get_rss_feed(RSS_URL)
print_rss_feed(feed)

Getting RSS feed...
Feed title = CORDIS - EU research projects under HORIZON EUROPE (2021-2027) - RSS Feed
Channel Information:
Title: CORDIS - EU research projects under HORIZON EUROPE (2021-2027) - RSS Feed
Link: http://data.europa.eu/data/datasets/cordis-eu-research-projects-under-horizon-europe-2021-2027
Description: n/a
Language: en
Publication Date: Mon, 24 Jun 2024 06:42:04 +0000

Items:

Title: CORDIS - EU research projects under HORIZON EUROPE (2021-2027)
Link: http://data.europa.eu/api/hub/search/datasets/cordis-eu-research-projects-under-horizon-europe-2021-2027
Description: This dataset contains information about projects and their results funded by the European Union under the Horizon Europe framework programme for research and innovation from 2021 to 2027.
   
The dataset is composed of six (6) different sub-set (in different formats):

- HORIZON projects – which includes participating organisations, legal basis information, topic information, project URLs and classificat

## **IMPORT OF DATA** <a class="anchor" id="3"></a>

In [15]:
# Download files from eu_cordis_2007_2013 RRS feed
SAVE_DIRECTORY = '../data/eu_cordis_2007_2013'
RSS_URL = 'https://data.europa.eu/api/hub/search/nl/feeds/datasets/cordisfp7projects.rss'

download_dataset_from_rss(SAVE_DIRECTORY, RSS_URL)

Getting RSS feed...
Feed title = CORDIS — EU-onderzoeksprojecten in het kader van KP7 (2007-2013) - RSS Feed
Downloading https://cordis.europa.eu/data/cordis-fp7projects-csv.zip to ../data/eu_cordis_2007_2013/cordis-fp7projects-csv.zip
Downloading https://cordis.europa.eu/data/cordis-fp7reports-csv.zip to ../data/eu_cordis_2007_2013/cordis-fp7reports-csv.zip
Extracting ../data/eu_cordis_2007_2013/cordis-fp7projects-csv.zip to ../data/eu_cordis_2007_2013/files
Extracting ../data/eu_cordis_2007_2013/cordis-fp7reports-csv.zip to ../data/eu_cordis_2007_2013/files
Succesfully downloaded and extracted files from RSS_URL: https://data.europa.eu/api/hub/search/nl/feeds/datasets/cordisfp7projects.rss


In [16]:
# Download files from eu_cordis_2014_2020 RRS feed
SAVE_DIRECTORY = '../data/eu_cordis_2014_2020'
RSS_URL = 'https://data.europa.eu/api/hub/search/en/feeds/datasets/cordish2020projects.rss'

download_dataset_from_rss(SAVE_DIRECTORY, RSS_URL)

Getting RSS feed...
Feed title = CORDIS - EU research projects under Horizon 2020 (2014-2020) - RSS Feed
Downloading https://cordis.europa.eu/data/cordis-h2020projectDeliverables-csv.zip to ../data/eu_cordis_2014_2020/cordis-h2020projectDeliverables-csv.zip
Downloading https://cordis.europa.eu/data/cordis-h2020reports-csv.zip to ../data/eu_cordis_2014_2020/cordis-h2020reports-csv.zip
Downloading https://cordis.europa.eu/data/cordis-h2020projects-csv.zip to ../data/eu_cordis_2014_2020/cordis-h2020projects-csv.zip
Downloading https://cordis.europa.eu/data/cordis-h2020projectPublications-csv.zip to ../data/eu_cordis_2014_2020/cordis-h2020projectPublications-csv.zip
Downloading https://cordis.europa.eu/data/cordis-h2020projectDeliverables-csv.zip to ../data/eu_cordis_2014_2020/cordis-h2020projectDeliverables-csv.zip
Downloading https://cordis.europa.eu/data/cordis-h2020reports-csv.zip to ../data/eu_cordis_2014_2020/cordis-h2020reports-csv.zip
Downloading https://cordis.europa.eu/data/cordi

In [17]:
# Download files from eu_cordis_2021_2027 RRS feed
SAVE_DIRECTORY = '../data/eu_cordis_2021_2027'
RSS_URL = 'https://data.europa.eu/api/hub/search/en/feeds/datasets/cordis-eu-research-projects-under-horizon-europe-2021-2027.rss'

download_dataset_from_rss(SAVE_DIRECTORY, RSS_URL)

Getting RSS feed...
Feed title = CORDIS - EU research projects under HORIZON EUROPE (2021-2027) - RSS Feed
Downloading https://cordis.europa.eu/data/cordis-HORIZONprojects-csv.zip to ../data/eu_cordis_2021_2027/cordis-HORIZONprojects-csv.zip
Downloading https://cordis.europa.eu/data/cordis-HORIZONreports-csv.zip to ../data/eu_cordis_2021_2027/cordis-HORIZONreports-csv.zip
Downloading https://cordis.europa.eu/data/cordis-HORIZONprojectDeliverables-csv.zip to ../data/eu_cordis_2021_2027/cordis-HORIZONprojectDeliverables-csv.zip
Downloading https://cordis.europa.eu/data/cordis-HORIZONprojectPublications-csv.zip to ../data/eu_cordis_2021_2027/cordis-HORIZONprojectPublications-csv.zip
Downloading https://cordis.europa.eu/data/cordis-HORIZONprojects-csv.zip to ../data/eu_cordis_2021_2027/cordis-HORIZONprojects-csv.zip
Downloading https://cordis.europa.eu/data/cordis-HORIZONreports-csv.zip to ../data/eu_cordis_2021_2027/cordis-HORIZONreports-csv.zip
Downloading https://cordis.europa.eu/data/c