In [3]:
import requests
import pandas as pd
import os
from bs4 import BeautifulSoup
from urllib import parse

# Column layout of the archive CSV; one row per feed item.
cols = ['title', 'author', 'pubDate', 'permalink', 'guid', 'description', 'content']

def init_archive():
    """Create an empty archive CSV that contains only the header row.

    Writes ``data/channel-futures.csv`` (creating ``data/`` if needed).
    Any existing archive at that path is overwritten.
    """
    df = pd.DataFrame(columns=cols)
    os.makedirs("data", exist_ok=True)
    # index=False keeps the file layout identical to save_csv_archive();
    # without it the index is written as an unnamed first column that
    # read_csv loads back as a spurious "Unnamed: 0" column.
    df.to_csv("data/channel-futures.csv", index=False)

def load_csv_archive():
    """Read the archive CSV at ``data/channel-futures.csv`` into a DataFrame."""
    archive = pd.read_csv(filepath_or_buffer="data/channel-futures.csv")
    return archive

def save_csv_archive(df):
    """Write *df* to ``data/channel-futures.csv`` without the index column.

    The ``data`` directory is created first when it does not exist yet.
    """
    archive_dir = "data"
    archive_path = "data/channel-futures.csv"
    os.makedirs(archive_dir, exist_ok=True)
    df.to_csv(path_or_buf=archive_path, index=False)

def get_source(url, timeout=30):
    """Fetch *url* with an HTTP GET request.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The ``requests.Response`` (whatever its HTTP status), or ``None``
        when the request itself fails; the error is printed in that case.
    """
    try:
        return requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort fetch: report the failure and let the caller decide.
        print(e)
        return None

def get_feed(url, existing_df):
    """Download the RSS feed at *url* and return the items not yet archived.

    Args:
        url: Address of the RSS feed.
        existing_df: Archive DataFrame; its ``guid`` column holds the ids of
            the items that are already stored.

    Returns:
        A DataFrame with the columns listed in ``cols``, one row per feed
        item whose guid is not present in ``existing_df``. The frame is empty
        when the download failed or the feed contains nothing new.

    Raises:
        KeyError: If an item's guid URL has no ``p`` query parameter.
        AttributeError: If an item lacks one of the expected child elements.
    """
    response = get_source(url)
    if response is None:
        # get_source() already reported the error; there is nothing to parse.
        return pd.DataFrame(columns=cols)

    # Build the lookup once instead of scanning the column for every item.
    known_guids = set(existing_df['guid'].tolist())
    rows = []
    with response as r:
        soup = BeautifulSoup(r.content, features='xml')
        for item in soup.find_all('item'):
            permalink = item.find("guid").text
            # The post id is the "p" query parameter of the guid URL.
            guid = parse.parse_qs(parse.urlparse(permalink).query)['p'][0]
            if int(guid) in known_guids:
                continue
            # Remember it so a guid repeated within this feed is added once.
            known_guids.add(int(guid))
            rows.append({
                "title": item.find("title").text,
                "author": item.find("dc:creator").text,
                "pubDate": pd.Timestamp(item.find("pubDate").text),
                "permalink": permalink,
                "guid": guid,
                "description": item.find("description").text,
                "content": item.find("content:encoded").text,
            })

    # One construction at the end avoids re-copying the frame per item.
    return pd.DataFrame(rows, columns=cols)

# Create an empty archive on the first run so that loading it cannot fail
# with FileNotFoundError.
if not os.path.exists("data/channel-futures.csv"):
    init_archive()

existing_df = load_csv_archive()
existing_entries_count = existing_df.shape[0]
print(f"Existing Entries Count: {existing_entries_count}")
url = "https://www.channelfutures.com/feed"
new_entry_count = 0
df = get_feed(url, existing_df)

# Only rewrite the archive when the feed produced rows that are not stored yet.
if not df.empty:
    new_entry_count = df.shape[0]
    new_archive = pd.concat([existing_df, df], ignore_index=True)
    save_csv_archive(new_archive)

print(f"Added {new_entry_count} Entries")

Existing Entries Count: 51
Added 4 Entries
