In [None]:
import requests
import bs4
from datetime import datetime, timedelta
# from dotenv import load_dotenv
import psycopg
import psycopg_binary
# import os
import streamlit as st

# Web Scraping 

### Use the `requests` library to make an HTTP request to a webpage (example)

In [None]:
def scrape_data(url="http://www.example.com", timeout=10):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch. Defaults to the example site.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The parsed ``bs4.BeautifulSoup`` document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    print(soup)
    return soup

# Fetch and parse the example page; `data` holds the resulting BeautifulSoup tree.
data = scrape_data()


### Request the U.Today search results for "bitcoin"

In [None]:
def scrape_data(url='https://u.today/search/node?keys=bitcoin', timeout=10):
    """Download a U.Today search page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch. Defaults to the U.Today search
            results for the keyword "bitcoin".
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The parsed ``bs4.BeautifulSoup`` document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status; otherwise an error page would be parsed
    # and the later cells would just report that no articles were found.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    print(soup)
    return soup

# Fetch the U.Today search page. This rebinds `data`, replacing the example
# page parsed earlier; every later cell reads articles from this tree.
data = scrape_data()

### Get a list of articles with dates

#### Get today's date

In [None]:
# Current local date rendered as abbreviated month, day and year, comma-separated.
today = format(datetime.today(), '%b,%d,%Y')
today

#### Print all the articles on the page

In [None]:
# Every article teaser on the search page is a <div class="news__item">.
articles = data.find_all("div", attrs={"class": "news__item"})

results = []  # (date text, title) pairs in page order
dates = []    # date text only, parallel to `results`
for article in articles:
    title_div = article.find("div", attrs={"class": "news__item-title"})
    date_div = article.find("div", attrs={"class": "humble"})

    # Keep only teasers that expose both a title and a date.
    if title_div and date_div:
        title = title_div.get_text(strip=True)
        date = date_div.get_text(strip=True)
        results.append((date, title))
        dates.append(date)


def _chronological_key(date_text):
    """Return a sortable datetime for a scraped date string.

    The dates here are plain text, so comparing them directly would order
    them alphabetically (e.g. "Sep ..." after "Jan ..." regardless of year).
    Text that does not match the expected layout sorts before everything else.
    """
    try:
        return datetime.strptime(date_text, "%b %d, %Y - %H:%M")
    except ValueError:
        return datetime.min


if results:
    # Still the original date text, but chosen by real date order.
    latest_date = max(dates, key=_chronological_key)
    for date, title in results:
        print(f"{date} – {title}")
else:
    # This cell lists every article, so report the empty case once and
    # without the misleading "for today" wording.
    print("No articles found")

In [None]:
# Counts every `news__item` block matched on the page; this can exceed
# len(results), because entries lacking a title or date were skipped.
print(f"Found {len(articles)} articles")

#### Print articles for today or the latest possible date

In [None]:
# Collect (datetime, title) pairs for every article teaser on the page.
articles = data.find_all("div", attrs={"class": "news__item"})

results = []  # (datetime, title) pairs in page order
dates = []    # datetimes only, parallel to `results`

for article in articles:
    title_div = article.find("div", attrs={"class": "news__item-title"})
    date_div = article.find("div", attrs={"class": "humble"})

    # Skip teasers that lack a title or a date.
    if not (title_div and date_div):
        continue

    title = title_div.get_text(strip=True)
    date = date_div.get_text(strip=True)
    try:
        # Parse the date text right away so later filtering compares real
        # datetimes rather than strings.
        date_obj = datetime.strptime(date, "%b %d, %Y - %H:%M")
    except ValueError:
        # Show the offending text so a changed site format is easy to spot.
        print(f"Skipping the article due to unrecognized date format: {date!r}")
        continue
    results.append((date_obj, title))
    dates.append(date_obj)

# datetime.today() is the current local date and time; .date() keeps only the
# year, month and day. Computed unconditionally so `today` is always a date,
# even when no article could be parsed.
today = datetime.today().date()

# Articles whose publication day is today (empty when nothing was parsed).
todays_articles = [(d, t) for d, t in results if d.date() == today]

if not dates:
    # Nothing parsed: stop here instead of calling max() on an empty list.
    print("No articles found")
elif todays_articles:
    print("Articles from today:")
    for d, t in todays_articles:
        print(f"{d.strftime('%b %d, %Y - %H:%M')} – {t}")
else:
    # Fallback to the most recent day that has any article.
    latest_date = max(dates)
    recent_articles = [(d, t) for d, t in results if d.date() == latest_date.date()]
    print(f"No articles from today. Showing most recent articles from {latest_date.strftime('%b %d, %Y')}:")
    for d, t in recent_articles:
        print(f"{d.strftime('%b %d, %Y - %H:%M')} – {t}")

### Filter the last 6 months

In [None]:
# Cut-off date for the "last six months" filter, approximated as 6 * 30 days.
# Computed from the clock directly rather than from the `today` variable,
# which is a formatted string until an earlier cell rebinds it to a date.
last_six_months = datetime.today().date() - timedelta(days=6 * 30)

#### Keep only the articles with date not older than 6 months

In [None]:
# Keep the (datetime, title) pairs published on or after the cut-off date.
last_6months_articles = [
    entry
    for entry in results
    if entry[0].date() >= last_six_months
]

## Store data in an SQL database on Supabase

#### Load the database connection string from Streamlit secrets

In [None]:
# Connection string for the database, read from Streamlit's secrets store
# under the key "DBCONN"; it is passed to psycopg.connect() further down.
# Alternative kept for reference: load it from a .env file instead. This needs
# the commented-out `dotenv` and `os` imports in the first cell re-enabled,
# and note that it reads a differently named key ("DB_CONN").
# load_dotenv()
# dbconn = os.getenv("DB_CONN")
dbconn = st.secrets["DBCONN"]

#### Create a table

In [None]:
# import psycopg

# def create_table():
#     conn = psycopg.connect(dbconn)
#     cur = conn.cursor()
#     cur.execute(
#         '''
#             CREATE TABLE IF NOT EXISTS articles(
#                 date TIMESTAMP PRIMARY KEY,
#                 title TEXT
#             );
#         '''
#     )
#     conn.commit()
#     cur.close()
#     conn.close()

# create_table()


#### Insert data into the table

In [None]:
# Open the connection and cursor as context managers so both are released
# even if the insert raises. With psycopg 3, leaving the connection block
# commits on success, rolls back on an exception, and closes the connection.
with psycopg.connect(dbconn) as conn:
    with conn.cursor() as cur:
        # executemany runs the statement once per (date, title) tuple.
        # ON CONFLICT skips rows whose `date` value is already stored, so
        # re-running the notebook does not fail on duplicates.
        cur.executemany(
            '''
                INSERT INTO articles(date, title)
                VALUES (%s, %s)
                ON CONFLICT (date) DO NOTHING;
            ''',
            last_6months_articles
        )

In [None]:
# def drop_table():
#     conn = psycopg.connect(dbconn)
#     cur = conn.cursor()
#     cur.execute("DROP TABLE IF EXISTS articles;")
#     conn.commit()
#     cur.close()
#     conn.close()

In [None]:
# drop_table()