In [2]:
import requests
from bs4 import BeautifulSoup

In [5]:
response = requests.get("https://www.lrt.lt/en/news-in-english")
doc = BeautifulSoup(response.content)

In [6]:
items = doc.select(".news")
len(items)

42

In [10]:
articles = []
for item in items:
    url = item.select_one('h3').text
    headline = item.select_one('a')['href']
    img = item.select_one('img').get('data-src', None)
    article = {
        'url': url,
        'headline': headline,
        'img_path': img
    }
    articles.append(article)
len(articles)

42

In [11]:
import pandas as pd

df = pd.DataFrame(articles)
df.head()

Unnamed: 0,url,headline,img_path
0,Centuries-old wooden house in Vilnius shines a...,/en/news-in-english/19/2049374/centuries-old-w...,/img/2022/01/25/1181710-335298-150x84.jpg
1,Lithuania to get its first Michelin star? Rest...,/en/news-in-english/19/2050454/lithuania-to-ge...,/img/2020/08/28/711416-414866-150x84.jpg
2,Lithuanian PM voices confidence in defence min...,/en/news-in-english/19/2050571/lithuanian-pm-v...,/img/2023/02/18/1451044-637891-150x84.jpg
3,"Lithuania deems 1,164 Belarusian and Russian n...",/en/news-in-english/19/2050524/lithuania-deems...,/img/2022/03/01/1207094-733403-150x84.jpg
4,Vilnius ex-mayor Šimašius returns to private s...,/en/news-in-english/19/2050519/vilnius-ex-mayo...,/img/2023/04/17/1491828-404400-150x84.jpg


In [13]:
#approach 1: simple single csv file
df.to_csv("current_headlines.csv", index=False)

In [14]:
#approach 2: save a different file for each new scrape
#NOTE: should use this only for daily scrape, 
#not multiple times a day
#create a folder called "data"

import os
os.makedirs("data", exist_ok=True)

In [18]:
#naming the file name with the date it was scraped
#so it's easier to keep track + browse

from datetime import datetime
date_string = datetime.now().strftime("%Y-%m-%d")
filepath = f"data/{date_string}.csv"
filepath
df.to_csv(filepath, index=False)

In [19]:
#approach 3: appending to existing csv

df['scrape_date'] = datetime.now().strftime("%Y-%m-%d")
df.head()

Unnamed: 0,url,headline,img_path,scrape_date
0,Centuries-old wooden house in Vilnius shines a...,/en/news-in-english/19/2049374/centuries-old-w...,/img/2022/01/25/1181710-335298-150x84.jpg,2023-08-05
1,Lithuania to get its first Michelin star? Rest...,/en/news-in-english/19/2050454/lithuania-to-ge...,/img/2020/08/28/711416-414866-150x84.jpg,2023-08-05
2,Lithuanian PM voices confidence in defence min...,/en/news-in-english/19/2050571/lithuanian-pm-v...,/img/2023/02/18/1451044-637891-150x84.jpg,2023-08-05
3,"Lithuania deems 1,164 Belarusian and Russian n...",/en/news-in-english/19/2050524/lithuania-deems...,/img/2022/03/01/1207094-733403-150x84.jpg,2023-08-05
4,Vilnius ex-mayor Šimašius returns to private s...,/en/news-in-english/19/2050519/vilnius-ex-mayo...,/img/2023/04/17/1491828-404400-150x84.jpg,2023-08-05


In [25]:
#if it exists, open it
#if it doesn't exist, create a blank dataframe
try: 
    existing_df = pd.read_csv("always-updated.csv")
except:
    existing_df = pd.DataFrame([])
existing_df.head()

In [27]:
#combine old and new dataframe

combined = pd.concat([df,existing_df], ignore_index=True)
combined.to_csv("always-updated.csv", index=False)