In [4]:
# perform a GET http request to this website https://web.archive.org/web/20240102004317/https://www.ilpost.it/italia/
import requests
from bs4 import BeautifulSoup
import time

url = 'https://web.archive.org/web/20240102004317/https://www.ilpost.it/italia/'

response = requests.get(url)

In [5]:
response.content

b'<!DOCTYPE html><html lang="it"><head><script type="text/javascript" src="https://web-static.archive.org/_static/js/bundle-playback.js?v=t1Bf4PY_" charset="utf-8"></script>\n<script type="text/javascript" src="https://web-static.archive.org/_static/js/wombat.js?v=txqj7nKC" charset="utf-8"></script>\n<script>window.RufflePlayer=window.RufflePlayer||{};window.RufflePlayer.config={"autoplay":"on","unmuteOverlay":"hidden"};</script>\n<script type="text/javascript" src="https://web-static.archive.org/_static/js/ruffle/ruffle.js"></script>\n<script type="text/javascript">\n  __wm.init("https://web.archive.org/web");\n  __wm.wombat("https://www.ilpost.it/italia/","20240102004317","https://web.archive.org/","web","https://web-static.archive.org/_static/",\n\t      "1704156197");\n</script>\n<link rel="stylesheet" type="text/css" href="https://web-static.archive.org/_static/css/banner-styles.css?v=S1zqJCYt" />\n<link rel="stylesheet" type="text/css" href="https://web-static.archive.org/_static

In [6]:
class Article:
    def __init__(self, title, link, domain):
        self.title = title
        self.link = link
        self.domain = domain
        self.date = None
        
        # Content
        self.title = None
        self.subtitle = None
        self.content = None
        self.tags = []
        
        self.extract_date()
        self.extract_content()
    
    def extract_content(self):
        if self.domain == "www.ilpost.it":
            response = requests.get(self.link)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract title
            title = soup.find('h1').text
            self.title = title

            # Extract subtitle
            subtitle = soup.find('h2').text
            self.subtitle = subtitle

            # Extract article content
            article_content = soup.find('div', {'id': 'singleBody'}).get_text(strip=True)
            self.content = article_content
            
            # Extract tags
            tags_div = soup.find('div', {'class': 'index_art_tag__pP6B_'})
            tags = [a.text for a in tags_div.find_all('a')]
            self.tags = tags

            time.sleep(1)
            
    def extract_date(self):
        # extract date from the link: https://web.archive.org/web/20240102004317/https://www.ilpost.it/2024/01/01/fine-reddito-di-cittadinanza/
        date = self.link.split('/')[4]
        self.date = date

    def __str__(self):
        return f"Title: {self.title}\nLink: {self.link}\nDomain: {self.domain}\nDate: {self.date}\nSubtitle: {self.subtitle}\nContent: {self.content}\nTags: {self.tags}"

In [7]:
parsed_articles = []

soup = BeautifulSoup(response.content, 'html.parser')

articles = soup.find_all('article', {'class': '_taxonomy-item_q6jgq_1 _opener_q6jgq_14'})

for article in articles:
    # TODO: remove this condition
    if len(parsed_articles) > 2:
        break
    
    title = article.find('h2', {'class': '_article-title_1aaqi_4'}).text
    link = article.find('a')['href']
    parsed_acrticle = Article(title, link, "www.ilpost.it")
    # print(parsed_acrticle.__str__() + "\n\n")
    parsed_articles.append(parsed_acrticle)
    

In [8]:
parsed_articles[1].title

'Il sequestro incomprensibile e il feroce omicidio di un bambino di 17 mesi'

In [9]:
# trasnform the parsed articles into a pandas dataframe
import pandas as pd

data = {
    'title': [article.title for article in parsed_articles],
    'link': [article.link for article in parsed_articles],
    'domain': [article.domain for article in parsed_articles],
    'date': [article.date for article in parsed_articles],
    'subtitle': [article.subtitle for article in parsed_articles],
    'content': [article.content for article in parsed_articles],
    'tags': [article.tags for article in parsed_articles]
}

df = pd.DataFrame(data)
df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,title,link,domain,date,subtitle,content,tags
0,È finito il reddito di cittadinanza,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Fu introdotto nel 2019 dal primo governo Conte...,Caricamento playerIl 1° gennaio del 2024 è ent...,"[Assegno di inclusione, reddito di cittadinanza]"
1,Il sequestro incomprensibile e il feroce omici...,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Tommaso Onofri venne portato via da casa sua i...,Il 2 marzo 2006 due persone con il volto coper...,"[casalbaroncolo, parma, Tommaso Onofri]"
2,Cosa ha detto Sergio Mattarella nel suo discor...,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Il presidente della Repubblica ha parlato dell...,Nella serata del 31 dicembre il presidente del...,"[discorso fine anno, discorso mattarella, serg..."


# Link same news:
## LLM generalize the notice given all the titles for each source (e.g. war in Romania);

### LLM Class setup

In [10]:
class Prompt:
    """
    A class used to represent a Prompt

    Attributes
    ----------
    boilerplate : dict
        a dictionary representing the boilerplate for the prompt

    Methods
    -------
    __init__(self, content: str)
        Initializes the Prompt object with the given content.
    """

    def __init__(self, content: str) -> None:
        """
        Parameters
        ----------
        content : str
            The content of the prompt
        """
        self.content = content

        # if self.boilerplate is None:
        self.boilerplate = {
            "contents": [{
                "parts": [{
                    "text": content
                }]
            }],
            "generationConfig": {
                "temperature": 0
            }
        }

    def get_prompt(self) -> str:
        """
        Returns the content of the prompt.

        Returns
        -------
        str
            The content of the prompt
        """
        if self.boilerplate is None:
            return None
        if self.content is None:
            raise ValueError("Prompt content is None")

        return self.boilerplate

In [11]:
import requests
from typing import Dict, Any, Optional, List

class LLM:
    """
    A class used to represent a LLM

    ...

    Attributes
    ----------
    endpoint : str
        the endpoint for the LLM
    endpoint_with_api_key : str
        the endpoint for the LLM with the API key
    headers : dict
        the headers for the LLM

    Methods
    -------
    __init__(self, endpoint: str, method: str, headers: Optional[dict] = None, api_key: Optional[str] = None)
        Initializes the LLM object with the given endpoint, method, headers, and API key.
    generate(self, prompt: str) -> Optional[dict]
        Generates a response from the LLM given a prompt.
    """

    def __init__(self, endpoint: str = "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?", headers: Optional[dict] = {
        "Content-Type": "application/json"
    }, api_key: Optional[str] = None) -> None:
        """
        Parameters
        ----------
        endpoint : str
            The endpoint for the LLM
        headers : Optional[dict]
            The headers for the LLM
        api_key : Optional[str]
            The API key for the LLM
        """
        self.endpoint = endpoint
        self.api_key = api_key
        self.endpoint_with_api_key = endpoint + "key=" + self.api_key # Specific for Gemini
        self.headers = headers

    def generate(self, prompt: Prompt) -> Optional[dict]:
        """
        Generates a response from the LLM given a prompt.

        Parameters
        ----------
        prompt : str
            The prompt for the LLM

        Returns
        -------
        dict
            The response from the LLM
        """
        try:
            response = requests.post(self.endpoint_with_api_key, headers=self.headers, json=prompt.get_prompt())
            return response.json()
        except Exception as e:
            print(e)
            return None

    def get_endpoint_with_api_key(self) -> str:
        """
        Returns the endpoint with the API key.

        Returns
        -------
        str
            The endpoint with the API key
        """
        return self.endpoint_with_api_key

In [12]:
def get_text_from_response(response_dict: dict) -> List[str]:
    """
    Extracts the generated text from the JSON response of the generative language API.

    Parameters
    ----------
    response_dict (dict): The JSON response as a dict.

    Returns
    -------
    texts (List[str]): A list of generated texts from the response.
    """
    # cover the case where the dict is None or empty
    if not response_dict:
        return [""]

    candidates = response_dict.get("candidates", [])

    # Initialize an empty list to store the texts
    texts = []

    for candidate in candidates:

        content = candidate.get("content", {})

        parts = content.get("parts", [])
        # Get the first part from the list (assuming there is only one part)
        part = parts[0] if parts else {}

        text = part.get("text", "")
        texts.append(text)

    if not texts:
        return [""]
    # Return the list of texts
    return texts

In [13]:
import os

os.environ['API_KEY'] = 'AIzaSyAqQRlPN9fXqmAKoZH--hSe72pL8irdJ6o'
gemini = LLM(api_key=os.environ['API_KEY'])

In [14]:
for index, row in df.iterrows():
    # Create a prompt for the title
    generalize_prompt = Prompt(f"""Given a notice title, summarize and generalize the content in a few words.

    Example:
    Input: Cosa ha detto Sergio Mattarella nel suo discorso di fine anno
    Output: Discorso fine anno, Sergio Mattarella

    Input: {row.title}""")
    
    # Generalizing the title with the LLM
    output_json = gemini.generate(generalize_prompt)
    output_text = get_text_from_response(output_json)[0]

    # Append the generalized title to the dataframe
    df.at[index, 'generalized_title'] = output_text if output_text else None

In [15]:
df

Unnamed: 0,title,link,domain,date,subtitle,content,tags,generalized_title
0,È finito il reddito di cittadinanza,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Fu introdotto nel 2019 dal primo governo Conte...,Caricamento playerIl 1° gennaio del 2024 è ent...,"[Assegno di inclusione, reddito di cittadinanza]",
1,Il sequestro incomprensibile e il feroce omici...,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Tommaso Onofri venne portato via da casa sua i...,Il 2 marzo 2006 due persone con il volto coper...,"[casalbaroncolo, parma, Tommaso Onofri]",
2,Cosa ha detto Sergio Mattarella nel suo discor...,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Il presidente della Repubblica ha parlato dell...,Nella serata del 31 dicembre il presidente del...,"[discorso fine anno, discorso mattarella, serg...",


In [18]:
# add id column to the dataframe
df['id'] = range(1, 1 + len(df))
df

# move the id column to the first position
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
df

Unnamed: 0,id,title,link,domain,date,subtitle,content,tags,generalized_title
0,1,È finito il reddito di cittadinanza,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Fu introdotto nel 2019 dal primo governo Conte...,Caricamento playerIl 1° gennaio del 2024 è ent...,"[Assegno di inclusione, reddito di cittadinanza]",
1,2,Il sequestro incomprensibile e il feroce omici...,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Tommaso Onofri venne portato via da casa sua i...,Il 2 marzo 2006 due persone con il volto coper...,"[casalbaroncolo, parma, Tommaso Onofri]",
2,3,Cosa ha detto Sergio Mattarella nel suo discor...,https://web.archive.org/web/20240102004317/htt...,www.ilpost.it,20240102004317,Il presidente della Repubblica ha parlato dell...,Nella serata del 31 dicembre il presidente del...,"[discorso fine anno, discorso mattarella, serg...",


## LLM link all the notices related to the general title.

In [17]:
Prompt(f"""Given a list of notice titles, group together all the notice titles that refer to the same notice.

Example:
Input:
<root>
  <row>
    <id>1</id>
    <text>Cosa ha detto Sergio Mattarella nel suo discorso di fine anno</text>
  </row>
  <row>
    <id>4</id>
    <text>Mattarella e il discorso di fine anno 2023</text>
  </row>
  <row>
    <id>2</id>
    <text>È finito il reddito di cittadinanza</text>
  </row>
</root>

Output:
<root>
  <row>
    <group>1</group>
    <title>Refers to Sergio Mattarella’s end-of-year speech</title>
    <notices>
      <id>1</id>
      <text>Cosa ha detto Sergio Mattarella nel suo discorso di fine anno</text>
    </notices>
    <notices>
      <id>4</id>
      <text>Mattarella e il discorso di fine anno 2023</text>
    </notices>
  </row>
  <row>
    <group>2</group>
    <title>Refers to the end of the citizenship income</title>
    <notices>
      <id>2</id>
      <text>È finito il reddito di cittadinanza</text>
    </notices>
  </row>
</root>

    Input: {[row.id + row.title]}""")

{
{id: 1, text: "Cosa ha detto Sergio Mattarella nel suo discorso di fine anno"},
{id: 4, text: "Mattarella e il discorso di fine anno 2023"},
{id: 2, text: "È finito il reddito di cittadinanza"},
{id: 12, text: "Mattarella pronto al discorso di fine anno"},
{id: 7, text: "Lanciato Odysseus, Il nuovo lander dei privati diretto alla Luna"},
{id: 73, text: "Messaggio di Fine Anno del Presidente della Repubblica"}
}

SyntaxError: invalid syntax (3284119782.py, line 9)

# Storage: inserting dataset in a NoSQL DB (TinyDB)

In [19]:
!pip install tinydb

Collecting tinydb
  Downloading tinydb-4.8.0-py3-none-any.whl.metadata (6.2 kB)
Downloading tinydb-4.8.0-py3-none-any.whl (24 kB)
Installing collected packages: tinydb
Successfully installed tinydb-4.8.0


In [20]:
from tinydb import TinyDB, Query

db = TinyDB('db.json')

In [23]:
db.insert({'tysadasde': 'apple', 'count': 7})

2

In [24]:
db.all()

[{'type': 'apple', 'count': 7}, {'tysadasde': 'apple', 'count': 7}]