In [3]:
! pip install beautifulsoup4
! pip install tqdm
! pip install pandas Pyarrow 
! pip install tensorflow tensorflow_hub tensorflow_text

### scrape all the news form the post homepage

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Optional
import time
from tqdm import tqdm

df = None  # Global variable to store the DataFrame

def scrape_news(url: str) -> List[Dict[str, Optional[str]]]:
    """
    Scrape news data from a given website.

    Parameters:
    url (str): The URL of the website to scrape.

    Returns:
    List[Dict[str, Optional[str]]]: A list of dictionaries containing the scraped news data.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    news_inner_list = soup.find_all(class_="news__inner")
    news_data = []

    for news_inner in news_inner_list:
        title_element = news_inner.find(class_="news__title")
        title = title_element.text.strip() if title_element else None

        date_element = news_inner.find(class_="news__date-day")
        date = date_element.text.strip() if date_element else None

        author_element = news_inner.find(class_="news__author")
        author = author_element.text.strip() if author_element else None

        link_element = news_inner.find(class_="news__title").find('a')
        link = link_element['href'] if link_element else None

        news_data.append({'title': title, 'date': date, 'author': author, 'link': link})

    return news_data

def parse_article(link: str) -> Optional[Dict[str, Optional[str]]]:
    """
    Parse an article from a given link.

    Parameters:
    link (str): The URL of the article to parse.

    Returns:
    Optional[Dict[str, Optional[str]]]: A dictionary containing the parsed article data, or None if the request was unsuccessful.
    """
    response = requests.get(link)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    title_element = soup.find(class_="news__title")
    title = title_element.text.strip() if title_element else None

    date_element = soup.find(class_="news__date-day")
    date = date_element.text.strip() if date_element else None

    author_element = soup.find(class_="news__author")
    author = author_element.text.strip() if author_element else None

    content_element = soup.find(class_="news__content")
    content = content_element.text.strip() if content_element else None

    a_tag = soup.find('a', rel='category tag')
    tag = a_tag.text if a_tag else None

    article_data = {
        'title': title,
        'date': date,
        'author': author,
        'content': content,
        'tag': tag
    }

    return article_data


In [14]:
df = None
url = "https://web.archive.org/web/20240101002406/https://www.open.online/"
news_data = scrape_news(url)
parsed_articles = []

try:
    for news in tqdm(news_data, desc="Parsing articles"):
        article_data = parse_article(news['link'])
        if article_data:
            article_data['site'] = 'www.open.online'
            parsed_articles.append(article_data)
        time.sleep(3) 
        # TODO: implement an exponential backoff strategy to handle failed requests
except Exception as e:
    print(f"An exception occurred: {str(e)}")
    df = pd.DataFrame(parsed_articles)

if df is None:
    df = pd.DataFrame(parsed_articles)

print(df)

KeyboardInterrupt: 

In [7]:
df_news = pd.DataFrame(parsed_articles)
df_news

Unnamed: 0,title,date,author,content,tag,site
0,Mattarella su Ucraina e Gaza: «La pace va cost...,31 Dicembre 2023,di Redazione,"Guerra, violenza sulle donne, lavoro. Sono que...",POLITICA,www.open.online


In [8]:
df_day = df_news[df_news['date'] == '31 Dicembre 2023']
df_day

Unnamed: 0,title,date,author,content,tag,site
0,Mattarella su Ucraina e Gaza: «La pace va cost...,31 Dicembre 2023,di Redazione,"Guerra, violenza sulle donne, lavoro. Sono que...",POLITICA,www.open.online


### embedding the news in the post homepage

In [9]:
# Import TensorFlow and TensorFlow Hub
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import sklearn
import sklearn.metrics as sk_metrics
import sklearn.metrics.pairwise as sk_pairwise

# Load the universal sentence encoder multilingual module
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'
model = hub.load(module_url)

2024-02-17 21:23:07.646675: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-17 21:23:07.678901: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-17 21:23:07.678936: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-17 21:23:07.680190: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-17 21:23:07.685910: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-17 21:23:07.686764: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [46]:
def embed_text(text_list: List[str]) -> tf.Tensor:
    """Generate the embedding of a list of texts using the model.

    Parameters:
        text_list (List[str]): The list of texts to transform into embeddings.

    Returns:
        tf.Tensor: The embedding tensor produced by the model.
    """
    # Convert the text list into a string tensor
    return model(text_list)

def similarity_text(text1: str, text2: str) -> float:
    """Compute the similarity between two texts using the dot product between their embeddings.

    Parameters:
        text1 (str): The first text to compare.
        text2 (str): The second text to compare.

    Returns:
        float: The similarity between the two texts, ranging from -1 to 1.
    """
    # Compute the embeddings of the two texts
    
    embedding1 = embed_text([text1])
    embedding2 = embed_text([text2])
    # Check if the embeddings are the same
    if np.array_equal(embedding1, embedding2):
        return 1.0

    sim = 1 - np.arccos(
        sk_pairwise.cosine_similarity(embedding1, embedding2)) / np.pi
    # Return the similarity
    return sim.item()

In [49]:
print(similarity_text('bello', 'cane'))
print(similarity_text('bello', 'brutto'))
print(similarity_text('bello', 'bello asdfadf e'))

0.6313443183898926
0.7444989681243896
0.7440999150276184


In [16]:
labels = df_day['title'].iloc[:3].values.tolist()
text1 =labels[0]
text2 = labels[1]

print(similarity_text([text1], [text2]))

IndexError: list index out of range