In [0]:
pip install langchain

In [0]:
pip install azure-cosmos==4.9.0

Collecting azure-cosmos==4.9.0
  Obtaining dependency information for azure-cosmos==4.9.0 from https://files.pythonhosted.org/packages/61/dc/380f843744535497acd0b85aacb59565c84fc28bf938c8d6e897a858cd95/azure_cosmos-4.9.0-py3-none-any.whl.metadata
  Downloading azure_cosmos-4.9.0-py3-none-any.whl.metadata (80 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.8 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/80.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.8/80.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading azure_cosmos-4.9.0-py3-none-any.whl (303 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/303.2 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.2/303.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collec

In [0]:
# Restart the Python process so the packages installed above are importable.
dbutils.library.restartPython()

In [0]:
# Cosmos DB connection settings (this cell only defines configuration values).
# The endpoint, key and database name can be supplied through environment variables
# so that real credentials never have to be written into the notebook; the literals
# below are placeholders that are used only when the variable is not set.
import os

url = os.environ.get("COSMOS_ENDPOINT", "cosmosdb-endPoint")
key = os.environ.get("COSMOS_KEY", "cosmosdb-key")
database_name = os.environ.get("COSMOS_DATABASE", "database-name")
bronze_container_name = "bronze"
silver_container_name = "silver"

In [0]:
import json
import logging
import uuid

from azure.cosmos import CosmosClient, PartitionKey, exceptions
from pyspark.sql.functions import *
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [0]:

# Connect to the Cosmos DB account, then resolve the database and the bronze
# container that the following cells read from and write back to.
client = CosmosClient(url=url, credential=key)
database = client.get_database_client(database=database_name)
bronze_container = database.get_container_client(container=bronze_container_name)

In [0]:
# Retrieve the bronze documents that have not been processed yet.
# The query is not restricted to one partition key, so cross-partition
# querying is switched on.
query = "SELECT * FROM c WHERE c.processed = false"
items = bronze_container.query_items(
    query=query,
    enable_cross_partition_query=True,
)


In [0]:
# Materialize the query result into a list so it can be counted and iterated
# more than once by the cells below.
items_list = [item for item in items]

In [0]:
# Number of unprocessed items fetched from the bronze container.
len(items_list)

1126

In [0]:

def process_content(content: str, chunk_size: int = 1100, chunk_overlap: int = 150) -> list:
    """Split a text into overlapping chunks annotated with character offsets.

    Args:
        content: Text to split. A falsy value (``None`` or ``""``) yields no chunks.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The documents returned by ``RecursiveCharacterTextSplitter.create_documents``.
        Each document's ``metadata`` holds ``start_index`` (requested from the
        splitter via ``add_start_index=True``) and ``end_index``, computed here as
        ``start_index + len(page_content)``. An empty list when ``content`` is falsy.
    """
    if not content:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.create_documents([content])

    # The splitter only records where a chunk starts; derive where it ends.
    for chunk in chunks:
        start_index = chunk.metadata.get("start_index", 0)
        chunk.metadata["end_index"] = start_index + len(chunk.page_content)

    return chunks

In [0]:

def create_chunks_list(news_df, news_chunks):
    """Flatten per-article chunks into a list of dictionaries, one per chunk.

    Args:
        news_df: Spark DataFrame whose rows expose ``id``, ``guid`` and ``pub_date``.
        news_chunks: Sequence aligned with the rows of ``news_df``; element ``i``
            is the iterable of chunks produced for row ``i``. Every chunk must
            expose ``page_content`` and a ``metadata`` mapping containing
            ``start_index`` and ``end_index``.

    Returns:
        A list of dictionaries with the keys ``id`` (a fresh UUID4 string),
        ``news_id``, ``guid``, ``pub_date``, ``chunk_text``, ``start_index``,
        ``end_index`` and ``processed`` (always ``False``).

    Raises:
        ValueError: If ``news_chunks`` does not contain exactly one entry per row.
    """
    news_rows = news_df.rdd.collect()
    if len(news_rows) != len(news_chunks):
        raise ValueError(
            f"Expected one chunk list per news row, got {len(news_chunks)} "
            f"chunk lists for {len(news_rows)} rows"
        )

    structured_news_chunks = []
    # Pair every row with its own chunk list. The previous implementation read
    # news_chunks[n] with a counter that was never advanced, so every row was
    # given the chunks of the first article.
    for news, chunks in zip(news_rows, news_chunks):
        for data_chunk in chunks:
            structured_news_chunks.append(
                {
                    "id": str(uuid.uuid4()),
                    "news_id": news.id,
                    "guid": news.guid,
                    "pub_date": news.pub_date,
                    "chunk_text": data_chunk.page_content,
                    "start_index": data_chunk.metadata["start_index"],
                    "end_index": data_chunk.metadata["end_index"],
                    "processed": False,
                }
            )

    return structured_news_chunks




In [0]:
def declare_procced_news(record: dict) -> dict:
    """Flag a news document as processed.

    The mapping is modified in place and the very same object is returned, which
    lets the function be passed straight to ``map``.
    """
    record.update({"processed": True})
    return record

In [0]:
# Chunk every unprocessed bronze item, store the chunks in the silver container,
# then flag the bronze items as processed.
if items_list:
    try:
        # Build a DataFrame from the raw items and drop every row containing a null.
        news_df = spark.createDataFrame(items_list)
        news_df = news_df.na.drop()

        # NOTE(review): the legacy time parser is presumably required for the
        # "EEE, dd MMM yyyy ..." patterns used below -- confirm before removing.
        spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

        # Parse pub_date into a timestamp. Items whose link contains
        # "https://www.amazon.science" use a literal 'GMT' suffix; every other
        # item is parsed with a zone offset ("Z" pattern letter).
        news_df = news_df.withColumn(
            "pub_date",
            when(
                col("link").contains("https://www.amazon.science"),
                to_timestamp(unix_timestamp(col("pub_date"), "EEE, dd MMM yyyy HH:mm:ss 'GMT'").cast("timestamp"))
            ).otherwise(
                to_timestamp(unix_timestamp(col("pub_date"), "EEE, dd MMM yyyy HH:mm:ss Z").cast("timestamp"))
            )
        )

        # Split each article's content and flatten the result, one dict per chunk.
        content_list = news_df.select("content").rdd.flatMap(lambda x: x).collect()
        news_chunks = list(map(process_content, content_list))
        structured_news_chunks = create_chunks_list(news_df, news_chunks)

        # Store the chunks in the silver container.
        silver_container = database.get_container_client(silver_container_name)
        for record in structured_news_chunks:
            try:
                # isoformat() turns pub_date into a string before the upsert.
                record["pub_date"] = record["pub_date"].isoformat()
                silver_container.upsert_item(body=record)
                logging.info("Data inserted into Cosmos DB:")
            except Exception as e:
                # Best effort: a failing chunk is logged and the loop continues.
                logging.error(f"Error inserting data into Cosmos DB: {e}")

        # Flag the bronze items as processed.
        # NOTE(review): every fetched item is flagged, including those removed by
        # na.drop() and those whose chunks failed to upsert -- confirm this is intended.
        items_list = list(map(declare_procced_news, items_list))
        for item in items_list:
            try:
                # Upsert updates the document when it exists, inserts it otherwise.
                bronze_container.upsert_item(body=item)
                logging.info(f"Document {item['id']} mis à jour avec succès.")
            except exceptions.CosmosHttpResponseError as e:
                # A failed update is an error, so log it at ERROR level.
                logging.error(f"Erreur lors de la mise à jour du document {item['id']}: {str(e)}")

    except Exception as e:
        # logging.exception keeps the original message and adds the traceback.
        logging.exception(e)

else:
    logging.info("No items to process")

no items


In [0]:
# structured_news_chunks[91]

In [0]:
# len(structured_news_chunks)

8472

In [0]:
# query = "SELECT * FROM c WHERE"
# silver_container.delete_item({
#     "id": "4",
#     "pub_date": "amkdma",
#     "_rid": "qVwcAOuXoKwBAAAAAAAAAA==",
#     "_self": "dbs/qVwcAA==/colls/qVwcAOuXoKw=/docs/qVwcAOuXoKwBAAAAAAAAAA==/",
#     "_etag": "\"4d0169ad-0000-0e00-0000-675ca1ae0000\"",
#     "_attachments": "attachments/",
#     "_ts": 1734123950
# },partition_key="amkdma")

In [0]:
# items_list[0]

In [0]:
# import datetime
# datetime.datetime(2024, 9, 5, 16, 22, 56).isoformat()

In [0]:
# #update the processed news
# # items_list = list(map(declare_procced_news,items_list))
# for item in items_list:
#     try:
#         item["processed"]=False
#         # Utiliser upsert pour mettre à jour ou insérer le document
#         bronze_container.upsert_item(body=item)

#         logging.info(f"Document {item['id']} mis à jour avec succès.")
#     except exceptions.CosmosHttpResponseError as e:
#         logging.info(f"Erreur lors de la mise à jour du document {item['id']}: {str(e)}")