# First write to DB

In [1]:
import polars as pl
import pandas as pd
from pymongo import MongoClient
import re 
import time
def view_string(long_string: str, chunk_size: int = 100) -> list:
    """Split a long string into consecutive chunks for easier inspection.

    Parameters
    ----------
    long_string : str
        Text to split.
    chunk_size : int, default 100
        Maximum number of characters per chunk; must be at least 1.

    Returns
    -------
    list of str
        The chunks in their original order. The last chunk may be shorter
        than ``chunk_size``; an empty input yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step makes range() empty, which would silently drop the text.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [long_string[i:i + chunk_size] for i in range(0, len(long_string), chunk_size)]


In [2]:
# NOTE(review): hardcoded absolute path to one user's home directory. The relative
# 'output/...' reads in the next cell resolve against this working directory.
%cd /home/sebacastillo/willow

/home/sebacastillo/willow


In [3]:
# Load both exports, pinning `content_hash` to an unsigned 64-bit integer instead of
# leaving its dtype to inference.
news = pl.read_csv('output/news.csv', dtypes={"content_hash": pl.UInt64})
newsner = pl.read_csv('output/newsner.csv', dtypes={"content_hash": pl.UInt64})

In [4]:
# Peek at the first row to check the column names and dtypes that were read.
news.head(1)

index,topic,date_extract,date_article,content,portal,link,link_sim_score,title,summary,summary_llm,summary_sim_score,authors,state,city,content_hash,content_nchar
i64,str,str,str,str,str,str,f64,str,str,str,f64,str,str,str,u64,i64
1,"""narcotráfico""","""2023-08-16""","""2023-08-16""","""Seguinos Por L…","""https://www.ar…","""https://www.la…",0.561929,"""Condenaron a s…","""Brian está det…","""Seis personas …",0.5241,"""n-a""","""Santa Fe""","""Rosario""",9259959111029284149,4543


In [5]:
# Build the text that gets embedded later: state, city, title and LLM summary joined
# by single spaces into a new `tit_summary` column.
# pl.concat_str propagates nulls (one null input makes the whole result null), so each
# piece is filled with "" first; otherwise a row with any missing field would have no
# text to embed.
news = news.with_columns(
    pl.concat_str(
        [
            pl.col(name).fill_null("")
            for name in ("state", "city", "title", "summary_llm")
        ],
        separator=" ",
    ).alias("tit_summary"),
)

In [6]:
# Check the first rows now that the `tit_summary` column has been appended.
news.head(3)

index,topic,date_extract,date_article,content,portal,link,link_sim_score,title,summary,summary_llm,summary_sim_score,authors,state,city,content_hash,content_nchar,tit_summary
i64,str,str,str,str,str,str,f64,str,str,str,f64,str,str,str,u64,i64,str
1,"""narcotráfico""","""2023-08-16""","""2023-08-16""","""Seguinos Por L…","""https://www.ar…","""https://www.la…",0.561929,"""Condenaron a s…","""Brian está det…","""Seis personas …",0.5241,"""n-a""","""Santa Fe""","""Rosario""",9259959111029284149,4543,"""Santa Fe Rosar…"
2,"""narcotráfico""","""2023-08-16""","""2023-08-16""","""Escuadrón 1 “R…","""https://www.ar…","""http://www.dia…",0.476911,"""Encuentran 32 …","""na""","""La Fiscalía Fe…",0.5945,"""Diario El Oran…","""Salta""","""Orán""",6046667906668190206,707,"""Salta Orán Enc…"
3,"""narcotráfico""","""2023-08-16""","""2023-08-16""",""". Compartir en…","""https://www.ar…","""https://www.ar…",0.526558,"""Prefectura sec…","""Efectivos de l…","""La Prefectura …",0.5717,"""n-a""","""Argentina""","""Argentina""",12244461015022324118,657,"""Argentina Arge…"


In [7]:
# Show the complete combined text of the first row (the table preview truncates it).
news['tit_summary'].to_list()[0]

'Santa Fe Rosario Condenaron a seis personas por venta de drogas, entre ellos un sobrino del "Rengo" Insaurralde Seis personas condenadas por el ‘Clan Insaurralde’ por tráfico y tenencia de estupefacientes. La jueza Elena Beatriz Dilario, miembro del Tribunal Oral Federal 3, homologó el acuerdo al que llegaron las partes'

In [8]:
# List all column names; `tit_summary` should be the last one.
news.columns

['index',
 'topic',
 'date_extract',
 'date_article',
 'content',
 'portal',
 'link',
 'link_sim_score',
 'title',
 'summary',
 'summary_llm',
 'summary_sim_score',
 'authors',
 'state',
 'city',
 'content_hash',
 'content_nchar',
 'tit_summary']

In [9]:
# Store the content hash as text rather than as an unsigned 64-bit integer.
# NOTE(review): presumably because some hashes exceed the signed 64-bit range that
# MongoDB integers support — confirm.
news = news.with_columns(pl.col("content_hash").cast(pl.Utf8))

In [10]:
# Convert the polars frame to pandas. `news` is rebound to a different type here, so
# this cell cannot be re-run on its own: the earlier polars cells must run first.
news = news.to_pandas()

# Write to Mongo

In [11]:
# Create a client for the MongoDB server on localhost:27017 (no credentials in the URI).
client = MongoClient('mongodb://localhost:27017/')
# Database that holds the documents.
db = client['wdocuments']
collection = db['news'] # export to mongo collection news

In [12]:
# Turn the frame into a list of per-row dicts, ready for insertion.
# NOTE(review): the frame already has an 'index' column (see the news.columns output),
# so pandas stores the reset positional index under 'level_0'; confirm that extra
# field is wanted in the stored documents.
news = news.reset_index().to_dict("records")

In [13]:
# Bulk-insert every record dict into the 'news' collection.
# NOTE(review): nothing here guards against running this cell more than once —
# confirm how duplicate inserts are avoided on re-runs.
collection.insert_many(news)

<pymongo.results.InsertManyResult at 0x7fc6bc265bb0>

# Embeddings

In [20]:
# Number of records: `news` is the list of record dicts at this point, not a frame.
n_docs = len(news)

In [18]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model once; the loop below reuses it for every document.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [21]:
# Embed each document's combined text and store the vector under 'article_embed'.
# Only `tit_summary` (plus `_id`) is fetched, and the vector is written with `$set`,
# so the rest of the stored document is never rewritten. The previous replace_one
# overwrote the whole document with the copy read at the start of the iteration.
for doc in collection.find(
    {'tit_summary': {"$exists": True}},
    {'tit_summary': 1},
).limit(n_docs):
    collection.update_one(
        {'_id': doc['_id']},
        {'$set': {'article_embed': model.encode(doc['tit_summary']).tolist()}},
    )

# Compute embeddings for a partial index range

In [14]:
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer


_DEFAULT_EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
# Holds the default model after its first load, so it is not reloaded per call.
_embedding_model_cache = {}


def compute_embeddings(text, model=None):
    """Encode ``text`` into an embedding with a sentence-transformers model.

    Parameters
    ----------
    text : str
        Text to embed.
    model : object with an ``encode(text)`` method, optional
        Model to use. When omitted, the default
        'sentence-transformers/all-MiniLM-L6-v2' model is loaded on the first
        call and reused afterwards. The previous version loaded it on every
        call, i.e. once per document.

    Returns
    -------
    Whatever ``model.encode(text)`` returns.
    """
    if model is None:
        if _DEFAULT_EMBEDDING_MODEL not in _embedding_model_cache:
            _embedding_model_cache[_DEFAULT_EMBEDDING_MODEL] = SentenceTransformer(
                _DEFAULT_EMBEDDING_MODEL
            )
        model = _embedding_model_cache[_DEFAULT_EMBEDDING_MODEL]

    return model.encode(text)

def process_documents(collection, start_index, end_index):
    """Embed the documents whose 'index' field is within an inclusive range.

    For every document matching ``start_index <= index <= end_index`` the
    'tit_summary' text is passed to ``compute_embeddings`` and the result is
    stored on that document under 'embeddings' (as a plain list). An existing
    'embeddings' value is overwritten.

    Parameters
    ----------
    collection : object providing ``find(query)`` and ``update_one(filter, update)``
        The collection to read from and write to.
    start_index, end_index :
        Inclusive bounds applied to the documents' 'index' field.
    """
    index_range = {'$gte': start_index, '$lte': end_index}

    for doc in collection.find({'index': index_range}):
        vector = compute_embeddings(doc['tit_summary'])
        collection.update_one(
            {'_id': doc['_id']},
            {'$set': {'embeddings': vector.tolist()}},
        )


    


In [15]:
# Smoke test: embed only the documents whose 'index' is between 1 and 5 (inclusive).
process_documents(collection, 1,5)

### Funcionó

In [16]:
# Run the same range again to check that documents which already have embeddings
# are recomputed without raising.
process_documents(collection, 1,5)

### Funcionó, si ya tiene embedding los vuelve a calcular pero no falla

In [18]:
# Embed a wider range: documents whose 'index' is between 1 and 44 (inclusive).
process_documents(collection, 1,44) 

# Refactoreo de código para producción

In [None]:
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

def compute_embeddings(text, model):
    """Encode ``text`` with ``model``, reporting failures instead of raising.

    Parameters
    ----------
    text :
        Value handed to ``model.encode``.
    model : object with an ``encode(text)`` method
        Model used to produce the embedding.

    Returns
    -------
    The result of ``model.encode(text)``, or ``None`` when encoding raised; in
    that case the error is printed.
    """
    try:
        vector = model.encode(text)
    except Exception as e:
        print(f"An error occurred while computing embeddings: {e}")
        vector = None
    return vector

def process_documents(collection, start_index, end_index, model):
    """Embed the documents whose 'index' field is within an inclusive range.

    For every document matching ``start_index <= index <= end_index`` the
    'tit_summary' text is encoded via ``compute_embeddings(text, model)`` and the
    result is stored on that document under 'embeddings' (as a plain list).

    Failures are reported with ``print`` and never raised. They are isolated per
    document: a document with no 'tit_summary', a failed encoding or a failed
    update is skipped and the remaining documents are still processed.
    Previously a single failure aborted the rest of the batch.

    Parameters
    ----------
    collection : object providing ``find(query)`` and ``update_one(filter, update)``
        The collection to read from and write to.
    start_index, end_index :
        Inclusive bounds applied to the documents' 'index' field.
    model : object with an ``encode(text)`` method
        Model forwarded to ``compute_embeddings``.

    Returns
    -------
    None
    """
    # 'index' field name that represents the index in documents
    query = {'index': {'$gte': start_index, '$lte': end_index}}

    try:
        # Iterating the cursor can itself fail, so the loop stays inside the outer try.
        for document in collection.find(query):
            try:
                # Extract the text to embed.
                text = document['tit_summary']

                # compute_embeddings returns None when encoding failed.
                embeddings = compute_embeddings(text, model)
                if embeddings is None:
                    continue

                # Write only the embedding field back to the same document.
                collection.update_one(
                    {'_id': document['_id']},
                    {'$set': {'embeddings': embeddings.tolist()}},
                )
            except Exception as e:
                # Report and move on so one bad document does not stop the batch.
                print(
                    f"An error occurred while processing document "
                    f"{document.get('_id')!r}: {e!r}"
                )
    except Exception as e:
        print(f"An error occurred while processing documents: {e}")




In [None]:
# Usage example: open the collection, load the model once, then embed one index range.
# 'your_database' and 'your_collection' are placeholders to replace with real names.
client = MongoClient('mongodb://localhost:27017/')
db = client['your_database']
collection = db['your_collection']
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
process_documents(collection, 23, 30, model)