In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName("ArxivRAG")
    .getOrCreate()
)

# Read the input records from the JSON file into a Spark DataFrame.
df = spark.read.json("reduce.json")

# Show schema
# df.printSchema()

# print(df.count())


# Show some sample rows
# df.select("id", "title", "abstract", "categories", "update_date", "versions").show(5, truncate=False)

In [None]:


# Load the JSON file
# df = spark.read.json("../arxiv-metadata-oai-snapshot.json")

# Show schema
# df.printSchema()

# print(df.count())
# df_copy = df.limit(10000)
# df_copy.write.format("json") \
#     .mode("append") \
#     .save("test3.json")

In [None]:
from pyspark.sql.functions import col, concat_ws, lower, regexp_replace, trim
from pyspark.sql.functions import monotonically_increasing_id, col


# Build the cleaned corpus that is later embedded.
# Each source column is selected exactly once: listing "title" twice produces a
# frame with two identically named columns, which is redundant and error-prone.
df_cleaned = (
    df.select("id", "title", "abstract", "categories", "versions", "authors")
    # 'document' = title and abstract joined by a single space
    .withColumn("document", concat_ws(" ", col("title"), col("abstract")))
    # Strip everything except ASCII letters, digits and whitespace, then lowercase
    .withColumn(
        "document",
        lower(regexp_replace(col("document"), r"[^a-zA-Z0-9\s]", "")),
    )
    # Remove leading/trailing spaces left over after the clean-up
    .withColumn("document", trim(col("document")))
    # Unique, increasing row ids.
    # NOTE: monotonically_increasing_id() does not guarantee consecutive values.
    .withColumn("row_id", monotonically_increasing_id())
    # Four characters starting 17 from the end of the first version's 'created'
    # string — presumably the year in a "... YYYY HH:MM:SS GMT" timestamp
    # (TODO confirm against the data).
    .withColumn("year", col("versions")[0]["created"].substr(-17, 4))
    # Drop rows whose cleaned document is empty
    .filter(col("document") != "")
)

# Development sample: currently a single row (raise the limit for larger runs)
df_sample = df_cleaned.limit(1)

# Preview the sampled row(s)
df_sample.show()

In [None]:
# from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

import pandas as pd

# Collect the sampled rows to the driver as a pandas DataFrame,
# keeping only the columns used for embedding and metadata.
embedding_columns = ["id", "document", "year", "title", "authors"]
pandas_df = df_sample.select(*embedding_columns).toPandas()

In [None]:
# Display (rows, columns) of the frame collected from Spark.
pandas_df.shape
# pandas_df[pandas_df["year"].apply(lambda x: int(x) > 1960 and int(x) < 2026)].value_counts()

In [None]:
# Embedding function used for the vector store, backed by a
# Hugging Face sentence-transformers checkpoint.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# One metadata dict per document, holding the fields stored next to each vector.
metadata_fields = ("id", "year", "title", "authors")
metadata = []
for _, record in pandas_df.iterrows():
    metadata.append({field: record[field] for field in metadata_fields})

# Create the Chroma collection from the document texts, keyed by the 'id'
# column and persisted under ../chroma_db.
documents = pandas_df["document"].tolist()
document_ids = pandas_df["id"].tolist()
vectorstore = Chroma.from_texts(
    documents,
    embedding=embeddings,
    metadatas=metadata,
    ids=document_ids,
    persist_directory="../chroma_db",
)


In [None]:

# Next single-row batch: rows whose row_id is at least 1020, with the helper
# column dropped afterwards.
# NOTE(review): row_id comes from monotonically_increasing_id(), whose values
# are not guaranteed to be consecutive, so this threshold is not necessarily a
# row offset — confirm this is the intended paging behaviour.
df_sample = (
    df_cleaned
    .filter(col("row_id") >= 1020)
    .drop("row_id")
    .limit(1)
)

# Preview the selected row(s), truncating long cells to 150 characters
preview_columns = ["id", "categories", "document", "year", "title", "authors"]
df_sample.select(*preview_columns).show(5, truncate=150)

In [None]:
# Collect the current sample to the driver as a pandas DataFrame
pandas_df = df_sample.select("id", "document", "year", "title", "authors").toPandas()

# One metadata dict per document, stored alongside its vector
metadata = [
    {"id": row["id"], "year": row["year"], "title": row["title"], "authors": row["authors"]}
    for _, row in pandas_df.iterrows()
]

# Append the batch to the existing collection. add_texts has no `embedding`
# parameter (the store embeds with the function it was created with), so the
# stray keyword argument is not passed here.
vectorstore.add_texts(
    pandas_df["document"].tolist(),
    metadatas=metadata,
    ids=pandas_df["id"].tolist(),
)

In [None]:
# Wrap the vector store as a retriever
retriever = vectorstore.as_retriever()

# Smoke-test query: print each hit with its position in the result list
query = "field"
retrieved_docs = retriever.get_relevant_documents(query)

for position, hit in enumerate(retrieved_docs):
    print(position, hit.page_content)
    print("\n")