In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ArxivRAG") \
    .getOrCreate()

# Load the JSON file
df = spark.read.json("../arxiv-metadata-oai-snapshot.json")

# Show schema
df.printSchema()

# Show some sample rows
df.select("id", "title", "abstract", "categories").show(5, truncate=False)

                                                                                

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)

+---------+------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------

In [8]:
from pyspark.sql.functions import col, concat_ws, lower, regexp_replace, trim

# Combine title and abstract into a 'document' field
df_cleaned = df.select(
    "id", "title", "abstract", "categories"
).withColumn(
    "document",
    concat_ws(" ", col("title"), col("abstract"))
).withColumn(
    "document",
    lower(regexp_replace(col("document"), r"[^a-zA-Z0-9\s]", ""))
).withColumn(
    "document", trim(col("document"))
)

# Filter out empty documents
df_cleaned = df_cleaned.filter(col("document") != "")

# Optional: Sample 10,000 rows for development
# df_sample = df_cleaned.limit(10000)
df_sample = df_cleaned.limit(10)

# Show a few processed rows
df_sample.select("id", "categories", "document").show(5, truncate=150)

+---------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|       id|     categories|                                                                                                                                              document|
+---------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|0704.0001|         hep-ph|calculation of prompt diphoton production cross sections at tevatron and\n  lhc energies   a fully differential calculation in perturbative quantum...|
|0704.0002|  math.CO cs.CG|sparsitycertifying graph decompositions   we describe a new algorithm the kellpebble game with colors and use\nit obtain a characterization of the ...|
|0704.0003| physics.gen-ph|the evolution of the earthmoon system based on the dark matter field\n  fluid 

In [9]:
# from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

import pandas as pd

# Convert Spark DataFrame to Pandas
pandas_df = df_sample.select("id", "document").toPandas()

In [10]:
# Load sentence transformer model
# model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [11]:
# Embed the documents
# embeddings = model.encode(pandas_df['document'].tolist(), show_progress_bar=True)
vectorstore = Chroma.from_texts(pandas_df['document'].tolist(), embeddings, persist_directory="../chroma_db")