## Install the required packages

In [None]:
! pip install -q pandas sentence_transformers
! pip install -q chromadb-client
! pip install -q uuid
! pip install -q python-dotenv

## Import the necessary libraries

In [None]:
# All the import statements are in the first cell of the notebook
import json
import os
import re
import uuid
from time import sleep

import chromadb
import numpy as np
import pandas as pd
from chromadb import Settings, utils
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from dotenv import load_dotenv

# Read variables from a .env file (when one is found) so that the
# os.environ lookups in the configuration cell can pick them up.
load_dotenv()

## Setup chromadb connection to collection

In [None]:
# Connection and model settings; every value can be overridden through the environment.

MODEL_NAME = os.getenv('MODEL_NAME', 'all-distilroberta-v1')
CHROMA_HOST = os.getenv('CHROMA_HOST', 'localhost')
CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
# SSL is switched on only when CHROMA_SSL parses to the integer 1.
CHROMA_SSL = int(os.getenv('CHROMA_SSL', '0')) == 1
COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'test-collection')


# HTTP client pointed at the configured Chroma host and port.
client = chromadb.HttpClient(
    host=CHROMA_HOST,
    port=CHROMA_PORT,
    ssl=CHROMA_SSL,
)

# Embedding function backed by the configured sentence-transformers model.
sentence_transformer_ef = SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)

# Fetch the collection, creating it on first use, with the embedding function attached.
collection = client.get_or_create_collection(
    COLLECTION_NAME,
    embedding_function=sentence_transformer_ef,
)

## Read the pericopes and bible verses CSVs

In [None]:
# Load both source tables from disk.
pericopes_df = pd.read_csv('./files/pericopes_data_v1.csv')
bible_df = pd.read_csv('./files/asv.csv')

# A missing pericope title becomes an empty string.
pericopes_df['Pericope'] = pericopes_df['Pericope'].fillna('')

# Verse table: integer chapter/verse numbers, 'Book Name' exposed as 'Book',
# the 'Book Number' column dropped, and missing verse text replaced by ''.
bible_df = (
    bible_df
    .astype({'Chapter': int, 'Verse': int})
    .rename(columns={'Book Name': 'Book'})
    .drop(columns=['Book Number'])
)
bible_df['Text'] = bible_df['Text'].fillna('')

bible_df.head()

### If we want to make any changes to our data, we can do it here

In [None]:
# We can add or remove pericopes from the dataframe here:

# # Add "The ten plagues in Egypt" to the dataframe
# new_row = pd.DataFrame({
#     'Book': ['Exodus'],
#     'Chapter': [7],
#     'Pericope': ['The ten plagues in Egypt'],
#     'RefId': ['Exodus 7:14-13:6'],
#     'start': ['Exodus.7.14'],
#     'end': ['Exodus.13.6'],
# })

# pericopes_df = pd.concat([pericopes_df, new_row], ignore_index=True)

In [None]:
def create_pid(row):
    """Build the pericope identifier (PID) for one row of the pericopes table.

    The PID is ``<reference>__<title>``, upper-cased, with every run of
    whitespace collapsed to a single underscore. ``<reference>`` is the part
    of ``row['RefId']`` before the first ':'.

    Example:
        {'Pericope': 'The ten plagues in Egypt', 'RefId': 'Exodus 7:14-13:6'}
        -> 'EXODUS_7__THE_TEN_PLAGUES_IN_EGYPT'

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the string
            values 'Pericope' and 'RefId'.

    Returns:
        The identifier as a string.
    """
    reference = row['RefId'].split(':')[0]

    # Keep only word characters (letters, digits, underscore) and whitespace in the title.
    pericope = re.sub(r'[^\w\s]', '', row['Pericope'])

    # Quotes can still come from the reference part, so strip them as well.
    raw_pid = f"{reference}__{pericope}".replace('"', '').replace("'", '').upper()

    # split() with no arguments discards leading/trailing whitespace and treats
    # any whitespace run as a single separator.
    return '_'.join(raw_pid.split())

# Build the 'PID' column by calling create_pid on every row
# (axis=1 hands each row, not each column, to the function).
pericopes_df['PID'] = pericopes_df.apply(create_pid, axis=1)

#### `get_pericope_text` merges all the verses of a pericope into one string

In [None]:
def get_pericope_text(book, start, end, verses_df=None):
    """Merge all verses of a pericope into one space-separated string.

    Args:
        book: Book name, matched against the 'Book' column. The book part of
            ``start`` / ``end`` is not used for filtering.
        start: First verse as '<book>.<chapter>.<verse>', e.g. 'Exodus.7.14'.
        end: Last verse (inclusive), in the same format as ``start``.
        verses_df: DataFrame with 'Book', 'Chapter', 'Verse' and 'Text'
            columns. Defaults to the notebook-level ``bible_df``.

    Returns:
        The verse texts joined by single spaces, chapters in ascending order.
        An empty string when no verse falls inside the range.
    """
    verses = bible_df if verses_df is None else verses_df

    start_chapter, start_verse = map(int, start.split('.')[1:])
    end_chapter, end_verse = map(int, end.split('.')[1:])

    chapter = verses['Chapter']
    verse = verses['Verse']

    # A verse is in range when (chapter, verse) lies between the start and end
    # positions, both ends inclusive. This covers single- and multi-chapter
    # pericopes with one filter.
    at_or_after_start = (chapter > start_chapter) | (
        (chapter == start_chapter) & (verse >= start_verse)
    )
    at_or_before_end = (chapter < end_chapter) | (
        (chapter == end_chapter) & (verse <= end_verse)
    )

    selected = verses[(verses['Book'] == book) & at_or_after_start & at_or_before_end]

    # Stable sort: chapters ascending, rows inside a chapter keep their table order.
    selected = selected.sort_values('Chapter', kind='mergesort')

    return ' '.join(selected['Text'])


### Prepare the data into a format that can be used by the model

In [None]:
# One record per pericope row: its identifiers plus the merged verse text.
pericopes_content = [
    {
        'PID': source_row['PID'],
        'Pericope': source_row['Pericope'],
        'Text': get_pericope_text(source_row['Book'], source_row['start'], source_row['end']),
        'Reference': source_row['RefId'],
        'RefStart': source_row['start'],
        'RefEnd': source_row['end'],
    }
    for _, source_row in pericopes_df.iterrows()
]

pericopes_content_df = pd.DataFrame(pericopes_content)

# Replace any missing text with an empty string to avoid parsing issues.
pericopes_content_df['Text'] = pericopes_content_df['Text'].fillna('')


#### Set up an iterable array

> Might remove this later, as the dataframe is already iterable

In [None]:
# Re-key every dataframe row into the dict layout read by add_data_to_collection.
pericope_data = [
    {
        'pid': record['PID'],
        'title': record['Pericope'],
        'content': record['Text'],
        'reference': record['Reference'],
        'start': record['RefStart'],
        'end': record['RefEnd'],
    }
    for _, record in pericopes_content_df.iterrows()
]


## Add the data (and have it embedded) in ChromaDB

In [None]:
def add_data_to_collection(data_list, target_collection=None):
    """Upsert a batch of pericopes into a Chroma collection in one request.

    Every item of the batch is sent. The previous version ran the upsert
    after the loop had finished, so only the last item was stored.

    Args:
        data_list: Iterable of dicts with the keys 'pid', 'title', 'content',
            'reference', 'start' and 'end'.
        target_collection: Object exposing
            ``upsert(ids=..., documents=..., metadatas=...)``. Defaults to the
            notebook-level ``collection``.

    Returns:
        True once the batch has been upserted. An empty batch also returns
        True, without sending a request.

    Raises:
        Exception: whatever ``upsert`` raised, re-raised after the affected
            pericopes have been printed.
    """
    destination = collection if target_collection is None else target_collection

    ids, documents, metadatas = [], [], []
    for pericope in data_list:
        # NOTE(review): the id is random on every call, so uploading the same
        # pericope again presumably adds a second entry instead of updating
        # the first one. Confirm this is intended.
        uid = str(uuid.uuid4())

        ids.append(uid)
        documents.append(pericope['content'])
        metadatas.append({
            "id": uid,
            "pid": pericope['pid'],
            "pericope": pericope['title'],
            "reference": pericope['reference'],
            "content": pericope['content'],
            "start": pericope['start'],
            "end": pericope['end'],
        })

    if not ids:
        # Nothing to send; also avoids referencing values that were never built.
        return True

    try:
        destination.upsert(ids=ids, documents=documents, metadatas=metadatas)
    except Exception:
        # Report every pericope of the failed batch, then let the error propagate.
        for metadata in metadatas:
            reference = metadata['reference']
            title = metadata['pericope']
            print(f'Error upserting "{reference}, {title}" to collection')
        raise
    return True

In [None]:
# Upload the pericopes one request at a time, reporting progress after each one.
total = len(pericope_data)
for position, pericope in enumerate(pericope_data, start=1):
    add_data_to_collection([pericope])
    title = pericope['title']
    reference = pericope['reference']

    print(f'Upserted {position} of {total}: {reference} -- "{title}" to collection')
    # sleep to avoid rate limiting
    sleep(0.1)


## Predict the pericope for a user-supplied story name

In [None]:
def predict_pericopes(story_name, n=10):
    """Query the Chroma collection for the pericopes closest to a story name.

    Args:
        story_name: Query text passed to the collection as ``query_texts``.
        n: Number of results requested.

    Returns:
        A JSON string (indent of 4) holding a list of objects with the keys
        'id', 'pericope', 'distance' and 'metadata'; 'metadata' carries
        'pericope', 'reference', 'start' and 'end'.
    """
    response = collection.query(
        query_texts=story_name,
        n_results=n,
        include=["metadatas", "distances"],
    )

    matches = []
    # The response holds one list of ids/distances/metadatas per query text.
    per_query = zip(response['ids'], response['distances'], response['metadatas'])
    for ids, distances, metadatas in per_query:
        for match_id, distance, metadata in zip(ids, distances, metadatas):
            matches.append({
                'id': match_id,
                "pericope": metadata['pericope'],
                'distance': distance,
                'metadata': {
                    "pericope": metadata['pericope'],
                    "reference": metadata['reference'],
                    "start": metadata['start'],
                    "end": metadata['end'],
                },
            })

    return json.dumps(matches, indent=4)

# Example story descriptions; uncomment a different line to try another query.
# user_query = "God parts the Red sea"
# user_query = "Jesus walking on water"
# user_query = "John gets to the grave before Peter"
user_query = "Peter falls in sea lack of faith"

# Print the JSON string of closest matches returned for the query.
print(predict_pericopes(user_query))
