In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json

# Load the two source corpora from the Kaggle input dataset.
# The encoding is pinned to UTF-8 (as the text-file loader further down does)
# instead of depending on the platform's locale default.
with open("/kaggle/input/json-files/gnostic_clean.json", "r", encoding="utf-8") as f:
    gnostic = json.load(f)  # indexed by book title later on, e.g. 'Gospel of Thomas'

with open("/kaggle/input/json-files/nrsv.json", "r", encoding="utf-8") as f:
    canon = json.load(f)  # iterated below as book -> chapter -> verse number -> verse text

In [None]:
# Flatten the canon into one string per verse: "<book> <chapter>:<verse> - <text>".
bible = []
for book_name, chapters in canon.items():
    for chapter_no, verses in chapters.items():
        bible.extend(
            f"{book_name} {chapter_no}:{verse_no} - {verse_text}"
            for verse_no, verse_text in verses.items()
        )

In [None]:
# Spot-check two consecutive entries of the flattened verse list.
bible[21434:21436]

In [None]:
!pip install sentence-transformers

### bible embeddings

In [None]:
from sentence_transformers import SentenceTransformer,util

In [None]:
import torch

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# One embedding per verse string. The progress-bar keyword of `encode` is
# `show_progress_bar`; `show_progress` is not a parameter of that method.
bibleemb = model.encode(bible, show_progress_bar=True, convert_to_numpy=True)

In [None]:
# Persist the verse embeddings as a .npy file in the current directory.
np.save("bible_embeddings.npy",bibleemb)

In [None]:
# Persist the verse strings next to the embeddings (np.save stores the list as an array).
np.save("bible_sent.npy",bible)

In [None]:
# Select the 'Gospel of Thomas' entry from the loaded gnostic JSON.
got = gnostic['Gospel of Thomas']

In [2]:
import re

def clean_text(text):
    """Reduce ``text`` to ASCII letters, periods and single spaces.

    Brackets, quotes, ellipses (three or more consecutive periods) and every
    other character that is not a letter, a period or whitespace are removed.
    Whitespace of any kind (spaces, tabs, newlines) survives as a word
    separator and is collapsed to one space, so words on adjacent lines are
    not fused together.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    # Remove brackets but keep the content inside
    text = re.sub(r'[\[\]\(\)\{\}]', '', text)
    # Remove single and double quotes
    text = re.sub(r'[\"\']', '', text)
    # Remove ellipses (runs of three or more periods)
    text = re.sub(r'\.{3,}', '', text)
    # Remove everything except letters, periods and whitespace. Whitespace
    # must be kept here: deleting newlines/tabs before the collapse step
    # below would glue neighbouring words together ("things\nthen" -> "thingsthen").
    text = re.sub(r'[^a-zA-Z.\s]+', '', text)
    # Replace any run of whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # Trim leading/trailing whitespace
    return text.strip()

In [None]:
# Replace the raw text with its cleaned form (note: `got` is overwritten, so re-running cleans already-cleaned text).
got = clean_text(got)

In [None]:
# Take a fixed character slice of the cleaned text as a sample query passage.
passage = got[100:386]

In [None]:
# Embed the single passage, then score it against every verse embedding.
passemb = model.encode([passage],convert_to_numpy=True)
# Only one passage was encoded, so [0] selects its row of similarities.
sim = util.cos_sim(passemb,bibleemb)[0]

In [None]:
# Report the five verses with the highest cosine similarity to the passage.
top5 = torch.topk(sim, k=5)

for rank in range(len(top5.values)):
    score, verse_idx = top5.values[rank], top5.indices[rank]
    print("score: ", score)
    print("verse: ", bible[verse_idx])

#### Sliding window over gnostic text

In [None]:
# Sliding word windows over the cleaned text: `window_size` words per window,
# advancing `stride` words each step (so consecutive windows overlap).
window_size = 100
stride = 50

words = got.split()

# Start offsets of every full-length window; offset 0 is always present so a
# text shorter than `window_size` still produces one (short) window.
starts = list(range(0, max(len(words) - window_size, 0) + 1, stride))
# If the full windows stop short of the end, add one more (shorter) window so
# the trailing words are still compared against the canon.
if starts[-1] + window_size < len(words):
    starts.append(starts[-1] + stride)

# An empty text yields no windows at all.
windows = [' '.join(words[s:s + window_size]) for s in starts] if words else []

In [None]:
# Embed every window with the same model used for the verses.
windowsemb = model.encode(windows,convert_to_numpy=True)

In [None]:
# Cosine similarity of every window against every verse.
sim_mat = util.cos_sim(windowsemb,bibleemb)
sim_mat.shape

In [None]:
# Best-scoring verse (score and index) for each window.
topscore,topind = torch.topk(sim_mat,k=1)

In [None]:
# Pair every window with its best-matching verse and the similarity score.
matches = [
    {
        "nag hammadi": window_text,
        "canon": bible[topind[row].item()],
        "score": topscore[row].item(),
    }
    for row, window_text in enumerate(windows)
]

### Text segmentation: semantic chunking

In [None]:
# Read every file of the gnostic text dataset into a {title: text} mapping.
gnostic_dir = "/kaggle/input/gnostic-text-files"
# os.listdir returns entries in arbitrary order; sort for a stable corpus order.
files = sorted(os.listdir(gnostic_dir))

corpus = {}
for file in files:
    path = os.path.join(gnostic_dir, file)
    with open(path, "r", encoding='utf-8') as f:
        text = f.read()
    # Key by the file name minus its final extension only. Splitting on the
    # first "." would truncate any title that itself contains a dot.
    corpus[os.path.splitext(file)[0].strip()] = text

In [None]:
# Inspect the raw text of one document.
corpus['On the Baptism A']

In [None]:
# Keep only the text after the first colon of each document (presumably a
# title/header prefix — verify against the raw files). A document without any
# colon is kept whole instead of raising IndexError.
# NOTE: this rewrites `corpus` in place, so re-running the cell strips up to
# the next colon as well.
for k, v in corpus.items():
    _, sep, body = v.partition(':')
    corpus[k] = body.strip() if sep else v.strip()

In [None]:
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download the punkt tokenizer data; nltk.sent_tokenize is used for sentence splitting below.
nltk.download('punkt')

# Loaded sentence-transformer models, keyed by model name, so repeated calls
# to semantic_chunker (one per book) do not reload the weights every time.
_CHUNKER_MODELS = {}


def _get_chunker_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _CHUNKER_MODELS:
        # No device is forced: SentenceTransformer uses a GPU when one is
        # available and otherwise runs on the CPU.
        _CHUNKER_MODELS[model_name] = SentenceTransformer(model_name)
    return _CHUNKER_MODELS[model_name]


def semantic_chunker(text, model_name='all-MiniLM-L6-v2', similarity_threshold=0.75):
    """Split ``text`` into chunks of consecutive, semantically similar sentences.

    The text is split into sentences, each sentence is embedded, and a new
    chunk is started wherever the cosine similarity between a sentence and
    the one before it falls below ``similarity_threshold``.

    Args:
        text: The text to segment.
        model_name: Name of the sentence-transformers model used for embeddings.
        similarity_threshold: Adjacent sentences scoring below this value are
            placed in different chunks.

    Returns:
        A list of chunk strings, each the space-joined sentences of one
        chunk. Empty when ``text`` contains no sentences.
    """
    # 1. Split into sentences
    sentences = nltk.sent_tokenize(text)
    # Nothing to compare with zero or one sentence; skip the model entirely.
    if len(sentences) < 2:
        return list(sentences)

    # 2. Generate sentence embeddings
    model = _get_chunker_model(model_name)
    embeddings = model.encode(sentences, show_progress_bar=True, convert_to_numpy=True)

    # 3. Cosine similarity between each sentence and its successor only.
    #    Normalising rows and taking row-wise dot products avoids building the
    #    full sentence-by-sentence matrix just to read one off-diagonal.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero embeddings at similarity 0
    unit = embeddings / norms
    similarities = np.sum(unit[:-1] * unit[1:], axis=1)

    # 4. Decide chunk boundaries based on low similarity
    chunk_indices = [0]
    for i, sim in enumerate(similarities):
        if sim < similarity_threshold:
            chunk_indices.append(i + 1)
    chunk_indices.append(len(sentences))

    # 5. Group sentences into chunks
    chunks = [
        " ".join(sentences[start:end])
        for start, end in zip(chunk_indices[:-1], chunk_indices[1:])
    ]

    return chunks

In [None]:
# Try the chunker on a single document first.
chunks = semantic_chunker(corpus['On the Baptism A'])

In [None]:
import re
def clean(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    return re.sub(r'\s+', ' ', text)

# Segment every gnostic document (after whitespace normalisation) into semantic chunks.
new_corpus = {title: semantic_chunker(clean(body)) for title, body in corpus.items()}

In [None]:
import json
# Save the {title: [chunks]} mapping as JSON in the current directory.
with open("gnostic_chunks.json","w") as f:
    json.dump(new_corpus,f)

#### bible chunks

In [None]:
# Build one continuous text per book of the canon: {book: all verse texts joined}.
# NOTE: this rebinds `bible`, which the earlier embedding/matching cells use as
# a flat list of verse strings; re-run those cells from the top before reusing them.
bible = {}
for book, content in canon.items():
    verslst = []
    for ch, vers in content.items():
        verslst.extend(vers.values())
    # Join with a space: without a separator the last word of one verse is
    # glued to the first word of the next, which also hides sentence
    # boundaries from the sentence tokenizer.
    bible[book] = " ".join(verslst)

In [None]:
# Segment every book of the canon (after whitespace normalisation) into semantic chunks.
new_bible = {title: semantic_chunker(clean(book_text)) for title, book_text in bible.items()}

In [None]:
# Save the {book: [chunks]} mapping as JSON in the current directory.
with open("bible_chunks.json","w") as f:
    json.dump(new_bible,f)

### BERTopic

#### Passage-level theme/topic modelling on the gnostic texts:
* segment each gnostic book into chunks
* collect all chunks into a single text corpus
* find topics using BERTopic

In [None]:
!pip install bertopic

In [3]:
from sentence_transformers import SentenceTransformer

2025-04-17 13:41:00.204895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744897260.408749      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744897260.463461      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
import json
# Reload the chunked gnostic corpus from the Kaggle working directory.
with open("/kaggle/working/gnostic_chunks.json","r") as f:
    gn = json.load(f)

In [None]:
# Chunks of the Gospel of Thomas (note: `got` is reused here for the chunk collection, not the earlier cleaned string).
got = gn['Gospel of Thomas']
got[:3]

In [None]:
# Apply clean_text to every chunk and preview the first three.
texts = [clean_text(t) for t in got]
texts[:3]

In [24]:
# Flatten all books into two parallel lists: texts[j] is a cleaned chunk and
# metadata[j] records which book it came from and its position in that book.
texts, metadata = [], []

for book_title, book_chunks in gn.items():
    for position, raw_chunk in enumerate(book_chunks):
        metadata.append({"book": book_title, "chunkid": position})
        texts.append(clean_text(raw_chunk))

In [9]:
# Embedding model handed to BERTopic. No device is forced: SentenceTransformer
# uses a GPU when one is available and otherwise runs on the CPU, so this cell
# no longer fails on a CPU-only session.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Optional: precompute the chunk embeddings (the progress keyword is `show_progress_bar`).
#gnemb = model.encode(texts,show_progress_bar=True,convert_to_numpy=True)

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Exclude English stop words when counting terms for the topic keywords.
vectorizer_model = CountVectorizer(stop_words="english")
# The sentence-transformer model is passed in so BERTopic embeds the texts with it.
topicmodel = BERTopic(vectorizer_model=vectorizer_model,embedding_model=model)

# NOTE(review): no random seed is configured, so topic assignments may vary between runs — confirm whether reproducibility matters here.
topics,probs = topicmodel.fit_transform(texts)

In [13]:
# Summary table of the discovered topics (id, size, name, keywords, example documents).
topicmodel.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3586,-1_said_spirit_come_place,"[said, spirit, come, place, things, came, powe...",[This is the first power which was before all ...
1,0,380,0_father_son_mother_totality,"[father, son, mother, totality, forth, totalit...",[The Father was in the Son and the Son in the ...
2,1,193,1_husband_woman_house_consort,"[husband, woman, house, consort, soul, repente...",[He presented it to her mouth to make her eat ...
3,2,193,2_aeons_aeon_worlds_glory,"[aeons, aeon, worlds, glory, change, establish...",[not through aeon the one who was we and those...
4,3,186,3_light_darkness_shadow_lamp,"[light, darkness, shadow, lamp, lights, dark, ...",[For even things which are in darkness are bef...
...,...,...,...,...,...
127,126,11,126_spoken_discourse_length_thingsthen,"[spoken, discourse, length, thingsthen, remark...",[Remember the things I have spoken and let the...
128,127,11,127_yaltabaoth_yaldabaoth_alternative_seraphs,"[yaltabaoth, yaldabaoth, alternative, seraphs,...",[And the chief archon Yaltabaoth heard it and ...
129,128,11,128_friend_deceitfully_deceiving_entrust,"[friend, deceitfully, deceiving, entrust, dece...",[My son do not have anyone as a friend but if ...
130,129,10,129_conceived_happened_impassible_design,"[conceived, happened, impassible, design, life...",[And when these things had happened Asclepius ...


In [15]:
# Keywords and their weights for one example topic (id 14).
topicmodel.get_topic(14)

[('authority', 0.04729504180653924),
 ('power', 0.030583950547074627),
 ('powers', 0.02973529498657805),
 ('punishments', 0.024096782221272234),
 ('continuously', 0.02228104966381938),
 ('hindered', 0.02228104966381938),
 ('confounded', 0.02228104966381938),
 ('knowledge', 0.021356488550367755),
 ('subjection', 0.02099584602669544),
 ('leading', 0.02000133127386422)]

### Zero-shot classification into Topics/themes

In [16]:
from transformers import pipeline
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [17]:
# Candidate theme labels offered to the zero-shot classifier for every chunk.
labels = [
    "Wisdom Saying",
    "Parable or Allegory",
    "Apocalyptic Vision",
    "Gnostic Revelation",
    "Jesus Saying",
    "Command or Instruction",
    "Light versus Darkness",
    "Salvation or Resurrection",
    "Creation Myth or Cosmology",
    "Sacramental or Ritual Language",
    "Dialogue or Discourse",
    "Mystical Metaphor",
    "Proverbial Saying",
    "Narrative Scene",
    "Polemic or Rejection of Authority",
    "Ethical Teaching",
    "Call to Inner Knowledge",
    "Kingdom of God Teaching",
    "Dualistic Theology",
    "Hidden or Secret Saying"
]

In [27]:
from tqdm import tqdm

# Classify every chunk against the candidate labels and keep its three
# best-scoring labels. One entry is appended per text (even empty ones) so
# `results` stays aligned index-for-index with `texts` and `metadata`.
results = []
candidate_labels = labels
for text in tqdm(texts):
    if len(text) > 0:
        # multi_label=True scores each label independently of the others.
        res = classifier(text, candidate_labels, multi_label=True)
        # The pipeline already returns res["labels"] ordered from highest to
        # lowest score, so the top three are simply the first three.
        top_labels = res["labels"][:3]
    else:
        # Nothing to classify; '0' is the placeholder for an empty chunk.
        top_labels = '0'
    results.append({text: top_labels})

100%|██████████| 8373/8373 [53:56<00:00,  2.59it/s]  


[{'Eugnostos the Blessed to those who are his.': ['Proverbial Saying',
   'Wisdom Saying',
   'Kingdom of God Teaching']},
 {'Rejoice in this that you know.': ['Proverbial Saying',
   'Wisdom Saying',
   'Jesus Saying']},
 {'Greetings': ['Proverbial Saying',
   'Kingdom of God Teaching',
   'Sacramental or Ritual Language']}]

In [52]:
# Group the zero-shot results by book: {book: [(chunk text, top labels), ...]}.
zsgn = {}

for mt, rs in zip(metadata, results):
    # setdefault creates the book's list the first time the book is seen and
    # returns it either way, so the first chunk of every book is recorded too
    # (creating the list in an `if` and appending only in the `else` skips it).
    book_entries = zsgn.setdefault(mt['book'], [])
    # Each result is a one-item dict {text: top_labels}; store it as a (text, labels) pair.
    book_entries.extend(rs.items())

In [57]:
# Write the per-book zero-shot labels as JSON in the current directory.
# NOTE(review): the file name is spelled "gnotic"; left unchanged because renaming it changes the output path.
with open("zeroshot_gnotic.json","w") as f:
    json.dump(zsgn,f)