In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import PyPDF2
from io import BytesIO

# Function to fetch and parse text content from plain text or HTML
def fetch_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        
        if 'text/plain' in content_type:
            return response.text
        elif 'text/html' in content_type:
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup.get_text()
        else:
            return None
    except Exception as e:
        return str(e)

# Function to extract text from PDF
def fetch_pdf_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        pdf_file = BytesIO(response.content)
        reader = PyPDF2.PdfReader(pdf_file)
        text = []
        for page in reader.pages:
            text.append(page.extract_text())
        return "\n".join(text)
    except Exception as e:
        return str(e)

# URLs
urls = [
    "https://gutenberg.org/cache/epub/6130/pg6130.txt",
    "https://ia601606.us.archive.org/25/items/pdfy-jcWLRBnyXg-DUcMH/The%20Corpus%20Hermeticum.pdf",
    "https://openbible.com/textfiles/kjv.txt",
    "https://www.gutenberg.org/files/2388/2388-h/2388-h.htm",
    "https://www.avesta.org/kanga/ka_english_kanga_epub.pdf",
    "https://archive.org/stream/RigVedaSanhitaByHHWilsonVol1/Rig%20Veda%20Sanhita%20by%20H%20H%20Wilson%20Vol%201_djvu.txt",
    "https://www.gutenberg.org/files/2017/2017-h/2017-h.htm",
]

names = [
    "Iliad",
    "Corpus Hermeticum",
    "Bible",
    "Bhagavad Geeta",
    "Khordeh Avesta",
    "Rigveda",
    "Dhammapada"
]
# Fetch content
data = []
for url,name in zip(urls,names):
    if url.endswith('.pdf'):
        text = fetch_pdf_text(url)
    else:
        text = fetch_text(url)
    data.append({"url": url, "content": text,"name":name})

# Create DataFrame
df = pd.DataFrame(data)

# Save to a file
df.to_csv("texts.csv", index=False)

print("Data fetched and saved to 'texts.csv'")


Data fetched and saved to 'texts.csv'


In [28]:
df = pd.read_csv("texts.csv")

In [29]:
df

Unnamed: 0,url,content,name
0,https://gutenberg.org/cache/epub/6130/pg6130.txt,﻿The Project Gutenberg eBook of The Iliad\r\n ...,Iliad
1,https://ia601606.us.archive.org/25/items/pdfy-...,The Corpus Hermeticum\ntranslated by G.R.S. M...,Corpus Hermeticum
2,https://openbible.com/textfiles/kjv.txt,ï»¿KJV\nKing James Bible: Pure Cambridge Editi...,Bible
3,https://www.gutenberg.org/files/2388/2388-h/23...,\n\n\n\n\r\nThe Project Gutenberg E-text of Th...,Bhagavad Geeta
4,https://www.avesta.org/kanga/ka_english_kanga_...,"KHORDEH A VEST Ā\nComprising\nAshem, Yatha, th...",Khordeh Avesta
5,https://archive.org/stream/RigVedaSanhitaByHHW...,"\n\n\n\nFull text of ""Rig Veda Sanhita By H H ...",Rigveda
6,https://www.gutenberg.org/files/2017/2017-h/20...,"\n\n\n\n\n The Dhammapada, by an Unknown ...",Dhammapada


In [32]:
import re
from nltk.tokenize import sent_tokenize

def preprocess_text(text):
    # Remove non-alphanumeric characters, normalize whitespace
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = sent_tokenize(text)
    return sentences
df['text'] = df['content'].apply(preprocess_text)

In [33]:
from transformers import pipeline

summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

df['themes'] = df['text'].apply(lambda x: summarizer(x, max_length=50, min_length=10, do_sample=False))

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/facebook/bart-large-cnn/40041830399afb5348525ef8354b007ecec4286fdf3524f7e6b54377e17096cb?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1733735189&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzczNTE4OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9mYWNlYm9vay9iYXJ0LWxhcmdlLWNubi80MDA0MTgzMDM5OWFmYjUzNDg1MjVlZjgzNTRiMDA3ZWNlYzQyODZmZGYzNTI0ZjdlNmI1NDM3N2UxNzA5NmNiP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=meQUYhgQtZHgWvEdU8tx5p44unF3JnozwpkxO9PoBzK-MB-K7HC9zxM-QZN%7ELRiCoQuDQ1GpikDCxVujp%7E2Fc1s138yzKQZL9gZNiWLOKEdUGHxyPKigKo4g2TwVXCIb6tLpG5SZrQ7dCN%7E4Rm6G2wyqW2tdu4dhbdMZTW15sDaFlClhWpMpXSEgOxt7yG8R4QkVuZcm4vrQgEOGDw3yEqoOq4hixQV1rc2-jaI5q4p6tUSC%7EwJHnZsZe8xEFWXUbROeaTpfQy%7EpdXa3GNZrTqT8oc3IevdoaV9WnPNvYAXH0sWuXvrGTWbswgJTB0XDF2UyLuUkjc3KWe8Fh2gDxw__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnec

model.safetensors:  67%|######7   | 1.09G/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


IndexError: index out of range in self

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

df['embeddings'] = df['text'].apply(lambda x: model.encode(x))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(df['embeddings'].tolist())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.heatmap(similarity_matrix, annot=True, cmap='coolwarm')
plt.title('Thematic Similarity')
plt.show()

In [None]:
from wordcloud import WordCloud

wordcloud = WordCloud().generate(' '.join(df['text']))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()