In [31]:
#imports
import pandas as pd
from gensim import corpora, models
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/humbertoaguilar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/humbertoaguilar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/humbertoaguilar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/humbertoaguilar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [32]:
stop_words = set(stopwords.words('english'))

additional_stop_words = {'thou', 'thy', 'thee', 'shall', 'unto', 'ye', 'nephi', 'alma', 'god', 'lord', 'one', 'name', 'ahura',
                         'tao','said', 'jacob', 'hunahpú', 'u', 'say', 'mazda', 'יהוה', 'joseph',
                         'arjuna', 'balam', 'abraham', 'nanak', 'guru', 'naam', 'mehl', 'pas', 'xibalba',
                         'dhammas', 'us', 'tathāgata', 'fravashi', 'august', 'augustness', 'great',
                         'holy', 'true', 'things', 'truth', 'pause', 'har', 'made', 'came', 'prince',
                         'krishna', 'soul', 'life', 'hath', 'know', 'pass', 'behold', 'upon', 'also',
                         'even', 'called', 'lords', 'went', 'boys', 'well', 'come', 'therefore',
                         'without', 'blessed', 'vucub', 'two', 'thus', 'like', 'way', 'virtue',
                         'heb', 'self', 'yet', 'yea', 'mosiah', 'lamanites', 'dhamma', 'go',
                         'tohil', 'xbalanqué', 'zarathushtra', 'saint', 'brethren', 'would', 'ānanda',
                         'saying', 'princess', 'heavenly', 'names', 'camé', 'doth', 'quitzé', 'shabad',
                         'glory', 'rightly', 'eight', 'hun', 'four', 'verily', 'helaman', 'chapter',
                         'see', 'within', 'let', 'may', 'indeed', 'sent', 'tribes', 'father', 'son',
                         'sons', 'isaac', 'wife', 'signs', 'deity', 'deities', 'grace', 'obtained',
                         'heart', 'king', 'according', 'wise', 'pharoah', 'brother', 'gave', 'next', 'p',
                         'esau', 'first', 'yamato_take', 'nephites', 'abram', 'pharaoh'}

combined_stop_words = stop_words.union(additional_stop_words)

lemmatizer = WordNetLemmatizer()

In [33]:
def preprocess(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    words = nltk.word_tokenize(text.lower())  # Tokenize the text
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in combined_stop_words]
    return filtered_words

def generate_ngrams(texts, n=3):
    bigram = Phrases(texts, min_count=3, threshold=10)
    trigram = Phrases(bigram[texts], threshold=10)
    texts = [trigram[bigram[text]] for text in texts]

    # Ensure filtering of n-grams
    filtered_texts = [[word for word in text if word not in combined_stop_words and '_' not in word] for text in texts]

    return filtered_texts


In [34]:
# Load CSV file
df = pd.read_csv('Dataset_with_Text.csv')

documents = df['Text'].tolist()
names = df['Name_of_Text'].tolist()

# Process each text separately
topics_by_document = {}

for i, document in enumerate(documents):
    processed_doc = preprocess(document)
    processed_doc = generate_ngrams([processed_doc])[0]

    dictionary = corpora.Dictionary([processed_doc])
    corpus = [dictionary.doc2bow(processed_doc)]

    # Train LDA model for the specific document
    lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)  # Adjust num_topics as needed

    # Extract the top words for each topic
    topics = []
    for idx, topic in lda_model.print_topics(-1):
        topic_words = [word.split('*')[1].strip('"') for word in topic.split(' + ')]
        # Remove duplicates, filter out stop words, and ensure exactly 10 words
        topic_words = [word for word in dict.fromkeys(topic_words) if word not in combined_stop_words][:10]
        while len(topic_words) < 10:
            topic_words.append("")  # Add empty strings to maintain length
        topics.append(', '.join(topic_words))

    topics_by_document[names[i]] = topics


In [35]:
# Display the results
for text, topics in topics_by_document.items():
    print(f"{text}: {', '.join(topics)}")

Bhagavad Gita: work, mind, men, world, man, love, passion, sense, act, birth, work, world, worship, faith, mind, men, act, sense, good, brahma, work, world, mind, man, men, faith, act, sense, worship, spirit, work, world, mind, peace, faith, man, men, sense, spirit, act, work, man, world, faith, mind, end, light, sense, sacrifice, set
Book of Mormon: people, many, word, land, ether, day, wherefore, might, man, moroni, people, land, many, word, day, moroni, wherefore, began, might, ether, people, word, land, many, might, wherefore, moroni, ether, began, day, people, land, word, many, might, ether, began, day, forth, among, people, many, word, land, wherefore, forth, began, moroni, power, man
Digha Nikaya: monk, perception, body, feeling, form, friend, three, mind, world, discerns, monk, body, perception, form, world, three, friend, mind, feeling, seven, monk, body, friend, form, three, perception, world, mind, feeling, regard, monk, form, perception, feeling, three, body, world, mind, c

In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

class ReligiousTextAnalyzer:
    def __init__(self, texts_df):
        """
        Initialize with a DataFrame containing religious texts and metadata
        Expected columns: Name_of_Text, Text, Region, Climate_Data, Year_Approximate
        """
        self.texts_df = texts_df
        self.topic_similarities = None
        self.environmental_correlations = None
        
    def calculate_topic_similarities(self, topics_by_document):
        """
        Calculate similarity matrix between documents based on their topics
        """
        texts = list(topics_by_document.keys())
        n_texts = len(texts)
        similarity_matrix = np.zeros((n_texts, n_texts))
        
        # Create a flat list of all topics for each text
        text_topics = {text: ' '.join(topics) for text, topics in topics_by_document.items()}
        
        # Calculate TF-IDF vectors for topics
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(text_topics.values())
        
        # Calculate cosine similarities
        for i in range(n_texts):
            for j in range(n_texts):
                if i != j:
                    similarity = 1 - cosine(
                        tfidf_matrix[i].toarray().flatten(),
                        tfidf_matrix[j].toarray().flatten()
                    )
                    similarity_matrix[i, j] = similarity
        
        self.topic_similarities = pd.DataFrame(
            similarity_matrix,
            index=texts,
            columns=texts
        )
        return self.topic_similarities
    
    def analyze_environmental_correlations(self, climate_data):
        """
        Analyze correlations between topic presence and environmental variables
        
        climate_data: DataFrame with columns for different environmental variables
        """
        # Normalize climate data
        scaler = MinMaxScaler()
        normalized_climate = pd.DataFrame(
            scaler.fit_transform(climate_data),
            columns=climate_data.columns,
            index=climate_data.index
        )
        
        # Calculate correlations between topic similarities and environmental variables
        correlations = {}
        for var in normalized_climate.columns:
            env_similarities = pdist(normalized_climate[var].values.reshape(-1, 1))
            topic_similarities_flat = squareform(self.topic_similarities)
            correlation = pearsonr(env_similarities, topic_similarities_flat)[0]
            correlations[var] = correlation
            
        self.environmental_correlations = pd.Series(correlations)
        return self.environmental_correlations
    
    def visualize_results(self):
        """
        Create visualizations of the analysis results
        """
        plt.figure(figsize=(15, 10))
        
        # Topic similarity heatmap
        plt.subplot(1, 2, 1)
        sns.heatmap(
            self.topic_similarities,
            cmap='YlOrRd',
            annot=True,
            fmt='.2f'
        )
        plt.title('Topic Similarities Between Religious Texts')
        
        # Environmental correlations
        plt.subplot(1, 2, 2)
        self.environmental_correlations.plot(kind='bar')
        plt.title('Environmental Variable Correlations')
        plt.xticks(rotation=45)
        
        plt.tight_layout()
        plt.show()
    
    def generate_report(self):
        """
        Generate a summary report of findings
        """
        report = []
        
        # Most similar text pairs
        similarities = self.topic_similarities.unstack()
        top_similarities = similarities[similarities != 1.0].nlargest(5)
        report.append("Most similar text pairs:")
        for (text1, text2), similarity in top_similarities.items():
            report.append(f"{text1} - {text2}: {similarity:.3f}")
            
        # Strongest environmental correlations
        report.append("\nStrongest environmental correlations:")
        for var, corr in self.environmental_correlations.nlargest(3).items():
            report.append(f"{var}: {corr:.3f}")
            
        return "\n".join(report)

In [38]:
# Create analyzer instance
analyzer = ReligiousTextAnalyzer(df)

# Calculate similarities using your existing topic analysis
similarities = analyzer.calculate_topic_similarities(topics_by_document)

# Add environmental data
climate_data = pd.DataFrame({
    'annual_rainfall': [...],
    'avg_temperature': [...],
    'elevation': [...],
    'water_proximity': [...]
}, index=df['Name_of_Text'])

# Analyze correlations
correlations = analyzer.analyze_environmental_correlations(climate_data)

# Visualize results
analyzer.visualize_results()

# Generate report
print(analyzer.generate_report())

TypeError: float() argument must be a string or a real number, not 'ellipsis'