In [1]:
import numpy as np
import pandas as pd
import ast

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import plotly.express as px
import plotly.graph_objects as go


In [2]:
# Read the two embedding exports (the file names suggest chapters 1-9 and
# 10-18) and stack them row-wise into a single frame.
# NOTE: concat keeps each file's own row labels, so index values repeat
# across the two halves of `df`.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("gita_embeddings_ch01-09.csv", "gita_embeddings_ch10-18.csv")
)
df = pd.concat([df1, df2], axis=0)
df

Unnamed: 0,id,embedding,chapter,verse,speaker,sanskrit,translation,youtube_link,download_link,metadata
0,1,"[0.015645906,0.00043402964,-0.0025344447,0.034...",1,1,Unknown,धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः| मा...,"Dhritarashtra said, ""What did my people and th...",https://www.youtube.com/watch?v=lW1kilajTUs,/static/gita_portal_data/pdfs/1-1.pdf,"{'source': 'gita_collection', 'filename': 'git..."
1,2,"[-0.001357,0.028066132,0.00087169994,0.0150144...",1,2,Unknown,दृष्ट्वा तु पाण्डवानीकं व्यूढं दुर्योधनस्तदा| ...,Sanjaya said: Having seen the army of the Pand...,https://www.youtube.com/watch?v=GUcWC2eC1Mc,/static/gita_portal_data/pdfs/1-2.pdf,"{'source': 'gita_collection', 'filename': 'git..."
2,3,"[-0.012270631,0.028635934,-0.0051919357,0.0369...",1,3,Unknown,पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम्| व्...,"Behold, O Teacher! This mighty army of the son...",https://www.youtube.com/watch?v=AfwJNRcLxLk,/static/gita_portal_data/pdfs/1-3.pdf,"{'source': 'gita_collection', 'filename': 'git..."
3,4,"[-0.01089979,0.018903596,-0.012380992,0.043891...",1,4,Unknown,अत्र शूरा महेष्वासा भीमार्जुनसमा युधि| युयुधान...,"Here are heroes, mighty archers, equal in batt...",https://www.youtube.com/watch?v=h9EERqjLnXY,/static/gita_portal_data/pdfs/1-4.pdf,"{'source': 'gita_collection', 'filename': 'git..."
4,5,"[0.016647292,0.032425787,-0.00975296,0.0399647...",1,5,Unknown,धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवान्| पुरु...,"Dhrishtaketu, Chekitana, the valiant king of K...",https://www.youtube.com/watch?v=gxhVsuCTm6s,/static/gita_portal_data/pdfs/1-5.pdf,"{'source': 'gita_collection', 'filename': 'git..."
...,...,...,...,...,...,...,...,...,...,...
323,696,"[0.0020957803,0.00066403055,-0.008532095,0.048...",18,74,Unknown,इत्यहं वासुदेवस्य पार्थस्य च महात्मनः| संवादमि...,"Sanjaya said, Thus, I have heard this wonderfu...",https://www.youtube.com/watch?v=D-r7ap8dscs,/static/gita_portal_data/pdfs/18-74.pdf,"{'source': 'gita_collection', 'filename': 'git..."
324,697,"[0.0051827244,0.024626028,-0.0076155635,0.0130...",18,75,Unknown,व्यासप्रसादाच्छ्रुतवानेतद्गुह्यमहं परम्| योगं ...,"Through the grace of Vyasa, I have heard this ...",https://www.youtube.com/watch?v=hrxf3GTgYYE,/static/gita_portal_data/pdfs/18-75.pdf,"{'source': 'gita_collection', 'filename': 'git..."
325,698,"[0.009442715,0.030365312,-0.017937316,0.027315...",18,76,Unknown,राजन्संस्मृत्य संस्मृत्य संवादमिममद्भुतम्| केश...,"O King, remembering this wonderful and holy di...",https://www.youtube.com/watch?v=SsgKLgAmFzc,/static/gita_portal_data/pdfs/18-76.pdf,"{'source': 'gita_collection', 'filename': 'git..."
326,699,"[-0.016666906,0.018928178,-0.014977602,0.01965...",18,77,Unknown,तच्च संस्मृत्य संस्मृत्य रूपमत्यद्भुतं हरेः| व...,"And, remembering again and again that most won...",https://www.youtube.com/watch?v=EQoCaIe_8T0,/static/gita_portal_data/pdfs/18-77.pdf,"{'source': 'gita_collection', 'filename': 'git..."


In [3]:

# 1. PREPARE DATA
# Collapse each chapter into one document: its verse translations joined
# by single spaces, one row per chapter.
chapter_docs = (
    df.groupby('chapter')['translation']
      .apply(lambda verses: ' '.join(verses))
      .reset_index()
)

# 2. EXTRACT KEYWORDS
# TF-IDF over the per-chapter documents, with English stop words removed and
# the vocabulary capped via max_features.
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(chapter_docs['translation'])
# Vocabulary terms as an array, so they can be fancy-indexed by column position.
feature_names = np.asarray(tfidf.get_feature_names_out())

# 3. HELPER FUNCTION
def get_top_n_words(chapter_str, n=20):
    """Return the set of up to ``n`` highest-weighted TF-IDF terms for a chapter.

    Reads the module-level ``chapter_docs``, ``tfidf_matrix`` and
    ``feature_names`` built in the keyword-extraction step.

    Parameters
    ----------
    chapter_str : str or int
        Chapter identifier; compared against ``chapter_docs['chapter']`` after
        both sides are converted to ``str``.
    n : int, default 20
        Maximum number of keywords to return.

    Returns
    -------
    set
        The top terms. Empty when the chapter is not found or ``n <= 0``.
        May hold fewer than ``n`` items, because terms whose TF-IDF weight is
        zero (i.e. that do not score for this chapter) are never reported.
    """
    # Guard: with n == 0 the slice [-0:] below would select the *whole*
    # vocabulary instead of nothing.
    if n <= 0:
        return set()

    idx = chapter_docs.index[chapter_docs['chapter'].astype(str) == str(chapter_str)].tolist()
    if not idx:
        return set()

    # Dense 1-D weight vector for this chapter's row of the TF-IDF matrix.
    row_vector = tfidf_matrix[idx[0]].toarray().flatten()

    # Column positions of the n largest weights, best first.
    top_indices = row_vector.argsort()[-n:][::-1]
    # Drop padding: zero-weight terms only make the cut when the chapter has
    # fewer than n scored terms, and they are not keywords of this chapter.
    top_indices = top_indices[row_vector[top_indices] > 0]
    return set(feature_names[top_indices])

# 4. BUILD THE DATAFRAME
# For every chapter, compare its top-TOP_N keyword set with the other chapters:
#   * Distinctness  - share of its keywords that appear in no other chapter's set
#   * Closest_Match - the chapter with the largest keyword overlap
TOP_N = 20  # keywords per chapter; also the denominator of both percentages
all_chapters = [str(i) for i in range(1, 19)]

# Compute each chapter's keyword set exactly once. Every pairwise comparison
# below reuses these sets instead of re-sorting the TF-IDF row each time.
chapter_keywords = {chap: get_top_n_words(chap, n=TOP_N) for chap in all_chapters}

results_list = []
for chap in all_chapters:
    keywords_target = chapter_keywords[chap]

    # --- Metric A: Distinctness ---
    # Union of the keyword sets of all *other* chapters.
    keywords_all_others = set().union(
        *(chapter_keywords[other] for other in all_chapters if other != chap)
    )
    unique_words = keywords_target - keywords_all_others
    distinct_score = (len(unique_words) / TOP_N) * 100

    # --- Metric B: Closest Match ---
    # Strict '>' keeps the first chapter (in numeric order) on ties.
    best_match_chap = None
    max_overlap_count = -1
    for other in all_chapters:
        if other == chap:
            continue
        current_overlap = len(keywords_target & chapter_keywords[other])
        if current_overlap > max_overlap_count:
            max_overlap_count = current_overlap
            best_match_chap = other

    overlap_percent = (max_overlap_count / TOP_N) * 100

    # Store row data. Keywords are sorted because set iteration order for
    # strings changes between interpreter runs, which made this column
    # non-reproducible.
    results_list.append({
        'Chapter': f"Ch {chap}",
        'Distinctness': f"{distinct_score:.0f}%",
        'Closest_Match': f"Ch {best_match_chap} ({overlap_percent:.0f}%)",
        'Unique_Keywords': ', '.join(sorted(unique_words)) if unique_words else "None"
    })

# Convert list of dicts to DataFrame
df_insights = pd.DataFrame(results_list)

# Display
df_insights

# Optional: Save to CSV
# df_insights.to_csv("gita_chapter_insights.csv", index=False)

Unnamed: 0,Chapter,Distinctness,Closest_Match,Unique_Keywords
0,Ch 1,80%,Ch 2 (15%),"kill, army, armies, son, fathers, chariot, pan..."
1,Ch 2,45%,Ch 3 (20%),"pleasure, wisdom, slain, casts, away, peace, g..."
2,Ch 3,40%,Ch 2 (20%),"perform, engage, sinful, follow, path, envelop..."
3,Ch 4,55%,Ch 5 (30%),"success, breath, inaction, taught, offer, shal..."
4,Ch 5,45%,Ch 4 (30%),"yogis, reached, moksha, freedom, truly, acting..."
5,Ch 6,40%,Ch 5 (20%),"restless, attained, bliss, difficult, let, con..."
6,Ch 7,65%,Ch 9 (20%),"illusion, steadfast, composed, men, higher, vi..."
7,Ch 8,75%,Ch 4 (10%),"night, goes, yugas, time, times, attainable, l..."
8,Ch 9,55%,Ch 10 (25%),"devotion, manes, forth, vain, goal, kalpa, com..."
9,Ch 10,60%,Ch 9 (25%),"prosperous, prosperity, things, fame, sages, g..."


In [4]:
# Turn the stringified vectors read from CSV into Python lists.
# Values that are no longer strings are passed through unchanged, so re-running
# this cell is safe (ast.literal_eval raises on an already-parsed list).
df["embedding"] = df["embedding"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
embeddings = df["embedding"].to_list()

In [5]:
# ==========================================
# 2. PERFORM PCA
# ==========================================
# Initialize PCA to reduce to 2 dimensions.
# random_state is pinned because the default svd_solver='auto' may select the
# randomized SVD solver depending on the input size, which would make the
# projection vary from run to run. 42 matches the seed used for t-SNE and
# K-Means elsewhere in this notebook.
pca = PCA(n_components=2, random_state=42)

# Fit and transform the embeddings (one 2-D point per verse)
components = pca.fit_transform(embeddings)


In [6]:
# ==========================================
# 3. PREPARE DATAFRAME FOR PLOTTING
# ==========================================
# One row per verse: its two PCA coordinates plus the chapter/verse labels.
# The labels are attached as plain lists, i.e. by position rather than by
# index label, so they line up with the rows of `components`.
df_pca = pd.DataFrame(components, columns=['PC1', 'PC2']).assign(
    chapter=df["chapter"].to_list(),
    verse=df["verse"].to_list(),
)

In [7]:
# ---------------------------------------------------------
# 1. CALCULATE CENTROID AND VARIANCE
# ---------------------------------------------------------
# Per-chapter mean (centroid) and variance of each principal component
chapter_stats = df_pca.groupby('chapter')[['PC1', 'PC2']].agg(['mean', 'var'])

# Collapse the two-level column labels into flat names,
# e.g. ('PC1', 'mean') -> 'PC1_mean'
chapter_stats.columns = [f"{axis}_{stat}" for axis, stat in chapter_stats.columns]
chapter_stats = chapter_stats.reset_index()

# Standard deviation (square root of the variance), used as the
# half-width/half-height of the shapes drawn around each centroid
chapter_stats = chapter_stats.assign(
    PC1_std=np.sqrt(chapter_stats['PC1_var']),
    PC2_std=np.sqrt(chapter_stats['PC2_var']),
)

# ---------------------------------------------------------
# 2. PLOT
# ---------------------------------------------------------
# Create the base scatter plot of all verses.
# Chapter is a label, not a quantity: it is cast to string (on a temporary
# copy, so df_pca itself keeps its original dtype) so that Plotly assigns one
# discrete colour per chapter instead of a continuous colour scale. This
# mirrors what the t-SNE plot in this notebook does.
fig = px.scatter(
    df_pca.assign(chapter=df_pca['chapter'].astype(str)),
    x='PC1',
    y='PC2',
    color='chapter',
    hover_name='chapter',
    title='PCA with Chapter Centroids and Variance',
    template='plotly_white',
    opacity=0.4  # Make the points slightly transparent to see centroids better
)

# Add the CENTROIDS as a new trace (Large 'X' markers)
fig.add_trace(
    go.Scatter(
        x=chapter_stats['PC1_mean'],
        y=chapter_stats['PC2_mean'],
        mode='markers',
        marker=dict(symbol='x', size=12, color='black', line=dict(width=2)),
        name='Centroids',
        text=chapter_stats['chapter'],
        hoverinfo='text'
    )
)

# Add VARIANCE shapes (1 standard deviation).
# Each shape is inscribed in the box centroid +/- std along each axis, so it
# is an ellipse whenever the PC1 and PC2 spreads differ.
for _, row in chapter_stats.iterrows():
    fig.add_shape(
        type="circle",
        xref="x", yref="y",
        x0=row['PC1_mean'] - row['PC1_std'],
        y0=row['PC2_mean'] - row['PC2_std'],
        x1=row['PC1_mean'] + row['PC1_std'],
        y1=row['PC2_mean'] + row['PC2_std'],
        line_color="black",
        opacity=0.3
    )

fig.show()

In [8]:


# 1. Stack the embedding vectors into a numpy array
matrix = np.array(embeddings)

# 2. Project to two dimensions with t-SNE
# perplexity: roughly the number of neighbours considered per point
# random_state: fixed so the layout is repeatable
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(matrix)

# 3. Plotting frame: a copy of df with the two t-SNE coordinates appended.
# chapter is cast to string so Plotly treats it as a discrete category.
df_tsne = df.assign(
    x=tsne_results[:, 0],
    y=tsne_results[:, 1],
    chapter=lambda frame: frame['chapter'].astype(str),
)


# ---------------------------------------------------------
# 1. CALCULATE CENTROID AND VARIANCE FOR t-SNE
# ---------------------------------------------------------
# Per-chapter mean (centroid) and variance of the t-SNE coordinates,
# which live in the 'x' and 'y' columns of df_tsne
chapter_stats_tsne = df_tsne.groupby('chapter')[['x', 'y']].agg(['mean', 'var'])

# Collapse the two-level column labels into flat names, e.g. ('x', 'mean') -> 'x_mean'
chapter_stats_tsne.columns = [f"{axis}_{stat}" for axis, stat in chapter_stats_tsne.columns]
chapter_stats_tsne = chapter_stats_tsne.reset_index()

# Standard deviation (square root of the variance), used to size the
# shapes drawn around each centroid
chapter_stats_tsne = chapter_stats_tsne.assign(
    x_std=np.sqrt(chapter_stats_tsne['x_var']),
    y_std=np.sqrt(chapter_stats_tsne['y_var']),
)


# ---------------------------------------------------------
# 2. PLOT
# ---------------------------------------------------------
# Base layer: every verse as a faint point, coloured by chapter
scatter_kwargs = dict(
    x='x',
    y='y',
    color='chapter',
    hover_name='chapter',
    # hover_data=['verse_id'],  # enable if such a column exists
    title='t-SNE with Chapter Centroids and Variance',
    template='plotly_white',
    opacity=0.3,  # low opacity so the centroid markers stand out
)
fig = px.scatter(df_tsne, **scatter_kwargs)

# Centroid layer: one large black 'x' per chapter, labelled on hover
centroid_trace = go.Scatter(
    x=chapter_stats_tsne['x_mean'],
    y=chapter_stats_tsne['y_mean'],
    mode='markers',
    marker=dict(symbol='x', size=12, color='black', line=dict(width=2)),
    name='Centroids',
    text=chapter_stats_tsne['chapter'],
    hoverinfo='text',
)
fig.add_trace(centroid_trace)

# Spread layer: a shape spanning centroid +/- 1 standard deviation per axis
for i, row in chapter_stats_tsne.iterrows():
    cx, cy = row['x_mean'], row['y_mean']
    rx, ry = row['x_std'], row['y_std']
    fig.add_shape(
        type="circle",
        xref="x", yref="y",
        x0=cx - rx, y0=cy - ry,
        x1=cx + rx, y1=cy + ry,
        line_color="black",
        opacity=0.3,
    )

fig.show()

In [9]:
# 1. Calculate the "Center of Gravity" (Centroid) for each Chapter
# We group by chapter and take the mean of the PCA coordinates
chapter_centroids = df_pca.groupby('chapter')[['PC1', 'PC2']].mean().reset_index()

# 2. Perform K-Means Clustering on the chapter centroids
# n_clusters is the single source of truth for the number of groups; the
# printed header and the plot title below are both derived from it.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
chapter_centroids['cluster_label'] = kmeans.fit_predict(chapter_centroids[['PC1', 'PC2']])

# 3. Print the Groups
# This tells you exactly which chapters belong together
print(f"Suggested Chapter Groupings ({n_clusters} Clusters):")
for cluster in sorted(chapter_centroids['cluster_label'].unique()):
    # Get the list of chapters in this cluster
    in_cluster = chapter_centroids['cluster_label'] == cluster
    chapters = chapter_centroids.loc[in_cluster, 'chapter'].tolist()

    # Sort numerically when the labels convert to int; otherwise fall back to
    # plain ordering. Only conversion failures are caught, so unrelated errors
    # (and KeyboardInterrupt) are not swallowed.
    try:
        chapters = sorted(chapters, key=int)
    except (TypeError, ValueError):
        chapters = sorted(chapters)

    print(f"Group {cluster + 1}: Chapters {chapters}")

# 4. Map the new groups back to your main dataframe
# Create a dictionary: {Chapter_Name : Cluster_Label}
cluster_map = dict(zip(chapter_centroids['chapter'], chapter_centroids['cluster_label']))

# Create a new column 'cluster_group' for plotting
# We add 1 so the groups are named "Group 1".."Group n_clusters" rather than
# starting at 0
df_pca['cluster_group'] = df_pca['chapter'].map(cluster_map).apply(lambda x: f"Group {x+1}")

# 5. Visualize the Segregated Clusters
fig = px.scatter(
    df_pca,
    x='PC1',
    y='PC2',
    color='cluster_group',      # Color by the NEW group, not the old chapter
    hover_name='chapter',       # Hover still shows the specific chapter
    symbol='cluster_group',     # Optional: use different shapes for different groups
    title=f'PCA Map: Chapters Grouped by Centroid Proximity ({n_clusters} Clusters)',
    labels={'cluster_group': 'Semantic Cluster'},
    template='plotly_white',
    opacity=0.7
)

# Make markers readable
fig.update_traces(marker=dict(size=8))
fig.show()

Suggested Chapter Groupings (6 Clusters):
Group 1: Chapters [2, 3, 4, 14, 16, 17, 18]
Group 2: Chapters [7, 8, 9, 13, 15]
Group 3: Chapters [1]
Group 4: Chapters [10, 11]
Group 5: Chapters [5, 6, 12]


In [10]:
# 1. Calculate the Centroid for each Chapter in t-SNE space
# Note: We use 'x' and 'y' columns this time
chapter_centroids_tsne = df_tsne.groupby('chapter')[['x', 'y']].mean().reset_index()

# 2. Perform K-Means Clustering on the chapter centroids
# n_clusters controls how many groups are formed and feeds the plot title below
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
chapter_centroids_tsne['cluster_label'] = kmeans.fit_predict(chapter_centroids_tsne[['x', 'y']])

# 3. Print the Groups
print("Suggested Chapter Groupings (t-SNE Based):")
for cluster in sorted(chapter_centroids_tsne['cluster_label'].unique()):
    # Get chapters in this cluster
    in_cluster = chapter_centroids_tsne['cluster_label'] == cluster
    chapters = chapter_centroids_tsne.loc[in_cluster, 'chapter'].tolist()

    # Sort numerically when the labels convert to int; otherwise fall back to
    # plain ordering. Only conversion failures are caught, so unrelated errors
    # (and KeyboardInterrupt) are not swallowed.
    try:
        chapters = sorted(chapters, key=int)
    except (TypeError, ValueError):
        chapters = sorted(chapters)

    print(f"Group {cluster + 1}: Chapters {chapters}")

# 4. Map the groups back to the main dataframe
# Labels are shifted by 1 so the groups read "Group 1", "Group 2", ...
cluster_map_tsne = dict(zip(chapter_centroids_tsne['chapter'], chapter_centroids_tsne['cluster_label']))
df_tsne['cluster_group'] = df_tsne['chapter'].map(cluster_map_tsne).apply(lambda x: f"Group {x+1}")

# 5. Visualize
fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    color='cluster_group',
    hover_name='chapter',
    symbol='cluster_group',
    title=f't-SNE Map: Chapters Grouped by Proximity ({n_clusters} Clusters)',
    labels={'cluster_group': 'Semantic Neighborhood'},
    template='plotly_white',
    opacity=0.7
)

fig.update_traces(marker=dict(size=8))
fig.show()

Suggested Chapter Groupings (t-SNE Based):
Group 1: Chapters ['3', '5', '6', '12', '14', '18']
Group 2: Chapters ['1', '10', '11']
Group 3: Chapters ['4', '7', '9', '16']
Group 4: Chapters ['2', '8', '13', '15']
Group 5: Chapters ['17']
