<a href="https://colab.research.google.com/github/alyazone/Quranic-Theme-Extraction-Visualization/blob/main/post_process_RAKE_mapping_themes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata

# Bind the secret to a name instead of leaving the call as the cell's last
# expression: a bare `userdata.get(...)` is echoed into the cell output, so the
# token would be saved (and published) together with the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')

'<redacted: Hugging Face token removed from saved output — revoke and rotate the exposed token>'

In [None]:
import pandas as pd
import ast
import re

# Function to clean and flatten RAKE keywords
def post_process_keywords(keywords):
    """Normalise one row of RAKE keywords into a clean list of phrases.

    Args:
        keywords: A list (or tuple) of keyword strings, or the string
            representation of such a list (e.g. a cell read back from CSV).

    Returns:
        A list of lower-cased phrases with punctuation removed. A phrase is
        dropped when, *after* cleaning, it is a stopword or has fewer than
        three characters. Non-string entries are skipped. ``["none"]`` is
        returned when the input cannot be parsed into a list or when nothing
        survives the cleaning.
    """
    if isinstance(keywords, str):
        try:
            keywords = ast.literal_eval(keywords)  # Convert string to list
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return ["none"]  # Fallback for malformed input

    # Anything that is still not a sequence of keywords (NaN, a bare string
    # literal, a number, ...) is treated as malformed input.
    if not isinstance(keywords, (list, tuple)):
        return ["none"]

    stopwords = {"and", "or", "the", "of", "in", "to", "is", "for", "with", "by", "a", "an"}

    cleaned_keywords = []
    for kw in keywords:
        if not isinstance(kw, str):
            continue
        cleaned = re.sub(r"[^\w\s]", "", kw).strip().lower()
        # Filter on the cleaned text: checking the raw text lets entries such
        # as "The." or "..." through as "the" / "".
        if len(cleaned) > 2 and cleaned not in stopwords:
            cleaned_keywords.append(cleaned)

    return cleaned_keywords if cleaned_keywords else ["none"]

In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for every similarity comparison in this notebook
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Vocabulary associated with each theme, keyed by theme name
theme_keywords = {
    "Forgiveness": [
        "forgiveness", "forgive", "forgiving", "forgave",
        "mercy", "repentance", "repent", "compassion", "pardon",
    ],
    "Truthfulness": [
        "truthfulness", "truthful", "truth", "honesty",
        "honest", "integrity", "sincerity", "veracity",
    ],
    "Patience": [
        "patience", "sabr", "endurance", "perseverance",
        "resilience", "tolerance", "serenity", "steadfastness",
    ],
    "Gratitude": [
        "gratitude", "thankfulness", "appreciation", "gratefulness",
        "thankful", "grateful", "ungrateful", "recognition",
        "acknowledgement", "obligation", "obligations", "indebtedness",
    ],
}

# Encode every theme's vocabulary a single time; the resulting tensors are
# looked up by theme name whenever keywords are compared against a theme.
theme_embeddings = dict(
    (name, model.encode(vocabulary, convert_to_tensor=True))
    for name, vocabulary in theme_keywords.items()
)

def map_keywords_to_themes(keywords, threshold=0.4):
    """Pick the theme that the largest number of keywords resemble.

    A keyword counts towards a theme when its highest cosine similarity to any
    of that theme's pre-encoded terms exceeds ``threshold``.

    Args:
        keywords: Iterable of keyword strings for one row. Non-string entries,
            blank strings and the ``"none"`` placeholder are ignored.
        threshold: Minimum cosine similarity (exclusive) for a keyword to
            count as matching a theme. Defaults to 0.4.

    Returns:
        The name of the theme with the most matching keywords, or ``"Other"``
        when no keyword matches any theme. Ties go to the theme that comes
        first in ``theme_embeddings``.
    """
    # Normalize to lowercase and drop entries that carry no information; the
    # "none" placeholder must not be scored as if it were a real keyword.
    candidates = [
        kw.lower()
        for kw in keywords
        if isinstance(kw, str) and kw.strip().lower() not in ("", "none")
    ]
    if not candidates:
        return "Other"

    # One batched encode per row instead of one model call per keyword.
    keyword_embeddings = model.encode(candidates, convert_to_tensor=True)

    theme_scores = {}
    for theme, embeddings in theme_embeddings.items():
        # Rows are keywords, columns are theme terms: take each keyword's best match.
        best_per_keyword = util.cos_sim(keyword_embeddings, embeddings).max(dim=1).values
        theme_scores[theme] = int((best_per_keyword > threshold).sum().item())

    if not theme_scores:
        return "Other"

    # Assign the theme with the highest score or fallback to 'Other'
    best_theme = max(theme_scores, key=theme_scores.get)
    return best_theme if theme_scores[best_theme] > 0 else "Other"


In [None]:
def filter_refined_keywords(keywords, theme, threshold=0.5):
    """
    Retains only the core keywords related to the theme based on semantic similarity.

    Args:
        keywords: Iterable of keyword strings for one row. Non-string entries,
            blank strings and the ``"none"`` placeholder are ignored.
        theme: Theme name used to look up the pre-encoded theme terms in
            ``theme_embeddings``. A name with no entry there (for example
            ``"Other"``) yields the fallback instead of raising ``KeyError``.
        threshold: Minimum cosine similarity (exclusive) between a keyword and
            its closest theme term for the keyword to be kept. Defaults to 0.5.

    Returns:
        The keywords that pass the threshold, in their original order, or
        ``["none"]`` when none do.
    """
    candidates = [
        kw
        for kw in keywords
        if isinstance(kw, str) and kw.strip().lower() not in ("", "none")
    ]
    if not candidates:
        return ["none"]

    # Reuse the tensors encoded once at module level rather than re-encoding
    # the theme vocabulary on every call.
    reference_embeddings = theme_embeddings.get(theme)
    if reference_embeddings is None:
        return ["none"]

    # One batched encode per row; rows of the similarity matrix are keywords.
    keyword_embeddings = model.encode(candidates, convert_to_tensor=True)
    best_per_keyword = util.cos_sim(keyword_embeddings, reference_embeddings).max(dim=1).values

    filtered_keywords = [
        kw
        for kw, max_similarity in zip(candidates, best_per_keyword.tolist())
        if max_similarity > threshold
    ]

    # Ensure every row has a valid value (fallback if no keywords match)
    return filtered_keywords if filtered_keywords else ["none"]

In [None]:
# Helper: parse one "Extracted Keywords" cell without aborting the whole file
def _parse_keyword_cell(cell):
    """Return the list encoded in a CSV cell, or the cell unchanged if it cannot be parsed."""
    if not isinstance(cell, str):
        return cell
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Leave the raw text in place; post_process_keywords turns it into ["none"].
        return cell


# Function to process a single file
def process_file(file, theme):
    """Load one keyword CSV and derive refined keywords, a mapped theme and filtered keywords.

    Args:
        file: Path of a CSV file that has an "Extracted Keywords" column.
        theme: Label stored in the "Original Theme" column of every row.

    Returns:
        The loaded DataFrame with the "Extracted Keywords" column parsed and
        the columns "Refined Keywords", "Mapped Theme", "Original Theme" and
        "Filtered Refined Keywords" added. Progress and debugging summaries
        are printed along the way.
    """
    df = pd.read_csv(file)

    # Convert keywords from string to list format; a malformed cell is kept
    # as-is so that a single bad row does not discard the entire file.
    df["Extracted Keywords"] = df["Extracted Keywords"].apply(_parse_keyword_cell)

    # Step 1: Post-process RAKE keywords
    print(f"Before processing: Extracted Keywords (first 5 rows): {df['Extracted Keywords'].head()}")
    df["Refined Keywords"] = df["Extracted Keywords"].apply(post_process_keywords)
    print(f"After processing: Refined Keywords (first 5 rows): {df['Refined Keywords'].head()}")

    # Step 2: Map refined keywords to themes
    df["Mapped Theme"] = df["Refined Keywords"].apply(map_keywords_to_themes)
    print(df["Mapped Theme"].value_counts())

    # Step 3: Assign the original theme (optional for comparison)
    df["Original Theme"] = theme

    # Apply filtering after refinement. Built as a plain list so an empty file
    # still yields a column; rows whose mapped theme has no vocabulary (such
    # as "Other") get the fallback value instead of raising a KeyError.
    df["Filtered Refined Keywords"] = [
        filter_refined_keywords(refined, mapped) if mapped in theme_keywords else ["none"]
        for refined, mapped in zip(df["Refined Keywords"], df["Mapped Theme"])
    ]

    # Debugging: Check rows with "none"
    none_count = (df["Filtered Refined Keywords"].apply(lambda x: x == ["none"])).sum()
    print(f"Rows with no relevant keywords (Filtered Refined Keywords == ['none']): {none_count}")

    other_rows = df[df["Mapped Theme"] == "Other"]
    print(f"Rows mapped to 'Other': {len(other_rows)}")
    print(other_rows.head())

    return df

# (CSV path, theme label) pairs to run through the pipeline
files = [
    ("ahmedali-forgiveness-themes.csv", "Forgiveness"),
    ("ahmedali-truthfulness-themes.csv", "Truthfulness"),
    ("ahmedali-patience-themes.csv", "Patience"),
    ("ahmedali-gratitude-themes.csv", "Gratitude"),
]

processed_data = []

# Run every file; a failure is reported and the remaining files still run
for csv_path, theme_label in files:
    try:
        result = process_file(csv_path, theme_label)
    except Exception as err:
        print(f"Error processing file {csv_path}: {err}")
        continue

    if result is None:
        print(f"File {csv_path} was skipped due to an error.")
        continue

    processed_data.append(result)

# Debugging: row count of each processed file
for position, frame in enumerate(processed_data, start=1):
    print(f"Processed File {position}: Rows = {len(frame)}")

# Debugging: combined row count across all processed files
total_rows = sum(map(len, processed_data))
print(f"Total rows after merging all files: {total_rows}")

if not processed_data:
    print("No data processed.")
else:
    # Stack the per-file frames into one dataset
    final_df = pd.concat(processed_data, ignore_index=True)

    # Report, then drop, the rows that could not be mapped to a theme
    other_count = int((final_df["Mapped Theme"] == "Other").sum())
    print(f"Rows with 'Other' before filtering: {other_count}")
    final_df = final_df.loc[final_df["Mapped Theme"] != "Other"]
    print(f"Rows after filtering 'Other': {len(final_df)}")

    # Persist the result
    final_df.to_csv("filtered-fine-tuning-dataset.csv", index=False)
    print("Final dataset saved as 'filtered-fine-tuning-dataset.csv'")

Before processing: Extracted Keywords (first 5 rows): 0            [name, merciful, ever, benevolent, allah]
1                         [merciful, ever, beneficent]
2    [lord sent commands, turned towards, kind, ind...
3                      [pardoned, may, grateful, even]
4    [softened towards, moses said, lord .", lord, ...
Name: Extracted Keywords, dtype: object
After processing: Refined Keywords (first 5 rows): 0            [name, merciful, ever, benevolent, allah]
1                         [merciful, ever, beneficent]
2    [lord sent commands, turned towards, kind, ind...
3                      [pardoned, may, grateful, even]
4    [softened towards, moses said, lord, lord, tur...
Name: Refined Keywords, dtype: object
Mapped Theme
Forgiveness     198
Truthfulness     99
Gratitude        40
Patience          1
Name: count, dtype: int64
Rows with no relevant keywords (Filtered Refined Keywords == ['none']): 70
Rows mapped to 'Other': 0
Empty DataFrame
Columns: [Chapter Number, Verse