<a href="https://colab.research.google.com/github/alyazone/Quranic-Theme-Extraction-Visualization/blob/main/ahmedali_refined_extractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch



In [None]:
import pandas as pd
import torch
import re
from transformers import pipeline

Step 1: Clean and Refine Keywords

In [None]:
# Define function to clean RAKE keywords
def clean_keywords(keywords):
    """
    Clean RAKE-extracted keywords and split them into a list.

    Every character that is not a word character, whitespace or a comma is
    removed, the text is lowercased, and the result is split on commas.
    Each keyword is then stripped of surrounding whitespace and empty
    entries (from leading, trailing or doubled commas) are dropped, so the
    items can be compared directly against lookup keys.

    Args:
        keywords: Comma-separated keyword string. Non-string values (for
            example a missing value read from a CSV cell) are treated as
            "no keywords".

    Returns:
        list: Lowercased, stripped, non-empty keywords in their original
        order.
    """
    if not isinstance(keywords, str):
        # Nothing to clean; avoids a TypeError from re.sub on NaN/None.
        return []
    keywords = re.sub(r'[^\w\s,]', '', keywords)  # Remove special characters
    parts = (part.strip() for part in keywords.lower().split(","))
    return [part for part in parts if part]

# Define function to refine keywords using LLM
# Loaded pipelines keyed by model name, so a model is built once per process
# instead of once per call.
_GENERATOR_CACHE = {}


def _get_generator(model_name="EleutherAI/gpt-neo-2.7B"):
    """Return the text-generation pipeline for model_name, loading it on first use."""
    if model_name not in _GENERATOR_CACHE:
        _GENERATOR_CACHE[model_name] = pipeline("text-generation", model=model_name)
    return _GENERATOR_CACHE[model_name]


def refine_keywords_with_llm(keywords, theme):
    """
    Refine keywords using an LLM to remove irrelevant terms
    and expand with synonyms related to the theme.

    Args:
        keywords: The extracted keywords; interpolated into the prompt as-is.
        theme: Name of the focused theme; interpolated into the prompt.

    Returns:
        The "generated_text" field of the first result returned by the
        text-generation pipeline.
        NOTE(review): with the pipeline's default settings this text appears
        to contain the prompt followed by the continuation - confirm whether
        callers want only the continuation.
    """
    # Reuse the cached pipeline: constructing it loads the whole model, which
    # is far too expensive to repeat for every row.
    generator = _get_generator()
    prompt = (
        f"Here are some extracted keywords: {keywords}. "
        f"The focused theme is '{theme}'. "
        f"Please remove irrelevant keywords and suggest relevant ones."
    )
    refined = generator(
        prompt,
        max_new_tokens=50,  # Limit the number of new tokens generated
        truncation=True,  # Explicitly enable truncation of the input if necessary
        pad_token_id=generator.tokenizer.eos_token_id  # Pad with the end-of-sequence token id
    )
    return refined[0]["generated_text"]

# Define mapping of keywords to themes
# Capitalised variants are kept for callers that look keys up directly;
# assign_themes itself also falls back to a lowercase lookup.
keyword_to_theme = {
    "truth": "Truthfulness",
    "truthfulness": "Truthfulness",
    "truthful": "Truthfulness",
    "Truthfulness": "Truthfulness",
    "honesty": "Truthfulness",
    "honest": "Truthfulness",
    "sincerity": "Truthfulness",
    "sincere": "Truthfulness",
    "integrity": "Truthfulness",
    "Sincerity": "Truthfulness",
    "candor": "Truthfulness",
    "veracity": "Truthfulness",
    "forgiveness": "Forgiveness",
    "mercy": "Forgiveness",
    "Forgiving": "Forgiveness",
    "forgive": "Forgiveness",
    "forgiving": "Forgiveness",
    "forgave": "Forgiveness",
    "repent": "Forgiveness",
    "repentance": "Forgiveness",
    "repentence": "Forgiveness",  # misspelling kept for backward compatibility
    "merciful": "Forgiveness",
    "Merciful": "Forgiveness",
    "pardon": "Forgiveness",
    "compassion": "Forgiveness",
    "Compassion": "Forgiveness",
    "patience": "Patience",
    "Patience": "Patience",
    "sabr": "Patience",
    "tolerance": "Patience",
    "Tolerance": "Patience",
    "endurance": "Patience",
    "resilience": "Patience",
    "perseverance": "Patience",
    "patient": "Patience",
    "persevere": "Patience",
    "serenity": "Patience",
    "steadfastness": "Patience",
    "gratitude": "Gratitude",
    "Gratitude": "Gratitude",
    "appreciation": "Gratitude",
    "Appreciation": "Gratitude",
    "appreciate": "Gratitude",
    "Thankful": "Gratitude",
    "thankful": "Gratitude",
    "thankfulness": "Gratitude",
    "Grateful": "Gratitude",
    "grateful": "Gratitude",
    "gratefulness": "Gratitude",
    "recognition": "Gratitude",
    "acknowledgement": "Gratitude",
    "obligation": "Gratitude",
    "indebtedness": "Gratitude",
    "Obligation": "Gratitude"
}

# Function to assign themes based on refined keywords
def assign_themes(refined_keywords):
    """
    Map refined keywords to the themes listed in ``keyword_to_theme``.

    Args:
        refined_keywords: Either a single string (for example free text
            produced by an LLM step) or an iterable of keyword strings.
            Non-iterable values such as None are treated as "no keywords".

    Returns:
        str: The matched theme names, sorted alphabetically and joined with
        ", ", or "Unknown" when nothing matches.
    """
    if isinstance(refined_keywords, str):
        # Iterating a str yields single characters, which can never match a
        # keyword; treat the whole text as one entry instead.
        entries = [refined_keywords]
    else:
        try:
            entries = list(refined_keywords)
        except TypeError:
            entries = []

    themes = set()
    for entry in entries:
        if not isinstance(entry, str):
            continue
        entry = entry.strip()  # Clean each keyword
        # Try the entry as a whole first, then every word inside it, so
        # phrases and free text can still match single-word keys.
        candidates = [entry, *re.findall(r"[^\W\d_]+", entry)]
        for candidate in candidates:
            theme = keyword_to_theme.get(candidate) or keyword_to_theme.get(candidate.lower())
            if theme:
                themes.add(theme)
    # Sort so the output does not depend on set iteration order.
    return ", ".join(sorted(themes)) if themes else "Unknown"


Step 2: Process Each File

In [None]:
# List of files with their corresponding themes
files = [
    ("ahmedali-forgiveness-themes.csv", "Forgiveness"),
    ("ahmedali-truthfulness-themes.csv", "Truthfulness"),
    # NOTE(review): the doubled ".csv.csv" extension looks like a typo but may
    # match the file name as uploaded - confirm before renaming.
    ("ahmedali-patience-themes.csv.csv", "Patience"),
    ("ahmedali-gratitude-themes.csv", "Gratitude")
]

# One processed DataFrame per input file, in the order listed above
processed_data = []

# Process each file
for file, theme in files:
    # Load the dataset; it must provide an "Extracted Keywords" column
    df = pd.read_csv(file)

    # Step 1: Clean the Extracted Keywords
    df["Cleaned Keywords"] = df["Extracted Keywords"].apply(clean_keywords)

    # Step 2: Refine Keywords with LLM
    # Only one column is read, so map over that column rather than applying
    # row-wise over the whole frame; this also keeps the result a Series
    # when the file has no rows.
    df["Refined Keywords"] = df["Cleaned Keywords"].apply(
        lambda cleaned: refine_keywords_with_llm(cleaned, theme)
    )

    # Step 3: Assign Themes
    df["Themes"] = df["Refined Keywords"].apply(assign_themes)

    # Append processed DataFrame to the list
    processed_data.append(df)


Step 3: Merge and Save Final Dataset

In [None]:
# Stack the per-file frames into one table with a fresh index, then discard
# the working columns that were only needed to derive the themes.
intermediate_columns = ["Cleaned Keywords", "Refined Keywords"]
final_df = pd.concat(processed_data, ignore_index=True).drop(
    columns=intermediate_columns
)

# Write the result to disk without the index column
final_df.to_csv("final_dataset.csv", index=False)

# Report where the data went and preview the first rows
print("Final dataset saved as 'final_dataset.csv'")
print(final_df.head())
