<a href="https://colab.research.google.com/github/alyazone/Quranic-Theme-Extraction-Visualization/blob/main/ahmedali_refined_extractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata
userdata.get('HF_TOKEN')

'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'

In [2]:
import pandas as pd
import torch
import re
from transformers import pipeline
import time

In [3]:
# Function to clean RAKE keywords
def clean_keywords(keywords):
    if isinstance(keywords, list):
        # Flatten the list
        keywords = [item for sublist in keywords for item in (sublist if isinstance(sublist, list) else [sublist])]
        keywords = " ".join(keywords)  # Convert to space-separated string
    elif isinstance(keywords, str):
        keywords = keywords
    else:
        keywords = str(keywords)

    keywords = re.sub(r"[^\w\s]", "", keywords)  # Remove special characters
    keywords = keywords.strip().lower()  # Convert to lowercase and strip spaces
    return keywords

print("Cleaned Keywords:", df["Cleaned Keywords"].head())

# Function to refine keywords using LLM
def refine_keywords_with_llm(keywords_list, theme, batch_size=10, max_input_length=512):
    generator = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B", device=0)
    refined_results = []
    for i in range(0, len(keywords_list), batch_size):

      print("Keywords List (before processing):", keywords_list[:5])
      print("Type of each item:", [type(k) for k in keywords_list[:5]])

        for keywords in keywords_list[i:i + batch_size]:
            print("Keywords (current):", keywords, "Type:", type(keywords))
            batch_prompts.append(
                f"Here are some extracted keywords: {str(keywords)[:max_input_length]}. "
                f"The focused theme is '{theme}'. Please remove irrelevant keywords and suggest relevant ones."
            )
        try:
            results = generator(
                batch_prompts,
                max_new_tokens=50,
                truncation=True,
                pad_token_id=generator.tokenizer.eos_token_id,
            )
            refined_results.extend([result["generated_text"] for result in results])
        except RuntimeError as e:
            print(f"Batch failed due to memory error: {e}")
            refined_results.extend(["Error processing"] * len(batch_prompts))
    return refined_results

# Define mapping of keywords to themes
keyword_to_theme = {
    "truth": "Truthfulness",
    "truthfulness": "Truthfulness",
    "truthful": "Truthfulness",
    "Truthfulness": "Truthfulness",
    "honesty": "Truthfulness",
    "honest": "Truthfulness",
    "sincerity": "Truthfulness",
    "sincere": "Truthfulness",
    "sincerity": "Truthfulness",
    "integrity": "Truthfulness",
    "Sincerity": "Truthfulness",
    "candor": "Truthfulness",
    "veracity": "Truthfulness",
    "forgiveness": "Forgiveness",
    "mercy": "Forgiveness",
    "Forgiving": "Forgiveness",
    "forgive": "Forgiveness",
    "forgiving": "Forgiveness",
    "forgave": "Forgiveness",
    "repent": "Forgiveness",
    "repentence": "Forgiveness",
    "merciful": "Forgiveness",
    "Merciful": "Forgiveness",
    "pardon": "Forgiveness",
    "compassion": "Forgiveness",
    "Compassion": "Forgiveness",
    "patience": "Patience",
    "Patience": "Patience",
    "sabr": "Patience",
    "tolerance": "Patience",
    "Tolerance": "Patience",
    "endurance": "Patience",
    "resilience": "Patience",
    "perseverance": "Patience",
    "patient": "Patience",
    "persevere": "Patience",
    "serenity": "Patience",
    "steadfastness": "Patience",
    "gratitude": "Gratitude",
    "Gratitude": "Gratitude",
    "appreciation": "Gratitude",
    "Appreciation": "Gratitude",
    "appreciate": "Gratitude",
    "Thankful": "Gratitude",
    "thankful": "Gratitude",
    "thankfulness": "Gratitude",
    "Grateful": "Gratitude",
    "grateful": "Gratitude",
    "gratefulness": "Gratitude",
    "recognition": "Gratitude",
    "acknowledgement": "Gratitude",
    "obligation": "Gratitude",
    "indebtedness": "Gratitude",
    "Obligation": "Gratitude"
}

# Function to assign themes
def assign_themes(refined_keywords):
    themes = set()
    for keyword in refined_keywords.split(","):
        keyword = keyword.strip()
        if keyword in keyword_to_theme:
            themes.add(keyword_to_theme[keyword])
    return ", ".join(themes) if themes else "Unknown"



In [4]:
# List of files with their corresponding themes
files = [
    ("ahmedali-forgiveness-themes.csv", "Forgiveness"),
    ("ahmedali-truthfulness-themes.csv", "Truthfulness"),
    ("ahmedali-patience-themes.csv", "Patience"),
    ("ahmedali-gratitude-themes.csv", "Gratitude")
]

processed_data = []

start_time = time.time()

for file, theme in files:
    try:
        print(f"Processing file: {file} for theme: {theme}")
        df = pd.read_csv(file)

        # Check the column exists
        if "Extracted Keywords" not in df.columns:
            raise KeyError(f"File {file} does not have 'Extracted Keywords' column.")

        # Debug: Inspect raw keywords
        print("Raw Extracted Keywords:", df["Extracted Keywords"].head())

        # Step 1: Clean the Extracted Keywords
        df["Cleaned Keywords"] = df["Extracted Keywords"].apply(
            lambda x: clean_keywords(x) if isinstance(x, list) else str(x)
        )
        print("Cleaned Keywords (after processing):", df["Cleaned Keywords"].head())

        # Step 2: Refine Keywords with LLM
        chunk_size = 100
        chunks = [df["Cleaned Keywords"].iloc[i:i + chunk_size].tolist() for i in range(0, len(df), chunk_size)]
        refined_keywords = []
        for chunk in chunks:
            refined_keywords.extend(refine_keywords_with_llm(chunk, theme))
        df["Refined Keywords"] = refined_keywords
        print("Refined Keywords:", df["Refined Keywords"].head())

        # Step 3: Assign Themes
        df["Themes"] = df["Refined Keywords"].apply(assign_themes)
        print("Themes:", df["Themes"].head())

        processed_data.append(df)

    except Exception as e:
        print(f"Error processing file {file}: {e}")

Processing file: ahmedali-forgiveness-themes.csv for theme: Forgiveness
Raw Extracted Keywords: 0    ['name', 'merciful', 'ever', 'benevolent', 'al...
1                   ['merciful', 'ever', 'beneficent']
2    ['lord sent commands', 'turned towards', 'kind...
3              ['pardoned', 'may', 'grateful', 'even']
4    ['softened towards', 'moses said', 'lord ."', ...
Name: Extracted Keywords, dtype: object
Cleaned Keywords (after processing): 0    ['name', 'merciful', 'ever', 'benevolent', 'al...
1                   ['merciful', 'ever', 'beneficent']
2    ['lord sent commands', 'turned towards', 'kind...
3              ['pardoned', 'may', 'grateful', 'even']
4    ['softened towards', 'moses said', 'lord ."', ...
Name: Cleaned Keywords, dtype: object
Error processing file ahmedali-forgiveness-themes.csv: list indices must be integers or slices, not str
Processing file: ahmedali-truthfulness-themes.csv for theme: Truthfulness
Raw Extracted Keywords: 0    ['surah like', 'like', 'witness'