<a href="https://colab.research.google.com/github/alyazone/Quranic-Theme-Extraction-Visualization/blob/main/ahmedali_refined_extractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. login() is called without arguments,
# so no token is hardcoded in the notebook; it renders an input widget instead.
# NOTE(review): the widget needs manual input and will stall an unattended
# "Run All". The checkpoint loaded later looks like a public model, so this
# step may be optional — confirm before removing.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import pandas as pd
import torch
import re
from transformers import pipeline
import time

In [3]:
# Define function to clean RAKE keywords
def clean_keywords(keywords):
    """
    Clean a comma-separated string of RAKE-extracted keywords.

    Every character other than word characters, whitespace and commas is
    removed, the text is lowercased, and each comma-separated keyword is
    trimmed with runs of inner whitespace collapsed to a single space.

    Args:
        keywords: Comma-separated keyword string. Missing values (``None`` or
            ``NaN``) are tolerated; other non-string values are converted
            with ``str()``.

    Returns:
        list[str]: The cleaned, non-empty keywords in their original order.
        An empty list is returned when there is nothing to clean.
    """
    # pandas passes missing CSV cells to .apply() as float NaN (NaN != NaN),
    # which would otherwise make re.sub raise TypeError.
    if keywords is None or (isinstance(keywords, float) and keywords != keywords):
        return []

    text = re.sub(r'[^\w\s,]', '', str(keywords)).lower()  # Remove special characters

    # " ".join(part.split()) trims the keyword and collapses inner whitespace.
    cleaned = (" ".join(part.split()) for part in text.split(","))
    return [keyword for keyword in cleaned if keyword]  # Drop empty entries

# Define function to refine keywords using LLM
_DEFAULT_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"

# Loaded text-generation pipelines keyed by model name, so the model weights
# are loaded once per kernel session instead of once per call.
_GENERATOR_CACHE = {}


def _get_generator(model_name=_DEFAULT_MODEL_NAME):
    """Return the text-generation pipeline for `model_name`, loading it on first use."""
    if model_name not in _GENERATOR_CACHE:
        # device=0 is the first CUDA GPU; -1 makes the pipeline run on CPU.
        device = 0 if torch.cuda.is_available() else -1
        _GENERATOR_CACHE[model_name] = pipeline(
            "text-generation", model=model_name, device=device
        )
    return _GENERATOR_CACHE[model_name]


def _keywords_to_text(keywords, max_chars):
    """Render one row's keywords as comma-separated text capped at `max_chars` characters."""
    if isinstance(keywords, (list, tuple)):
        keywords = ", ".join(str(keyword).strip() for keyword in keywords)
    return str(keywords)[:max_chars]


def _generated_text(result):
    """Extract the generated text from one pipeline result (a dict or a list of dicts)."""
    # For a list of prompts the pipeline yields one list of candidates per
    # prompt; only the first candidate is used.
    if isinstance(result, list):
        result = result[0]
    return result["generated_text"]


def refine_keywords_with_llm(keywords_list, theme, batch_size=10, max_input_length=512):
    """
    Refine keywords with an LLM, one prompt per entry of `keywords_list`.

    Args:
        keywords_list: Sequence with one entry per row; each entry is either a
            list of keywords or an already comma-separated string.
        theme: Theme name inserted into every prompt.
        batch_size: Number of prompts handed to the model per call.
        max_input_length: Maximum number of characters of keyword text placed
            in a prompt.

    Returns:
        list[str]: One generated string per input entry, in input order.
        Entries belonging to a batch that raised ``RuntimeError`` are set to
        ``"Error processing"``.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(keywords_list) == 0:
        return []  # Nothing to do; avoid loading the model at all

    generator = _get_generator()
    refined_results = []
    for start in range(0, len(keywords_list), batch_size):
        batch_prompts = [
            f"Here are some extracted keywords: {_keywords_to_text(keywords, max_input_length)}. "
            f"The focused theme is '{theme}'. Please remove irrelevant keywords and suggest relevant ones."
            for keywords in keywords_list[start:start + batch_size]
        ]
        try:
            results = generator(
                batch_prompts,
                max_new_tokens=50,
                truncation=True,
                # Return only the continuation, not the prompt echoed back.
                return_full_text=False,
                pad_token_id=generator.tokenizer.eos_token_id,
            )
            # Build the full list first so a failure cannot leave a partial batch behind.
            batch_texts = [_generated_text(result) for result in results]
        except RuntimeError as e:
            print(f"Batch failed due to memory error: {e}")
            batch_texts = ["Error processing"] * len(batch_prompts)
        refined_results.extend(batch_texts)
    return refined_results

# Define mapping of keywords to themes.
# Keys are matched verbatim by assign_themes, which is why several keywords
# appear in both lowercase and capitalised form.
keyword_to_theme = {
    # --- Truthfulness ---
    "truth": "Truthfulness",
    "truthfulness": "Truthfulness",
    "truthful": "Truthfulness",
    "Truthfulness": "Truthfulness",
    "honesty": "Truthfulness",
    "honest": "Truthfulness",
    "sincerity": "Truthfulness",
    "sincere": "Truthfulness",
    "integrity": "Truthfulness",
    "Sincerity": "Truthfulness",
    "candor": "Truthfulness",
    "veracity": "Truthfulness",
    # --- Forgiveness ---
    "forgiveness": "Forgiveness",
    "mercy": "Forgiveness",
    "Forgiving": "Forgiveness",
    "forgive": "Forgiveness",
    "forgiving": "Forgiveness",
    "forgave": "Forgiveness",
    "repent": "Forgiveness",
    "repentance": "Forgiveness",
    # Misspelled variant kept so inputs that matched before still match.
    "repentence": "Forgiveness",
    "merciful": "Forgiveness",
    "Merciful": "Forgiveness",
    "pardon": "Forgiveness",
    "compassion": "Forgiveness",
    "Compassion": "Forgiveness",
    # --- Patience ---
    "patience": "Patience",
    "Patience": "Patience",
    "sabr": "Patience",
    "tolerance": "Patience",
    "Tolerance": "Patience",
    "endurance": "Patience",
    "resilience": "Patience",
    "perseverance": "Patience",
    "patient": "Patience",
    "persevere": "Patience",
    "serenity": "Patience",
    "steadfastness": "Patience",
    # --- Gratitude ---
    "gratitude": "Gratitude",
    "Gratitude": "Gratitude",
    "appreciation": "Gratitude",
    "Appreciation": "Gratitude",
    "appreciate": "Gratitude",
    "Thankful": "Gratitude",
    "thankful": "Gratitude",
    "thankfulness": "Gratitude",
    "Grateful": "Gratitude",
    "grateful": "Gratitude",
    "gratefulness": "Gratitude",
    "recognition": "Gratitude",
    "acknowledgement": "Gratitude",
    "obligation": "Gratitude",
    "indebtedness": "Gratitude",
    "Obligation": "Gratitude",
}

# Function to assign themes based on refined keywords
def assign_themes(refined_keywords):
    """
    Map refined keywords to the themes listed in ``keyword_to_theme``.

    Args:
        refined_keywords: A list/tuple/set of keywords, or a single
            comma-separated string. Missing values (``None`` or ``NaN``)
            are treated as "no keywords".

    Returns:
        str: The matching theme names, sorted alphabetically and joined with
        ", ", or "Unknown" when no keyword maps to a theme.
    """
    # Missing cells reach .apply() as None or float NaN (NaN != NaN).
    if refined_keywords is None or (
        isinstance(refined_keywords, float) and refined_keywords != refined_keywords
    ):
        return "Unknown"
    if not isinstance(refined_keywords, (list, tuple, set)):
        refined_keywords = str(refined_keywords).split(",")  # Handle single string input

    themes = set()
    for keyword in refined_keywords:
        # Trim whitespace plus surrounding quotes/punctuation around the keyword.
        keyword = str(keyword).strip().strip("'\".;:!?()[]").strip()
        # Try the exact spelling first, then fall back to a lowercase lookup
        # so capitalisation in the input cannot hide a match.
        theme = keyword_to_theme.get(keyword) or keyword_to_theme.get(keyword.lower())
        if theme:
            themes.add(theme)

    # sorted() keeps the output stable; plain set iteration order varies between runs.
    return ", ".join(sorted(themes)) if themes else "Unknown"



In [4]:
# Input CSVs paired with the theme each one was extracted for.
# NOTE(review): the Patience file name ends in ".csv.csv" — confirm that this
# matches the uploaded file before renaming it.
files = [
    ("ahmedali-forgiveness-themes.csv", "Forgiveness"),
    ("ahmedali-truthfulness-themes.csv", "Truthfulness"),
    ("ahmedali-patience-themes.csv.csv", "Patience"),
    ("ahmedali-gratitude-themes.csv", "Gratitude")
]

processed_data = []

# Errors are deliberately not caught here: a failure should stop the cell and
# show its full traceback rather than be reduced to a one-line message.
for file, theme in files:
    df = pd.read_csv(file)
    if "Extracted Keywords" not in df.columns:
        raise KeyError(f"File {file} does not have 'Extracted Keywords' column.")

    step_start = time.time()
    # Empty CSV cells are read as NaN; normalise them to "" so that
    # clean_keywords always receives a string.
    df["Cleaned Keywords"] = (
        df["Extracted Keywords"].fillna("").astype(str).apply(clean_keywords)
    )
    print(f"Step 1: Cleaned Keywords Time: {time.time() - step_start}")

    step_start = time.time()
    # refine_keywords_with_llm already batches its input, so one call per file
    # is enough and avoids repeating its per-call setup for every 100 rows.
    df["Refined Keywords"] = refine_keywords_with_llm(
        df["Cleaned Keywords"].tolist(), theme
    )
    print(f"Step 2: Refined Keywords Time: {time.time() - step_start}")

    step_start = time.time()
    df["Themes"] = df["Refined Keywords"].apply(assign_themes)
    print(f"Step 3: Themes Assignment Time: {time.time() - step_start}")

    processed_data.append(df)

# Combine all themes and drop the intermediate working columns before export.
final_df = pd.concat(processed_data, ignore_index=True)
final_df = final_df.drop(columns=["Cleaned Keywords", "Refined Keywords"])
final_df.to_csv("fine-tuning-dataset.csv", index=False)
print("Final dataset saved as 'fine-tuning-dataset.csv'")
final_df.head()


Cleaned Keywords Time: 0.009348630905151367


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:  28%|##7       | 2.97G/10.7G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

NameError: name 'keywords_list' is not defined