<a href="https://colab.research.google.com/github/alyazone/Quranic-Theme-Extraction-Visualization/blob/main/ahmedali_refined_extractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login

# Authenticate with Hugging Face
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import pandas as pd
import torch
import re
from transformers import pipeline
import time

In [3]:
# Define function to clean RAKE keywords
def clean_keywords(keywords):
    keywords = re.sub(r"[^\w\s,]", "", keywords)  # Remove special characters
    keywords = keywords.strip().lower()  # Convert to lowercase and strip spaces
    return keywords.split(",")  # Split into a list

# Define function to refine keywords using LLM
def refine_keywords_with_llm(keywords_list, theme, batch_size=10, max_input_length=512):
    generator = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B", device=0)
    refined_results = []
    for i in range(0, len(keywords_list), batch_size):
        batch_prompts = [
            f"Here are some extracted keywords: {str(keywords)[:max_input_length]}. "
            f"The focused theme is '{theme}'. Please remove irrelevant keywords and suggest relevant ones."
            for keywords in keywords_list[i:i + batch_size]
        ]
        try:
            results = generator(
                batch_prompts,
                max_new_tokens=50,
                truncation=True,
                pad_token_id=generator.tokenizer.eos_token_id,
            )
            refined_results.extend([result["generated_text"] for result in results])
        except RuntimeError as e:
            print(f"Batch failed due to memory error: {e}")
            refined_results.extend(["Error processing"] * len(batch_prompts))
    return refined_results

# Define mapping of keywords to themes
keyword_to_theme = {
    "truth": "Truthfulness",
    "truthfulness": "Truthfulness",
    "truthful": "Truthfulness",
    "Truthfulness": "Truthfulness",
    "honesty": "Truthfulness",
    "honest": "Truthfulness",
    "sincerity": "Truthfulness",
    "sincere": "Truthfulness",
    "sincerity": "Truthfulness",
    "integrity": "Truthfulness",
    "Sincerity": "Truthfulness",
    "candor": "Truthfulness",
    "veracity": "Truthfulness",
    "forgiveness": "Forgiveness",
    "mercy": "Forgiveness",
    "Forgiving": "Forgiveness",
    "forgive": "Forgiveness",
    "forgiving": "Forgiveness",
    "forgave": "Forgiveness",
    "repent": "Forgiveness",
    "repentence": "Forgiveness",
    "merciful": "Forgiveness",
    "Merciful": "Forgiveness",
    "pardon": "Forgiveness",
    "compassion": "Forgiveness",
    "Compassion": "Forgiveness",
    "patience": "Patience",
    "Patience": "Patience",
    "sabr": "Patience",
    "tolerance": "Patience",
    "Tolerance": "Patience",
    "endurance": "Patience",
    "resilience": "Patience",
    "perseverance": "Patience",
    "patient": "Patience",
    "persevere": "Patience",
    "serenity": "Patience",
    "steadfastness": "Patience",
    "gratitude": "Gratitude",
    "Gratitude": "Gratitude",
    "appreciation": "Gratitude",
    "Appreciation": "Gratitude",
    "appreciate": "Gratitude",
    "Thankful": "Gratitude",
    "thankful": "Gratitude",
    "thankfulness": "Gratitude",
    "Grateful": "Gratitude",
    "grateful": "Gratitude",
    "gratefulness": "Gratitude",
    "recognition": "Gratitude",
    "acknowledgement": "Gratitude",
    "obligation": "Gratitude",
    "indebtedness": "Gratitude",
    "Obligation": "Gratitude"
}

# Function to assign themes based on refined keywords
# Define function to assign themes
def assign_themes(refined_keywords):
    themes = set()
    for keyword in refined_keywords.split(","):
        keyword = keyword.strip()
        if keyword in keyword_to_theme:
            themes.add(keyword_to_theme[keyword])
    return ", ".join(themes) if themes else "Unknown"



In [4]:
# List of files with their corresponding themes
files = [
    ("ahmedali-forgiveness-themes.csv", "Forgiveness"),
    ("ahmedali-truthfulness-themes.csv", "Truthfulness"),
    ("ahmedali-patience-themes.csv", "Patience"),
    ("ahmedali-gratitude-themes.csv", "Gratitude")
]

processed_data = []

start_time = time.time()

# Process each file
for file, theme in files:
    try:
        # Load the dataset
        print(f"Processing file: {file} for theme: {theme}")
        df = pd.read_csv(file)
        if "Extracted Keywords" not in df.columns:
            raise KeyError(f"File {file} does not have 'Extracted Keywords' column.")

        # Clean the Extracted Keywords
        df["Cleaned Keywords"] = df["Extracted Keywords"].apply(clean_keywords)
        df["Cleaned Keywords"] = df["Cleaned Keywords"].apply(
            lambda x: " ".join(x) if isinstance(x, list) else str(x)
        )

        # Refine Keywords with LLM
        chunk_size = 100
        chunks = [df["Cleaned Keywords"].iloc[i:i + chunk_size].tolist() for i in range(0, len(df), chunk_size)]
        refined_keywords = []
        for chunk in chunks:
            refined_keywords.extend(refine_keywords_with_llm(chunk, theme))
        df["Refined Keywords"] = refined_keywords

        # Assign Themes
        df["Themes"] = df["Refined Keywords"].apply(assign_themes)

        # Append processed DataFrame
        processed_data.append(df)

    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Merge all processed datasets
if len(processed_data) > 0:
    final_df = pd.concat(processed_data, ignore_index=True)
    final_df = final_df.drop(columns=["Cleaned Keywords", "Refined Keywords"])
    final_df.to_csv("fine-tuning-dataset.csv", index=False)
    print("Final dataset saved as 'fine-tuning-dataset.csv'")
    print(final_df.head())
else:
    print("No data was processed. Check your input files.")

Step 1: Cleaned Keywords Time: 0.011887311935424805


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Error processing file ahmedali-forgiveness-themes.csv: list indices must be integers or slices, not str
Step 1: Cleaned Keywords Time: 20.657161235809326
Error processing file ahmedali-truthfulness-themes.csv: list indices must be integers or slices, not str
Error processing file ahmedali-patience-themes.csv.csv: [Errno 2] No such file or directory: 'ahmedali-patience-themes.csv.csv'
Step 1: Cleaned Keywords Time: 40.19302463531494
Error processing file ahmedali-gratitude-themes.csv: list indices must be integers or slices, not str


ValueError: No objects to concatenate