In [1]:
# %pip install pandas
# %pip install reportlab
# %pip install nltk
# %pip install langchain-together
# %pip install loguru
# %pip install sentence-transformers scikit-learn
#%pip install matplotlib

In [2]:
import pandas as pd
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfgen import canvas
from reportlab.lib.utils import simpleSplit
import os
import time
import random
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from loguru import logger
from langchain_together import Together
from tqdm.notebook import tqdm  # Progress bar
import ipywidgets as widgets
from IPython.display import display
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

  from tqdm.autonotebook import tqdm, trange





In [3]:
# Send log records to biography_generator.log; rotation="1 MB" asks loguru to start a
# new file once the current one reaches that size.
# NOTE(review): re-running this cell calls logger.add again, which presumably registers a
# second sink for the same file (duplicate log lines) — confirm, or guard against re-runs.
logger.add("biography_generator.log", rotation="1 MB")

# Fetch the NLTK 'punkt' data; the recorded cell output shows this is a no-op when the
# package is already up to date. Presumably this is the data sent_tokenize relies on.
nltk.download('punkt')

# Module-level sentence-embedding model; remove_duplicates() reads this global and calls
# model.encode() on the tokenized sentences.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asha4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def load_together_api_keys(env_var="TOGETHER_API_KEYS"):
    """Read Together API keys from an environment variable.

    The variable is expected to hold one or more keys separated by commas,
    e.g. ``TOGETHER_API_KEYS="key1,key2"``. Surrounding whitespace and empty
    entries are ignored.

    Args:
        env_var: Name of the environment variable to read.

    Returns:
        list[str]: The configured keys in the order given; empty when the
        variable is unset or blank (a warning is emitted in that case).
    """
    import warnings  # local import: only needed for the misconfiguration warning

    raw_value = os.environ.get(env_var, "")
    keys = [key.strip() for key in raw_value.split(",") if key.strip()]
    if not keys:
        warnings.warn(
            f"No Together API keys found in environment variable '{env_var}'. "
            "Set it to a comma-separated list of keys before running the workflow."
        )
    return keys


# Secrets must never be stored in the notebook itself: anyone who can read the file
# (or its version history) can use them. The keys that were previously hardcoded here
# should be treated as compromised and revoked.
TOGETHER_API_KEYS = load_together_api_keys()

In [5]:
# Function to read a delimited transcript file into a DataFrame
def read_csv(file_path, sep='\t'):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        file_path: Path of the file to read.
        sep: Column delimiter. Defaults to a tab, which is what this function
            always used before the parameter existed.

    Returns:
        pandas.DataFrame | None: The parsed table, or ``None`` when reading
        fails for any reason (the error is logged instead of raised).
    """
    try:
        df = pd.read_csv(file_path, sep=sep)
    except Exception as e:
        # Best-effort by design: callers check for None rather than catching.
        logger.error(f"Error reading CSV file: {file_path} - {e}")
        return None
    logger.info(f"Successfully read CSV file: {file_path}")
    return df

# Function to extract rows with specific prefix in 'Sprecher' column
def extract_rows_with_sprecher(df, sprecher_prefix):
    """Collect the transcript text of every row whose speaker starts with a prefix.

    Rows with a missing 'Sprecher' or a missing 'Transkript' value are skipped:
    a missing transcript would otherwise be returned as NaN and break the
    string join performed on the result later in the workflow.

    Args:
        df: DataFrame with 'Sprecher' and 'Transkript' columns.
        sprecher_prefix: Prefix the 'Sprecher' value must start with.

    Returns:
        list[str]: Matching transcript entries in row order; an empty list if
        nothing matches or if extraction fails (the error is logged).
    """
    try:
        usable_rows = df.dropna(subset=['Sprecher', 'Transkript'])
        # Cast to str so a column with mixed types still yields a clean boolean mask.
        matches_prefix = usable_rows['Sprecher'].astype(str).str.startswith(sprecher_prefix)
        transkript_list = usable_rows.loc[matches_prefix, 'Transkript'].astype(str).tolist()
        logger.info(f"Extracted rows with Sprecher starting with '{sprecher_prefix}'")
        return transkript_list
    except Exception as e:
        logger.error(f"Error extracting rows: {e}")
        return []

# Function to convert list to string
def transkript_to_string(transkript_list):
    """Join transcript entries into one newline-separated string.

    Missing entries (``None``/NaN) are skipped and any remaining non-string
    entry is converted with ``str()``, so a transcript column containing empty
    or numeric cells no longer makes ``str.join`` raise ``TypeError``.

    Args:
        transkript_list: Iterable of transcript entries.

    Returns:
        str: The entries joined with ``"\\n"``; ``""`` for an empty input.
    """
    usable_entries = [str(entry) for entry in transkript_list if not pd.isna(entry)]
    return "\n".join(usable_entries)


In [6]:
# Function to estimate the number of tokens in a text
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a word count used as a stand-in for a token count; no model
    tokenizer is involved.
    """
    words = text.split()
    return len(words)

# Function to divide text into chunks within the token limit
def divide_into_chunks(text, max_words_per_chunk, max_tokens):
    """Group the sentences of ``text`` into chunks that respect both size limits.

    Sentences are never split. A chunk is closed as soon as adding the next
    sentence would exceed ``max_words_per_chunk`` words or ``max_tokens``
    tokens (as measured by ``count_tokens``). A single sentence that is larger
    than a limit on its own becomes a chunk by itself.

    Args:
        text: Text to split.
        max_words_per_chunk: Maximum whitespace-separated words per chunk.
        max_tokens: Maximum ``count_tokens`` total per chunk.

    Returns:
        list[str]: Non-empty chunks, each the space-joined sentences it holds;
        an empty list when ``text`` contains no sentences.

    NOTE(review): sent_tokenize is called with its default language although the
    prompts in this notebook are German — consider passing language='german'.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0
    current_token_count = 0

    for sentence in sentences:
        words_in_sentence = len(sentence.split())
        tokens_in_sentence = count_tokens(sentence)
        exceeds_words = current_word_count + words_in_sentence > max_words_per_chunk
        exceeds_tokens = current_token_count + tokens_in_sentence > max_tokens

        # Only close a chunk that actually holds sentences; otherwise an oversized
        # first sentence would emit an empty chunk that is later sent to the LLM.
        if current_chunk and (exceeds_words or exceeds_tokens):
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_word_count = 0
            current_token_count = 0

        current_chunk.append(sentence)
        current_word_count += words_in_sentence
        current_token_count += tokens_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Chunk-size limits passed to divide_into_chunks() by generate_biography() and final_biography().
# count_tokens() counts whitespace-separated words, so both limits are effectively measured
# in words and the smaller one (MAX_WORDS_PER_CHUNK) is the one that closes a chunk first.
MAX_WORDS_PER_CHUNK = 1500  # Reduced chunk size to stay within token limits
MAX_TOKENS = 3073  # 4097 (max tokens) - 1024 (max new tokens) = 3073


In [7]:
import time

# Summarizer class with multiple API keys support
class Summarizer:
    """Generate German summaries/biographies with a Together-hosted LLM.

    Several API keys can be supplied; when the service reports an exhausted
    credit limit the next key is used (wrapping around at the end of the list).

    Attributes:
        api_keys: The Together API keys to rotate through.
        current_key_index: Index into ``api_keys`` of the key currently in use.
        llm: Client used by ``generate_biography``.
        llm1: Client used by ``final_biography``.
    """

    def __init__(self, api_keys):
        """Create the LLM clients using the first key.

        Args:
            api_keys: Non-empty sequence of Together API keys.

        Raises:
            ValueError: If ``api_keys`` is empty.
        """
        if not api_keys:
            raise ValueError("At least one Together API key is required.")
        self.api_keys = list(api_keys)
        self.current_key_index = 0
        self.llm = self.create_llm(self.api_keys[self.current_key_index])
        self.llm1 = self.create_llm(self.api_keys[self.current_key_index])

    def create_llm(self, api_key):
        """Build a Together client bound to ``api_key``."""
        return Together(
            model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
            temperature=0.7,
            max_tokens=1024,
            top_k=1,
            together_api_key=api_key
        )

    def switch_api_key(self):
        """Advance to the next API key (wrapping around) and rebuild both clients."""
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        new_key = self.api_keys[self.current_key_index]
        self.llm = self.create_llm(new_key)
        self.llm1 = self.create_llm(new_key)
        # Log only the position of the key: the key itself is a secret and must
        # not end up in the log file.
        logger.info(f"Switched to API key {self.current_key_index + 1} of {len(self.api_keys)}")

    def _recover_from_value_error(self, error, attempt, max_retries, retry_delay, max_delay=None):
        """Apply the shared recovery policy for a ValueError raised by the LLM client.

        Args:
            error: The ValueError that was raised.
            attempt: Zero-based index of the attempt that failed.
            max_retries: Total number of attempts (used in the log message).
            retry_delay: Seconds to wait before retrying after a rate limit.
            max_delay: Optional upper bound for the doubled delay.

        Returns:
            tuple[bool, int]: ``(should_retry, next_retry_delay)``.

        Raises:
            ValueError: ``error`` itself, when it is not a recognised condition.
        """
        message = str(error).lower()
        if "credit limit exceeded" in message:
            logger.warning(f"Credit limit exceeded. Switching API key and retrying...")
            self.switch_api_key()
            return True, retry_delay
        if "rate limited" in message:
            logger.warning(f"Rate limit exceeded. Retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries})...")
            time.sleep(retry_delay)
            next_delay = retry_delay * 2  # Exponential backoff
            if max_delay is not None:
                next_delay = min(next_delay, max_delay)
            return True, next_delay
        if "input validation error" in message:
            # Retrying the same input cannot succeed, so give up on this request.
            logger.error(f"Input validation error: {error}")
            return False, retry_delay
        logger.error(f"An error occurred: {error}")
        raise error

    def generate_biography(self, input_text):
        """Summarise one chunk of transcript text.

        The model is asked for at least 400 words. If a response is shorter, the
        request is retried with an added instruction asking for more detail. When
        no attempt reaches 400 words, the longest response received is returned
        (rather than an error sentence that would end up inside the biography).

        Args:
            input_text: The text chunk to summarise.

        Returns:
            str: The summary, or a fixed German failure message if no response
            could be obtained at all.

        Raises:
            Exception: Any client error that is not a recognised, recoverable
            ValueError is logged and re-raised.
        """
        prompt_template = """
Sie sind ein professioneller deutscher Biografieschreiber. Fassen Sie den folgenden Text prägnant zusammen. Die Zusammenfassung sollte mindestens 400 Wörter umfassen und alle wichtigen Details enthalten.

Beispielzusammenfassung:
"Marie Curie wurde 1867 in Warschau geboren. Sie war eine polnische und französische Physikerin und Chemikerin, die für ihre bahnbrechende Forschung im Bereich der Radioaktivität bekannt ist. Sie erhielt zwei Nobelpreise, einen in Physik und einen in Chemie. Curie entdeckte die Elemente Radium und Polonium und entwickelte die Theorie der Radioaktivität. Sie gründete das Curie-Institut in Paris und in Warschau, das bis heute ein bedeutendes Zentrum für medizinische Forschung ist. Curie starb 1934 an aplastischer Anämie, die durch ihre langjährige Exposition gegenüber radioaktiven Materialien verursacht wurde."

Text:
{text}

Zusammenfassung:
"""
        prompt = prompt_template.format(text=input_text)
        full_input = prompt
        min_words = 400  # Shorter responses trigger a retry
        max_retries = 3  # Maximum number of retries
        retry_delay = 30  # Initial delay in seconds
        longest_short_output = None

        for attempt in range(max_retries):
            try:
                output_summary = self.llm.invoke(full_input)
            except ValueError as e:
                should_retry, retry_delay = self._recover_from_value_error(
                    e, attempt, max_retries, retry_delay
                )
                if not should_retry:
                    break
                continue
            except Exception as e:
                logger.error(f"An error occurred: {e}")
                raise

            if len(output_summary.split()) >= min_words:
                return output_summary

            # Remember the best response so far so it is not lost if every attempt is short.
            if longest_short_output is None or len(output_summary.split()) > len(longest_short_output.split()):
                longest_short_output = output_summary

            if attempt < max_retries - 1:
                logger.info("Output too short, retrying with a modified prompt...")
                full_input = prompt + "\nBitte geben Sie mehr Details."
                time.sleep(retry_delay)  # Wait before retrying
                retry_delay *= 2  # Exponential backoff

        if longest_short_output is not None:
            logger.warning("Max retries reached. Returning the longest summary received although it is shorter than requested.")
            return longest_short_output

        logger.error("Max retries reached. Failed to generate a detailed biography.")
        return "Die Zusammenfassung ist zu kurz. Weitere Details konnten nicht abgerufen werden."

    def final_biography(self, input_text):
        """Write a biography from already summarised text.

        Args:
            input_text: Combined summaries to turn into a biography.

        Returns:
            str: The model output, or a fixed German failure message when all
            attempts fail.

        Raises:
            Exception: Any client error that is not a recognised, recoverable
            ValueError is logged and re-raised.
        """
        # The source text follows the instruction, so the prompt refers to the
        # "following" (folgenden) text rather than the "above" (obigen) text.
        prompt1 = """
Sie sind ein professioneller deutscher Biografieschreiber. Verfassen Sie eine detaillierte Biografie des folgenden Textes. Konzentrieren Sie sich auf alle wichtigen Ereignisse und vermeiden Sie Wiederholungen.

Text:
{text}

Biografie:
"""
        full_input = prompt1.format(text=input_text)
        max_retries = 5  # Maximum number of retries
        retry_delay = 30  # Initial delay in seconds

        for attempt in range(max_retries):
            try:
                return self.llm1.invoke(full_input)
            except ValueError as e:
                # Exponential backoff for rate limits, capped at 5 minutes.
                should_retry, retry_delay = self._recover_from_value_error(
                    e, attempt, max_retries, retry_delay, max_delay=300
                )
                if not should_retry:
                    break
            except Exception as e:
                logger.error(f"An error occurred: {e}")
                raise

        logger.error("Max retries reached. Failed to generate the final biography.")
        return "Die Biografie konnte nicht vollständig erstellt werden."


In [8]:
# Function to remove duplicate sentences using clustering
def remove_duplicates(text):
    """Drop sentences that are near-duplicates of an earlier sentence.

    Sentences are embedded with the module-level ``model`` and clustered with
    DBSCAN on cosine distance. Only the first sentence of each cluster is kept,
    and the kept sentences stay in their original order.

    Args:
        text: Text to de-duplicate.

    Returns:
        str: The kept sentences joined by single spaces, or ``text`` unchanged
        when it contains no sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return text
    embeddings = model.encode(sentences)
    clustering = DBSCAN(eps=0.3, min_samples=1, metric='cosine').fit(embeddings)  # Adjusted eps for less aggressive clustering

    # Single pass: the first time a cluster label shows up, its sentence is the
    # representative. This avoids rescanning all labels for every new cluster.
    seen_clusters = set()
    unique_sentences = []
    for sentence, cluster_id in zip(sentences, clustering.labels_):
        if cluster_id in seen_clusters:
            continue
        seen_clusters.add(cluster_id)
        unique_sentences.append(sentence)
    return ' '.join(unique_sentences)

# Stateless function to generate biography
def generate_biography(input_text, summarizer):
    """Summarise ``input_text`` chunk by chunk and return the joined summaries.

    Each chunk produced by ``divide_into_chunks`` is summarised with
    ``summarizer.generate_biography``, de-duplicated with ``remove_duplicates``,
    logged and printed. The per-chunk results are joined with newlines.

    Args:
        input_text: Full transcript text.
        summarizer: Object providing ``generate_biography(chunk)``.

    Returns:
        str: All chunk summaries joined by ``"\\n"``.
    """
    pieces = divide_into_chunks(input_text, MAX_WORDS_PER_CHUNK, MAX_TOKENS)
    collected = []

    for position, piece in enumerate(tqdm(pieces, desc="Generating biography"), start=1):
        logger.info(f"Processing chunk {position}")
        raw_summary = summarizer.generate_biography(piece)
        logger.debug(f"Generated chunk summary: {raw_summary}")
        deduped_summary = remove_duplicates(raw_summary)
        logger.debug(f"Chunk summary after removing duplicates: {deduped_summary}")
        print(f"Chunk {position} output after removing duplicates:\n{deduped_summary}\n")
        collected.append(deduped_summary)

    return "\n".join(collected)

# Function to generate the final biography from the combined summaries
def final_biography(summarizer, input_text):
    """Turn combined summaries into the final biography text.

    ``input_text`` is split with ``divide_into_chunks``; every chunk is passed
    to ``summarizer.final_biography`` and de-duplicated with
    ``remove_duplicates``. Each chunk result is followed by a newline, so the
    returned text is the concatenation of separately generated parts.

    Args:
        summarizer: Object providing ``final_biography(chunk)``.
        input_text: Combined chunk summaries.

    Returns:
        str: The chunk results, each terminated by ``"\\n"``.
    """
    parts = []
    pieces = divide_into_chunks(input_text, MAX_WORDS_PER_CHUNK, MAX_TOKENS)

    for position, piece in enumerate(tqdm(pieces, desc="Generating final biography"), start=1):
        logger.info(f"Processing final chunk {position}")
        raw_part = summarizer.final_biography(piece)
        logger.debug(f"Generated final chunk summary: {raw_part}")
        deduped_part = remove_duplicates(raw_part)
        logger.debug(f"Final chunk summary after removing duplicates: {deduped_part}")
        print(f"Final chunk {position} output after removing duplicates:\n{deduped_part}\n")
        parts.append(deduped_part + "\n")

    return "".join(parts)


In [9]:
# Function to save biography to PDF
def save_biography_to_pdf(text, original_csv_filename):
    """Render ``text`` into an A4 PDF named after the source CSV.

    The output file is ``<csv base name>_<unix timestamp>.pdf`` (a relative
    path, so it is created in the current working directory). Blank lines are
    dropped, the remaining lines are wrapped to the printable width and drawn
    15 points apart inside 72-point margins; a new page is started whenever the
    bottom margin is passed.

    Args:
        text: Biography text; lines are separated by ``"\\n"``.
        original_csv_filename: Path of the CSV the biography was derived from.

    Returns:
        str | None: The PDF file name, or ``None`` if anything fails (the error
        is logged).
    """
    try:
        created_at = int(time.time())
        stem, _extension = os.path.splitext(os.path.basename(original_csv_filename))
        filename = f"{stem}_{created_at}.pdf"

        pdf = canvas.Canvas(filename, pagesize=A4)
        stripped_lines = (raw_line.strip() for raw_line in text.split('\n'))
        paragraphs = [candidate for candidate in stripped_lines if candidate != '']

        page_width, page_height = A4
        left_margin = 72
        right_margin = page_width - 72
        top_y = page_height - 72  # First baseline on every page
        max_width = right_margin - left_margin

        y = top_y
        for paragraph in paragraphs:
            for wrapped_line in simpleSplit(paragraph, "Helvetica", 12, max_width):
                pdf.drawString(left_margin, y, wrapped_line)
                y -= 15  # Advance one line
                if y < 72:
                    pdf.showPage()  # Page is full: continue on a fresh one
                    y = top_y

        pdf.save()
        logger.info(f"Biography saved to {filename}")
        return filename

    except Exception as e:
        logger.error(f"An error occurred while saving the PDF: {e}")
        return None


In [None]:
# Main workflow

# Interactive inputs for the workflow parameters; their current values are read
# when the "Run Workflow" button is clicked.
csv_file_path_widget = widgets.Text(
    description="CSV File Path:",
    value="C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0001_er_2024_04_23.csv",
    layout=widgets.Layout(width='100%'),
)

sprecher_prefix_widget = widgets.Text(description="Sprecher Prefix:", value='IP_')

display(csv_file_path_widget, sprecher_prefix_widget)

def run_workflow(csv_file_path, sprecher_prefix):
    """Run the full pipeline: read CSV, summarise, write the biography PDF.

    Steps: load the file, collect the transcript rows whose speaker starts with
    ``sprecher_prefix``, summarise them chunk by chunk, turn the combined
    summary into the final biography and save it as a PDF. Problems are logged
    and end the run early; nothing is raised for the handled cases.

    Args:
        csv_file_path: Path of the transcript file.
        sprecher_prefix: Prefix of the 'Sprecher' values to include.

    Returns:
        None
    """
    df = read_csv(csv_file_path)
    if df is None:
        logger.error("Failed to read the CSV file.")
        return

    transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
    if not transkript_list:
        # Nothing to summarise: avoid LLM calls and an empty PDF.
        logger.error(f"No transcript rows found for Sprecher prefix '{sprecher_prefix}'.")
        return
    transcript_data = transkript_to_string(transkript_list)

    summarizer = Summarizer(api_keys=TOGETHER_API_KEYS)

    # generate_biography already returns the chunk summaries joined into ONE string.
    # Joining that string again would insert a newline between every character.
    combined_summary = generate_biography(transcript_data, summarizer)

    # Generate final biography from combined summary
    final_bio = final_biography(summarizer, combined_summary)

    # Save the final biography to a PDF file
    filename = save_biography_to_pdf(final_bio, csv_file_path)
    if filename:
        logger.info(f"Biography saved to {filename}")
        print(f"Biography saved to {filename}")
    else:
        logger.error("Failed to save biography to PDF.")

def on_run_button_clicked(b):
    """Click handler: run the workflow with the values currently in the two text widgets.

    Args:
        b: The clicked button (unused).
    """
    selected_csv_path = csv_file_path_widget.value
    selected_prefix = sprecher_prefix_widget.value
    run_workflow(selected_csv_path, selected_prefix)


# Button to execute the workflow
run_button = widgets.Button(description="Run Workflow")
run_button.on_click(on_run_button_clicked)
display(run_button)


In [11]:
# value = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0001_er_2024_04_23.csv",
# value = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0002_er_2024_04_23.csv",