Restart the terminal after running the two blocks below

In [None]:
# %pip install pandas
# %pip install nltk
# %pip install fpdf
# %pip install transformers==4.43.1
# %pip install vllm==0.5.3.post1
# %pip install torch

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# token = os.environ["HF_TOKEN"]  # needs `import os`; keep access tokens out of the notebook
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

In [None]:
import os
import tempfile
import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from nltk.tokenize import word_tokenize, sent_tokenize
import shutil
import pandas as pd
from fpdf import FPDF
import nltk

# Download the 'punkt_tab' NLTK resource once per environment; presumably this is
# the tokenizer data that sent_tokenize / word_tokenize (imported above) rely on.
nltk.download('punkt_tab')

In [None]:
# Select the compute device once for the notebook: the GPU when CUDA is usable,
# otherwise the CPU. The choice is reported on stdout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available! Using GPU."
    if device.type == "cuda"
    else "CUDA is not available. Using CPU."
)

In [None]:
def extract_rows_with_sprecher(df, sprecher_prefix):
    """Return the transcript texts of all rows whose speaker starts with a prefix.

    Args:
        df: DataFrame with a 'Sprecher' (speaker) and a 'Transkript' column.
        sprecher_prefix: Prefix the speaker label must start with, e.g. 'IP_'.

    Returns:
        list: The 'Transkript' values of the matching rows, in row order.
        Rows with a missing speaker or a missing transcript are skipped.
    """
    # A missing transcript would end up as NaN in the result and break any
    # later string join, so such rows are dropped together with rows that
    # have no speaker.
    rows = df.dropna(subset=['Sprecher', 'Transkript'])

    # Cast to str before matching: non-string speaker cells would otherwise
    # produce NaN in the mask (boolean indexing then raises), and an all-NaN
    # column is read as float, which has no .str accessor at all.
    is_match = rows['Sprecher'].astype(str).str.startswith(sprecher_prefix)

    return rows.loc[is_match, 'Transkript'].tolist()

def transkript_to_string(transkript_list):
    """Concatenate the transcript entries into one string, one entry per line."""
    line_separator = "\n"
    return line_separator.join(transkript_list)

In [None]:
def divide_into_chunks(text, max_words_per_chunk):
    """Split text into chunks of whole sentences, bounded by a token budget.

    Sentences are found with ``sent_tokenize`` and measured with
    ``word_tokenize`` (so punctuation tokens count towards the budget).
    Sentences are never split: a single sentence that is longer than the
    budget becomes a chunk of its own.

    Args:
        text: The text to split.
        max_words_per_chunk: Maximum number of tokens per chunk.

    Returns:
        list[str]: The chunks, each made of sentences joined by one space.
        An empty list when the text contains no sentences.
    """
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sent_tokenize(text):
        words_in_sentence = len(word_tokenize(sentence))

        # Flush only when there is something to flush. Without this guard an
        # over-long first sentence (or two over-long sentences in a row)
        # would emit an empty '' chunk.
        if current_chunk and current_word_count + words_in_sentence > max_words_per_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_word_count = 0

        current_chunk.append(sentence)
        current_word_count += words_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Alternative: use the simpler word-based chunker below if the sentence-based chunking above puts too much load on memory.

# def divide_into_chunks(text, max_words_per_chunk):
#     words = word_tokenize(text)
#     return [' '.join(words[i:i + max_words_per_chunk]) for i in range(0, len(words), max_words_per_chunk)]

In [None]:
class Summarizer:
    """Generate biographies from interview transcripts with Llama 3.1 8B Instruct.

    The model is loaded lazily on first use and kept until ``clear_model`` is
    called; after that the next generation call loads it again.

    Attributes:
        device: torch device the model runs on (CUDA when available, else CPU).
        tokenizer / model: Hugging Face objects, ``None`` while not loaded.
        temp_dir: Scratch directory created with ``tempfile.mkdtemp``; it is
            not removed by this class.
        load_model_once: True while a model is loaded.
        temperature / max_tokens / top_k: Generation settings; each
            ``generate_*``/``extend_*``/``refine_*`` method overwrites them.
    """

    MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.temp_dir = tempfile.mkdtemp()  # Initialize temporary directory
        self.load_model_once = False  # Flag to check if the model has been loaded

        # Defaults so process_chunk / forward_pass can be called directly
        # without going through generate_biography first.
        self.temperature = 0.1
        self.max_tokens = 1024
        self.top_k = 1

    def load_model(self):
        """Load tokenizer and model unless they are already loaded.

        The access token is taken from the ``HF_TOKEN`` environment variable
        instead of being hard-coded in the source. When the variable is unset,
        ``None`` is passed on (presumably the library then falls back to a
        locally stored login - verify for your setup).

        Raises:
            Exception: Whatever ``from_pretrained`` / ``.to`` raised; the
                error is printed and re-raised.
        """
        gc.collect()  # Clear CPU memory

        if self.load_model_once:
            print("Model already loaded and in use.")
            return

        print("Loading model...")
        token = os.environ.get("HF_TOKEN")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME, token=token)
            self.model = AutoModelForCausalLM.from_pretrained(self.MODEL_NAME, token=token)
            self.model = self.model.to(self.device)  # Move the model to the appropriate device (CPU or GPU)
            print("Model and tokenizer successfully loaded.")
            self.load_model_once = True  # Set the flag to true after loading the model
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def forward_pass(self, input_ids):
        """Run ``model.generate`` on already tokenized input and return the raw output.

        Unlike ``process_chunk`` this passes ``max_length`` (not
        ``max_new_tokens``) and does not load the model itself.
        """
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                max_length=self.max_tokens,
                temperature=self.temperature,
                top_k=self.top_k,
                num_return_sequences=1
            )
        return outputs

    def process_chunk(self, chunk, chunk_id):
        """Generate a continuation for one prompt and return only the new text.

        Args:
            chunk: The full prompt (transcript chunk plus instructions).
            chunk_id: Index of the chunk; currently unused.

        Returns:
            str: The decoded newly generated tokens, without the prompt.
        """
        self.load_model()
        input_ids = self.tokenizer(chunk, return_tensors="pt").input_ids.to(self.device)
        prompt_length = input_ids.shape[-1]

        # NOTE(review): do_sample is not set, so temperature/top_k presumably
        # have no effect beyond a warning; with top_k=1 the result would be
        # greedy either way - confirm before relying on these settings.
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                max_new_tokens=self.max_tokens,
                temperature=self.temperature,
                top_k=self.top_k,
                num_return_sequences=1
            )

        # For a causal LM the returned sequence starts with the prompt tokens.
        # Cut them off, otherwise the "biography" would contain the whole
        # transcript and the instruction text.
        generated_text = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Clear memory after processing each chunk
        del input_ids, outputs
        gc.collect()

        return generated_text

    def process_all_chunks(self, chunk_prompts):
        """Processes all chunks by invoking process_chunk for each one with retry logic."""
        outputs = [
            self.invoke_with_retry(self.process_chunk, prompt, chunk_id=idx)
            for idx, prompt in enumerate(chunk_prompts)
        ]
        return " ".join(outputs)

    def clear_model(self):
        """Release model and tokenizer so that a later call can load them again.

        Safe to call repeatedly and before any model was loaded.
        """
        print("Clearing model from memory...")
        # Reset to None (instead of deleting the attributes) and clear the
        # loaded flag; otherwise load_model would skip reloading and the next
        # use of self.model / self.tokenizer would raise AttributeError.
        self.model = None
        self.tokenizer = None
        self.load_model_once = False
        gc.collect()  # Force garbage collection to free memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # Clear CUDA memory if used

    def invoke_with_retry(self, func, *args, retries=3, **kwargs):
        """Call ``func(*args, **kwargs)``, retrying on any exception.

        Args:
            func: Callable to invoke.
            retries: Maximum number of attempts.

        Returns:
            The value returned by the first successful attempt.

        Raises:
            Exception: The exception of the last attempt when all attempts fail.
        """
        for attempt in range(retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt == retries - 1:
                    raise  # Re-raise the exception if the last attempt fails

    def generate_biography(self, input_text):
        """Generate a biography for the transcript in ``input_text``.

        The transcript is split into chunks, the instruction prompt is
        appended to each chunk, and the per-chunk outputs are joined with
        spaces.

        Returns:
            str: The generated biography text.
        """
        self.load_model()

        self.temperature = 0.1
        self.max_tokens = 1024
        self.top_k = 1

        prompt = """

I would like you to generate biography of an interviewee based on the following structured questions in German language and  written in third-person-view, make sure it is in german language only. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events with years mentioned in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events with years mentioned during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events with years mentioned in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events with years mentioned during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events with years mentioned in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry  with years mentioned? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children with years mentioned? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life with years mentioned? Reflect on how these events with years mentioned impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, chronological, detailed, and presents a well-rounded view of the interviewee's life journey with years mentioned. Include years, don't forget any years mentioned in the interview.
   
           """

        chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)
        output_biography = self.process_all_chunks([chunk + prompt for chunk in chunks])

        return output_biography

    def extend_biography(self, partial_biography, input_text):
        """Ask the model to add missing information to ``partial_biography``.

        Returns:
            str | None: The extended biography, shortened by
            ``remove_incomplete_sentence``; ``None`` when any step failed
            (the error is printed).
        """
        try:
            self.load_model()

            self.temperature = 0.1
            self.max_tokens = 800
            self.top_k = 1

            prompt = f"""
            Hier ist der erste Teil der Biografie: {partial_biography}
            Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
            """

            chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)

            extended_biography = self.process_all_chunks([chunk + prompt for chunk in chunks])

            return self.remove_incomplete_sentence(extended_biography)

        except Exception as e:
            print(f"An error occurred while extending the biography: {str(e)}")
            return None

    def refine_biography_to_500_words(self, extended_biography, input_text):
        """Run a further pass over ``extended_biography`` and release the model.

        NOTE(review): the prompt is the same "add missing information" text
        used by ``extend_biography`` and the result is capped at 800 tokens,
        not 500 words - confirm whether a dedicated shortening prompt was
        intended.

        Returns:
            str | None: The refined biography, or ``None`` when any step
            failed (the error is printed). The model is cleared in both cases.
        """
        try:
            self.load_model()

            self.temperature = 0.1
            self.max_tokens = 512
            self.top_k = 1

            prompt = f"""
            Hier ist der erste Teil der Biografie: {extended_biography}
            Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
            """

            chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)

            refined_biography = self.process_all_chunks([chunk + prompt for chunk in chunks])

            return self.remove_incomplete_sentence(refined_biography)

        except Exception as e:
            print(f"An error occurred while refining the biography: {str(e)}")
            return None

        finally:
            self.clear_model()

    def remove_incomplete_sentence(self, biography, max_words=800):
        """Cap the text at ``max_words`` tokens and cut back to the last full stop.

        Args:
            biography: Text to shorten.
            max_words: Token limit (tokens as produced by ``nltk.word_tokenize``).

        Returns:
            str: ``biography`` unchanged when it is within the limit.
            Otherwise the first ``max_words`` tokens re-joined with single
            spaces, truncated after the last '.' if there is one.
        """
        words = nltk.word_tokenize(biography)
        if len(words) <= max_words:
            return biography

        truncated_text = " ".join(words[:max_words])
        last_full_stop_index = truncated_text.rfind('.')

        if last_full_stop_index == -1:
            return truncated_text
        return truncated_text[:last_full_stop_index + 1]

In [None]:
def assemble_biography_from_disk(output_files):
    """Read the given text files in order and join their stripped contents with spaces."""

    def _read_stripped(path):
        with open(path, 'r') as handle:
            return handle.read().strip()

    return " ".join(_read_stripped(path) for path in output_files)


def cleanup_temp_files(temp_dir):
    """Delete the directory ``temp_dir`` together with everything inside it."""
    shutil.rmtree(temp_dir, ignore_errors=False)


def read_csv(file_path):
    """Load a tab-separated file into a DataFrame.

    Returns:
        The DataFrame, or ``None`` when reading failed (the error is printed).
    """
    try:
        return pd.read_csv(file_path, sep='\t')
    except Exception as e:
        print(f"Error reading CSV file: {e}")
    return None
        

def save_text_to_pdf(text, pdf_path):
    """Write ``text`` to a PDF file, one ``multi_cell`` per input line.

    Args:
        text: Text to render; split on newline characters.
        pdf_path: Destination path of the PDF.

    The built-in "Arial" font only covers latin-1. Characters outside that
    range (typographic quotes, dashes, ...) are replaced with '?' so that the
    document is still written instead of failing with an encoding error.
    German umlauts and 'ß' are part of latin-1 and are kept.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in text.split('\n'):
        printable_line = line.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(0, 10, printable_line)

    pdf.output(pdf_path)

In [None]:
def process_all_files_in_directory(directory_path):
    """Create a biography PDF for every '.csv' transcript in a directory.

    For each file ending in '.csv' the rows of speakers starting with 'IP_'
    are extracted, a biography is generated from them and saved as
    ``<directory_path>/output_pdfs/<name>.pdf``. Other directory entries and
    unreadable files are reported on stdout and skipped.

    Args:
        directory_path: Directory containing the tab-separated transcript files.
    """
    summarizer = Summarizer()
    output_directory = os.path.join(directory_path, "output_pdfs")
    os.makedirs(output_directory, exist_ok=True)

    try:
        for file_name in os.listdir(directory_path):
            if not file_name.endswith('.csv'):
                print(f"Unsupported file format: {file_name}")
                continue

            file_path = os.path.join(directory_path, file_name)
            print(f"Processing file: {file_name}")

            df = read_csv(file_path)
            if df is None:
                print(f"Failed to read CSV file: {file_name}")
                continue

            sprecher_prefix = 'IP_'
            transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
            transcript_data = transkript_to_string(transkript_list)

            print("Generating initial biography...")
            initial_biography = summarizer.generate_biography(transcript_data)
            print("Initial biography generated:")
            print(initial_biography)

            # Uncomment the below two if shorter output is necessary
            # print("Extending biography...")
            # extended_biography = summarizer.extend_biography(initial_biography, transcript_data)
            # print("Biography extended:")
            # print(extended_biography)

            # print("Refining biography...")
            # refined_biography = summarizer.refine_biography_to_500_words(extended_biography, transcript_data)
            # print("Biography refined.")
            # print(refined_biography)

            # Swap only the extension; str.replace would also rewrite a
            # '.csv' that occurs in the middle of the file name.
            pdf_name = os.path.splitext(file_name)[0] + '.pdf'
            output_pdf_path = os.path.join(output_directory, pdf_name)
            save_text_to_pdf(initial_biography.strip(), output_pdf_path)
            print(f"Processed and saved {file_name} as PDF.")
    finally:
        # Release the model once, after the whole batch (also on errors).
        # Clearing it after every file would force a reload per file and,
        # with a stale "loaded" flag, break every file after the first.
        summarizer.clear_model()
        # The summarizer's scratch directory is otherwise never removed.
        cleanup_temp_files(summarizer.temp_dir)

In [None]:
# Profiling block to analyze performance
import cProfile
import pstats
import io

if __name__ == "__main__":
    # Provide the path to the directory containing the CSV files.
    directory_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/"

    # Profile the complete batch run.
    profiler = cProfile.Profile()
    profiler.enable()
    process_all_files_in_directory(directory_path)
    profiler.disable()

    # Render the statistics, ordered by cumulative time, into a string
    # buffer and print the buffer in one go.
    stream = io.StringIO()
    stats = pstats.Stats(profiler, stream=stream)
    stats.sort_stats('cumulative')
    stats.print_stats()
    print(stream.getvalue())