In [None]:
# %pip install pandas
# %pip install nltk
# %pip install fpdf
# %pip install transformers==4.43.1
# %pip install vllm==0.5.3.post1
# %pip install torch

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# token = "<HF_TOKEN>"  # placeholder: never commit a real access token; load it from the environment instead
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

In [None]:
import os
import tempfile
import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from nltk.tokenize import word_tokenize, sent_tokenize
import shutil
import pandas as pd
from fpdf import FPDF
import nltk

nltk.download('punkt_tab')

In [None]:
# Select the compute device once: prefer the GPU when CUDA is usable,
# otherwise fall back to the CPU, and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available! Using GPU."
    if device.type == "cuda"
    else "CUDA is not available. Using CPU."
)

In [None]:
def extract_rows_with_sprecher(df, sprecher_prefix):
    """Return the transcript texts of all rows whose speaker starts with a prefix.

    Args:
        df: DataFrame with a ``Sprecher`` (speaker) and a ``Transkript``
            (transcript text) column.
        sprecher_prefix: Prefix the speaker label must start with.

    Returns:
        list[str]: The ``Transkript`` values of the matching rows, in their
        original order. Rows with a missing speaker or a missing transcript
        are skipped so the result only ever contains strings.

    Raises:
        KeyError: If one of the two required columns is absent.
    """
    # Drop rows lacking either value: a NaN transcript would otherwise end up
    # in the returned list as a float and break any later string join.
    complete_rows = df.dropna(subset=['Sprecher', 'Transkript'])

    # Cast to str before using the .str accessor so that non-string speaker
    # labels (or an empty, non-object column) cannot raise or yield an NA mask.
    speaker_matches = complete_rows['Sprecher'].astype(str).str.startswith(sprecher_prefix)

    return complete_rows.loc[speaker_matches, 'Transkript'].astype(str).tolist()

def transkript_to_string(transkript_list):
    """Concatenate transcript segments into one string, one segment per line."""
    line_break = "\n"
    return line_break.join(transkript_list)

In [None]:
def divide_into_chunks(text, max_words_per_chunk, language="english"):
    """Split ``text`` into chunks of whole sentences with a bounded token count.

    Sentences are never split: they are appended to the current chunk until
    adding the next one would push the chunk over ``max_words_per_chunk``
    tokens, at which point a new chunk is started. A single sentence that is
    longer than the limit therefore becomes its own (oversized) chunk.

    Args:
        text: The text to split.
        max_words_per_chunk: Upper bound on the number of ``word_tokenize``
            tokens per chunk (punctuation counts as a token).
        language: Language name forwarded to NLTK's sentence and word
            tokenizers. Defaults to ``"english"``, which is what the
            tokenizers use when no language is given; pass ``"german"`` for
            German transcripts.

    Returns:
        list[str]: The chunks in order. Empty for empty input; never contains
        an empty string.
    """
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sent_tokenize(text, language=language):
        words_in_sentence = len(word_tokenize(sentence, language=language))

        # Only flush when there is something to flush; otherwise an oversized
        # first sentence would emit an empty chunk.
        if current_chunk and current_word_count + words_in_sentence > max_words_per_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_word_count = 0

        current_chunk.append(sentence)
        current_word_count += words_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Alternative: use the simpler word-based chunking below if the sentence-based chunking above uses too much memory.

# def divide_into_chunks(text, max_words_per_chunk):
#     words = word_tokenize(text)
#     return [' '.join(words[i:i + max_words_per_chunk]) for i in range(0, len(words), max_words_per_chunk)]

In [None]:
# class Summarizer:
#     def __init__(self):
#         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         self.tokenizer = None
#         self.model = None
#         self.temp_dir = tempfile.mkdtemp()  # Initialize temporary directory
#         self.load_model()  # Ensure the model and tokenizer are loaded upon initialization
        

#     def load_model(self):
#         gc.collect()  # Clear CPU memory

#         # Ensure tokenizer and model are loaded only if not already set
#         if self.tokenizer is None or self.model is None:
#             print("Loading model...")
#             token = "<HF_TOKEN>"  # placeholder: never commit a real access token; load it from the environment instead
#             try:
#                 self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", token=token)
#                 self.model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", token=token)
#                 self.model = self.model.to(self.device)  # Move the model to the appropriate device (CPU or GPU)
#                 print("Model and tokenizer successfully loaded.")
#             except Exception as e:
#                 print(f"Error loading model: {e}")
#                 raise
#         else:
#             print("Model already loaded and in use.")


#     def forward_pass(self, input_ids):
#         """Performs a forward pass using the model with the given input IDs."""
#         with torch.no_grad():
#             outputs = self.model.generate(
#                 input_ids=input_ids,
#                 max_length=self.max_tokens,
#                 temperature=self.temperature,
#                 top_k=self.top_k,
#                 num_return_sequences=1
#             )
#         return outputs


#     def process_chunk(self, chunk, chunk_id):
#         # Ensure the model is loaded
#         self.load_model()

#         input_ids = self.tokenizer(chunk, return_tensors="pt").input_ids.to(self.device)

#         # Use `max_new_tokens` instead of `max_length` to limit the generation
#         with torch.no_grad():
#             outputs = self.model.generate(
#                 input_ids=input_ids,
#                 max_new_tokens=self.max_tokens,  # Control the length of the generated output
#                 temperature=self.temperature,
#                 top_k=self.top_k,
#                 num_return_sequences=1
#             )

#         generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

#         output_file = os.path.join(self.temp_dir, f"chunk_{chunk_id}.txt")
#         with open(output_file, 'w') as f:
#             f.write(generated_text)
#         return output_file


#     def process_all_chunks(self, chunk_prompts):
#         """Processes all chunks by invoking process_chunk for each one with retry logic."""
#         outputs = []
#         for idx, prompt in enumerate(chunk_prompts):
#             output = self.invoke_with_retry(self.process_chunk, prompt, chunk_id=idx)
#             outputs.append(output)
#         return outputs



#     def clear_model(self):
#         print("Clearing model from memory...")
#         del self.model
#         del self.tokenizer
#         torch.cuda.empty_cache()  # Clear CUDA memory if used
#         gc.collect()  # Force garbage collection to free memory

#     def invoke_with_retry(self, func, *args, retries=3, **kwargs):
#         for attempt in range(retries):
#             try:
#                 return func(*args, **kwargs)
#             except Exception as e:
#                 print(f"Attempt {attempt + 1} failed: {e}")
#                 if attempt == retries - 1:
#                     raise  # Re-raise the exception if the last attempt fails



#     def generate_biography(self, input_text):
#         # Ensure the model is loaded
#         self.load_model()
        
#         # Set specific parameters for this method
#         self.temperature = 0.1
#         self.max_tokens = 1024
#         self.top_k = 1
        
#         prompt = """
#         Du bist ein deutsches Textzusammenfassungsmodell. Erstellen Sie eine prägnante Zusammenfassung des obigen Textes in deutscher Sprache innerhalb von 500 Wörtern. Konzentrieren Sie sich auf die wichtigsten Punkte und bewahren Sie Klarheit. Work only with the data given and do not provide your conclusions or interpretations of the biography or the data provided. Work only with the data.
#         """
        
#         chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)
#         output_files = self.process_all_chunks([chunk + prompt for chunk in chunks])
        
#         full_biography = assemble_biography_from_disk(output_files)
#         # self.clear_model()
#         return full_biography



#     def extend_biography(self, partial_biography, input_text):
#         try:
#             # Ensure the model is loaded
#             self.load_model()

#             # Set specific parameters for this method
#             self.temperature = 0.1
#             self.max_tokens = 800
#             self.top_k = 1

#             # Construct the prompt
#             prompt = f"""
#             Hier ist der erste Teil der Biografie: {partial_biography}
#             Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
#             """

#             # Divide the input text into manageable chunks
#             chunks = divide_into_chunks(input_text, max_words_per_chunk=25000)

#             # Process all chunks and generate the extended biography
#             output_files = self.process_all_chunks([chunk + prompt for chunk in chunks])

#             # Assemble the full extended biography from the output files
#             full_extended_biography = assemble_biography_from_disk(output_files)

#             # Remove any incomplete sentences from the final output
#             return self.remove_incomplete_sentence(full_extended_biography)
        
#         except Exception as e:
#             print(f"An error occurred while extending the biography: {str(e)}")
#             return None
        
#         # finally:
#         #     # Clear the model from memory to free resources
#         #     self.clear_model()



#     def refine_biography_to_500_words(self, extended_biography, input_text):
#         try:
#             # Ensure the model is loaded
#             self.load_model()

#             # Set specific parameters for this method
#             self.temperature = 0.1
#             self.max_tokens = 512
#             self.top_k = 1

#             # Construct the prompt
#             prompt = f"""
#             Hier ist der erste Teil der Biografie: {extended_biography}
#             Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
#             """

#             # Divide the input text into manageable chunks
#             chunks = divide_into_chunks(input_text, max_words_per_chunk=25000)

#             # Process all chunks and generate the refined biography
#             output_files = self.process_all_chunks([chunk + prompt for chunk in chunks])

#             # Assemble the full refined biography from the output files
#             full_refined_biography = assemble_biography_from_disk(output_files)

#             # Remove any incomplete sentences from the final output
#             return self.remove_incomplete_sentence(full_refined_biography)
        
#         except Exception as e:
#             print(f"An error occurred while refining the biography: {str(e)}")
#             return None
        
#         finally:
#             # Clear the model from memory to free resources
#             self.clear_model()



#     def remove_incomplete_sentence(self, biography):
#         words = nltk.word_tokenize(biography)
#         if len(words) <= 800:
#             return biography
        
#         truncated_words = words[:800]
#         truncated_text = " ".join(truncated_words)
#         last_full_stop_index = truncated_text.rfind('.')
        
#         if last_full_stop_index != -1:
#             return truncated_text[:last_full_stop_index + 1]
#         else:
#             return truncated_text

In [None]:
class Summarizer:
    """Three-stage biography summarizer built on a causal language model.

    The stages are ``generate_biography`` -> ``extend_biography`` ->
    ``refine_biography_to_500_words``. Each stage splits the transcript into
    chunks with ``divide_into_chunks``, appends a stage-specific prompt to
    every chunk, generates a completion per chunk and joins the completions.

    The model is loaded lazily on first use and released by ``clear_model``
    (which ``refine_biography_to_500_words`` always calls when it finishes);
    it is transparently reloaded on the next use.

    Attributes:
        device: ``torch.device`` the model is moved to (CUDA if available).
        model_name: Hugging Face model identifier to load.
        hf_token: Optional access token; falls back to the ``HF_TOKEN``
            environment variable when ``None``.
        tokenizer / model: Loaded objects, or ``None`` while unloaded.
        temp_dir: Temporary directory created at construction time. Nothing
            in this class writes to it; the owner is responsible for
            removing it.
        load_model_once: ``True`` while model and tokenizer are loaded.
        temperature / max_tokens / top_k: Generation settings used by the
            next ``generate`` call; each stage overwrites them.
    """

    DEFAULT_MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    # Maximum number of tokens (as counted by word_tokenize) per input chunk.
    MAX_WORDS_PER_CHUNK = 10000

    def __init__(self, model_name=DEFAULT_MODEL_NAME, hf_token=None):
        """Prepare an unloaded summarizer.

        Args:
            model_name: Hugging Face model identifier.
            hf_token: Access token for gated models. When omitted, the
                ``HF_TOKEN`` environment variable is used at load time.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = model_name
        self.hf_token = hf_token
        self.tokenizer = None
        self.model = None
        self.temp_dir = tempfile.mkdtemp()  # Initialize temporary directory
        self.load_model_once = False  # Flag to check if the model has been loaded

        # Defaults so that process_chunk / forward_pass are usable even before
        # one of the stage methods has set its own values.
        self.temperature = 0.1
        self.max_tokens = 512
        self.top_k = 1

    def load_model(self):
        """Load tokenizer and model unless they are already loaded.

        Raises:
            Exception: Whatever the underlying ``from_pretrained`` calls
                raise; the error is printed and re-raised.
        """
        gc.collect()  # Clear CPU memory

        if self.load_model_once:
            print("Model already loaded and in use.")
            return

        print("Loading model...")
        # Credentials must come from the caller or the environment, never
        # from the source code.
        token = self.hf_token or os.environ.get("HF_TOKEN")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, token=token)
            loaded_model = AutoModelForCausalLM.from_pretrained(self.model_name, token=token)
            # Move the model to the appropriate device (CPU or GPU)
            self.model = loaded_model.to(self.device)
        except Exception as e:
            # Leave the instance in a clean "unloaded" state for a later retry.
            self.tokenizer = None
            self.model = None
            print(f"Error loading model: {e}")
            raise

        print("Model and tokenizer successfully loaded.")
        self.load_model_once = True

    def forward_pass(self, input_ids, attention_mask=None):
        """Run ``model.generate`` for the given prompt tokens.

        Args:
            input_ids: Tensor of prompt token ids, already on ``self.device``.
            attention_mask: Optional attention mask matching ``input_ids``.

        Returns:
            The raw output of ``model.generate`` (for a decoder-only model
            this is the prompt tokens followed by the newly generated ones).
        """
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # Bound the completion length, not prompt + completion:
                # the prompts here are far longer than max_tokens.
                max_new_tokens=self.max_tokens,
                temperature=self.temperature,
                top_k=self.top_k,
                num_return_sequences=1
            )
        return outputs

    def process_chunk(self, chunk, chunk_id):
        """Generate the completion for one prompt.

        Args:
            chunk: Full prompt text (transcript chunk plus instructions).
            chunk_id: Index of the chunk; accepted for call compatibility and
                not used by the generation itself.

        Returns:
            str: Only the newly generated text, without the echoed prompt.
        """
        self.load_model()

        encoded = self.tokenizer(chunk, return_tensors="pt")
        input_ids = encoded.input_ids.to(self.device)
        attention_mask = encoded.attention_mask.to(self.device)

        outputs = self.forward_pass(input_ids, attention_mask)

        # The output sequence starts with the prompt tokens; slice them off so
        # the returned text is the completion rather than a copy of the
        # transcript followed by the completion.
        prompt_length = input_ids.shape[-1]
        generated_text = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        # Clear memory after processing each chunk
        del encoded, input_ids, attention_mask, outputs
        gc.collect()

        return generated_text

    def process_all_chunks(self, chunk_prompts):
        """Process every prompt with retry logic and join the completions.

        Args:
            chunk_prompts: Iterable of prompt strings.

        Returns:
            str: The completions joined by a single space.
        """
        outputs = [
            self.invoke_with_retry(self.process_chunk, prompt, chunk_id=idx)
            for idx, prompt in enumerate(chunk_prompts)
        ]
        return " ".join(outputs)

    def clear_model(self):
        """Release model and tokenizer; safe to call repeatedly.

        The attributes are reset to ``None`` (not deleted) and the loaded flag
        is cleared, so a later ``load_model`` reloads instead of assuming the
        model is still present.
        """
        print("Clearing model from memory...")
        self.model = None
        self.tokenizer = None
        self.load_model_once = False
        gc.collect()  # Drop the last references before releasing GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # Clear CUDA memory if used

    def invoke_with_retry(self, func, *args, retries=3, **kwargs):
        """Call ``func`` up to ``retries`` times and return its first success.

        Every failure is printed; the exception of the last attempt is
        re-raised.
        """
        for attempt in range(retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt == retries - 1:
                    raise  # Re-raise the exception if the last attempt fails

    def generate_biography(self, input_text):
        """Stage 1: summarise the transcript into an initial biography.

        Args:
            input_text: The full transcript text.

        Returns:
            str: The joined per-chunk summaries.
        """
        self.load_model()
        
        self.temperature = 0.1
        self.max_tokens = 512  # Lowered from 1024
        self.top_k = 1
        
        prompt = """
        Du bist ein deutsches Textzusammenfassungsmodell. Erstellen Sie eine prägnante Zusammenfassung des obigen Textes in deutscher Sprache innerhalb von 500 Wörtern. Konzentrieren Sie sich auf die wichtigsten Punkte und bewahren Sie Klarheit. Work only with the data given and do not provide your conclusions or interpretations of the biography or the data provided. Work only with the data.
        """
        
        chunks = divide_into_chunks(input_text, max_words_per_chunk=self.MAX_WORDS_PER_CHUNK)
        output_biography = self.process_all_chunks([chunk + prompt for chunk in chunks])
        
        return output_biography

    def extend_biography(self, partial_biography, input_text):
        """Stage 2: ask the model to add missing details to the biography.

        Args:
            partial_biography: Biography produced by the previous stage.
            input_text: The full transcript text.

        Returns:
            str | None: The extended biography capped at 800 tokens, or
            ``None`` if any error occurred (the error is printed).
        """
        try:
            self.load_model()

            self.temperature = 0.1
            self.max_tokens = 800
            self.top_k = 1

            prompt = f"""
            Hier ist der erste Teil der Biografie: {partial_biography}
            Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
            """

            chunks = divide_into_chunks(input_text, max_words_per_chunk=self.MAX_WORDS_PER_CHUNK)

            extended_biography = self.process_all_chunks([chunk + prompt for chunk in chunks])

            return self.remove_incomplete_sentence(extended_biography)
        
        except Exception as e:
            print(f"An error occurred while extending the biography: {str(e)}")
            return None

    def refine_biography_to_500_words(self, extended_biography, input_text):
        """Stage 3: produce the final biography, capped at 500 tokens.

        The model is always released afterwards, whether or not the stage
        succeeded.

        Args:
            extended_biography: Biography produced by the previous stage.
            input_text: The full transcript text.

        Returns:
            str | None: The refined biography, or ``None`` if any error
            occurred (the error is printed).
        """
        try:
            self.load_model()

            self.temperature = 0.1
            self.max_tokens = 512
            self.top_k = 1

            # NOTE(review): this prompt is identical to the one used by
            # extend_biography and does not ask for a 500-word result;
            # confirm whether a dedicated refinement prompt was intended.
            prompt = f"""
            Hier ist der erste Teil der Biografie: {extended_biography}
            Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
            """

            chunks = divide_into_chunks(input_text, max_words_per_chunk=self.MAX_WORDS_PER_CHUNK)

            refined_biography = self.process_all_chunks([chunk + prompt for chunk in chunks])

            # Enforce the limit promised by the method name.
            return self.remove_incomplete_sentence(refined_biography, max_words=500)
        
        except Exception as e:
            print(f"An error occurred while refining the biography: {str(e)}")
            return None
        
        finally:
            self.clear_model()

    def remove_incomplete_sentence(self, biography, max_words=800):
        """Cap ``biography`` at ``max_words`` tokens, ending on a full stop.

        Args:
            biography: Text to shorten.
            max_words: Maximum number of ``word_tokenize`` tokens to keep
                (punctuation counts as a token).

        Returns:
            str: ``biography`` unchanged when it is within the limit.
            Otherwise the first ``max_words`` tokens re-joined with single
            spaces and cut after the last ``'.'``; if there is no ``'.'`` the
            re-joined tokens are returned as they are.
        """
        words = word_tokenize(biography)
        if len(words) <= max_words:
            return biography

        truncated_text = " ".join(words[:max_words])
        last_full_stop_index = truncated_text.rfind('.')

        if last_full_stop_index == -1:
            return truncated_text
        return truncated_text[:last_full_stop_index + 1]

In [None]:
def assemble_biography_from_disk(output_files):
    """Read each file in ``output_files`` and join the stripped contents.

    Args:
        output_files: Iterable of paths to text files, in the order their
            contents should appear.

    Returns:
        str: The whitespace-stripped file contents separated by one space.
    """
    def _read_stripped(path):
        # Load one part and trim surrounding whitespace.
        with open(path, 'r') as handle:
            return handle.read().strip()

    return " ".join(_read_stripped(path) for path in output_files)


def cleanup_temp_files(temp_dir):
    """Delete ``temp_dir`` together with everything stored inside it."""
    shutil.rmtree(path=temp_dir)


def read_csv(file_path):
    """Load a tab-separated file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        pandas.DataFrame | None: The parsed table, or ``None`` when reading
        failed for any reason (the error is printed).
    """
    try:
        return pd.read_csv(file_path, sep='\t')
    except Exception as e:
        print(f"Error reading CSV file: {e}")
    return None
        

def save_text_to_pdf(text, pdf_path):
    """Write ``text`` to a PDF file, one paragraph per input line.

    Args:
        text: The text to render; split on ``'\\n'``.
        pdf_path: Destination path of the PDF.

    Note:
        The built-in Arial font only covers Latin-1. Characters outside that
        range (for example typographic quotes or dashes that language models
        like to emit) are replaced by ``'?'`` instead of aborting the export
        with an encoding error. German umlauts and ``ß`` are preserved.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in text.split('\n'):
        # Round-trip through Latin-1 so unsupported characters become '?'.
        printable_line = line.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(0, 10, printable_line)

    pdf.output(pdf_path)

In [None]:
# def process_file(file_path):
#     summarizer = Summarizer()
    
#     if file_path.endswith('.csv'):
#         df = read_csv(file_path)
#         if df is not None:
#             sprecher_prefix = 'IP_'
#             transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
#             transcript_data = transkript_to_string(transkript_list)
#         else:
#             print("Failed to read CSV file.")
#             return
#     else:
#         print("Unsupported file format.")
#         return

In [None]:
def process_all_files_in_directory(directory_path):
    """Summarise every ``.csv`` transcript in a directory into a PDF.

    For each CSV file the rows of speakers whose label starts with ``'IP_'``
    are extracted, run through the three summarizer stages (generate, extend,
    refine) and the result is written to
    ``<directory_path>/output_pdfs/<file stem>.pdf``.

    Files are processed in sorted order. Sub-directories are skipped
    silently, other non-CSV files are reported and skipped. A file is also
    skipped (with a message) when it cannot be read, contains no matching
    transcript rows, or no final biography could be produced.

    Args:
        directory_path: Directory containing the transcript CSV files.
    """
    output_directory = os.path.join(directory_path, "output_pdfs")
    os.makedirs(output_directory, exist_ok=True)

    for file_name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file_name)

        # The output folder lives inside the input folder; do not report it
        # (or any other directory) as an unsupported file.
        if os.path.isdir(file_path):
            continue

        if not file_name.lower().endswith('.csv'):
            print(f"Unsupported file format: {file_name}")
            continue

        print(f"Processing file: {file_name}")

        df = read_csv(file_path)
        if df is None:
            print(f"Failed to read CSV file: {file_name}")
            continue

        sprecher_prefix = 'IP_'
        transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
        transcript_data = transkript_to_string(transkript_list)

        if not transcript_data.strip():
            print(f"No transcript rows found in {file_name}; skipping.")
            continue

        # One summarizer per file: the refine stage releases the model when it
        # finishes, so a fresh instance guarantees a clean load for every file.
        summarizer = Summarizer()
        try:
            print("Generating initial biography...")
            initial_biography = summarizer.generate_biography(transcript_data)
            print("Initial biography generated:")
            print(initial_biography)

            print("Extending biography...")
            extended_biography = summarizer.extend_biography(initial_biography, transcript_data)
            print("Biography extended:")
            print(extended_biography)

            if extended_biography is None:
                # The extension failed; refine the initial biography rather
                # than feeding the literal text "None" into the next prompt.
                extended_biography = initial_biography

            print("Refining biography...")
            refined_biography = summarizer.refine_biography_to_500_words(extended_biography, transcript_data)
            print("Biography refined.")
            print(refined_biography)

            if not refined_biography:
                print(f"No biography could be produced for {file_name}; skipping PDF export.")
                continue

            # splitext swaps only the real extension, even if '.csv' also
            # occurs elsewhere in the file name.
            pdf_name = os.path.splitext(file_name)[0] + '.pdf'
            output_pdf_path = os.path.join(output_directory, pdf_name)
            save_text_to_pdf(refined_biography.strip(), output_pdf_path)
            print(f"Processed and saved {file_name} as PDF.")
        finally:
            # Release the model only if it is still held (e.g. an earlier
            # stage raised); the refine stage normally has done so already.
            if getattr(summarizer, "model", None) is not None:
                summarizer.clear_model()

            # Remove the summarizer's temporary directory so repeated runs do
            # not accumulate empty folders.
            temp_dir = getattr(summarizer, "temp_dir", None)
            if temp_dir and os.path.isdir(temp_dir):
                cleanup_temp_files(temp_dir)

In [None]:
# Profiling block to analyze performance
import cProfile
import pstats
import io

if __name__ == "__main__":
    # Input folder with the transcript CSV files. Set the TRANSCRIPTS_DIR
    # environment variable to run on another machine; the original local
    # path is kept as the fallback.
    directory_path = os.environ.get(
        "TRANSCRIPTS_DIR",
        "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/",
    )

    profiler = cProfile.Profile()
    profiler.enable()
    try:
        process_all_files_in_directory(directory_path)
    finally:
        # Always stop the profiler and report, so the statistics collected up
        # to that point are still available when processing fails.
        profiler.disable()
        stream = io.StringIO()
        stats = pstats.Stats(profiler, stream=stream).sort_stats('cumulative')
        stats.print_stats()
        print(stream.getvalue())