In [None]:
# %pip install pandas
# %pip install nltk
# %pip install langchain-together
# %pip install fpdf

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from langchain_together import Together
import time
import os
from fpdf import FPDF

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asha4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def extract_rows_with_sprecher(df, sprecher_prefix):
    df = df.dropna(subset=['Sprecher'])
    filtered_rows = df[df['Sprecher'].str.startswith(sprecher_prefix)]
    transkript_list = filtered_rows['Transkript'].tolist()
    return transkript_list

def transkript_to_string(transkript_list):
    return "\n".join(transkript_list)

In [None]:
def divide_into_chunks(text, max_words_per_chunk):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words_in_sentence = len(word_tokenize(sentence))
        if current_word_count + words_in_sentence > max_words_per_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = words_in_sentence
        else:
            current_chunk.append(sentence)
            current_word_count += words_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [3]:
class Summarizer:
    def __init__(self):
        self.llm = Together(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            temperature=0.1,
            max_tokens= 1024,
            top_k=1,
            together_api_key="3547c556949e8b9beea25d84c997eebd60a491f60e33548b59a51f86f313a277"
        )

#160be45af627da8e0d12935e59de9db8648a6327ba2a81a563bacb39750a052e

    def invoke_with_retry(self, full_input, retries=3, retry_delay=30):
        for attempt in range(retries):
            try:
                output_summary = self.llm.invoke(full_input)
                return output_summary
            except Exception as e:
                if "524" in str(e) and attempt < retries - 1:
                    print(f"Error 524: Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(retry_delay)
                    retries += 1

                else:
                    raise

    def generate_biography(self, input_text):
        self.llm.temperature = 0.1
        self.llm.max_tokens = 1024
        self.llm.top_k = 1
        
        prompt = """
        Du bist ein deutsches Textzusammenfassungsmodell. Erstellen Sie eine prägnante Zusammenfassung des obigen Textes in deutscher Sprache innerhalb von 500 Wörtern. Konzentrieren Sie sich auf die wichtigsten Punkte und bewahren Sie Klarheit. Work only with the data given and do not provide your conclusions or interpretations of the biography or the data provided. Work only with the data.
        """
        
        chunks = divide_into_chunks(input_text, max_words_per_chunk = 70000)
        biography_parts = []
        
        for chunk in chunks:
            full_input = chunk + prompt
            output_summary = self.invoke_with_retry(full_input)
            biography_parts.append(output_summary.strip())

        full_biography = " ".join(biography_parts)
        return full_biography

    def extend_biography(self, partial_biography, input_text):
        self.llm.temperature = 0.1
        self.llm.max_tokens = 800 
        self.llm.top_k = 1
        
        prompt = f"""
        Hier ist der erste Teil der Biografie: {partial_biography}
        Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
        """

        chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)
        extended_biography_parts = []
        
        for chunk in chunks:
            full_input = chunk + prompt
            output_summary = self.invoke_with_retry(full_input)
            extended_biography_parts.append(output_summary.strip())

        full_extended_biography = " ".join(extended_biography_parts)
        return self.remove_incomplete_sentence(full_extended_biography)

    def refine_biography_to_500_words(self, extended_biography, input_text):
        self.llm.temperature = 0.1
        self.llm.max_tokens = 512
        self.llm.top_k = 1

        prompt = f"""
Hier ist der erste Teil der Biografie: {extended_biography}
        Nun sehen Sie sich die Daten erneut an und ergänzen Sie die Biografie um die fehlenden Informationen.
                """
        
        chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)
        refined_parts = []

        for chunk in chunks:
            full_input = chunk + prompt
            output_summary = self.invoke_with_retry(full_input)
            refined_parts.append(output_summary.strip())

        full_refined_biography = " ".join(refined_parts)
        return self.remove_incomplete_sentence(full_refined_biography)

    def remove_incomplete_sentence(self, biography):
        words = word_tokenize(biography)
        if len(words) <= 500:
            return biography
        
        truncated_words = words[:500]
        truncated_text = " ".join(truncated_words)
        last_full_stop_index = truncated_text.rfind('.')
        
        if last_full_stop_index != -1:
            return truncated_text[:last_full_stop_index + 1]
        else:
            return truncated_text

In [None]:
def read_csv(file_path):
    try:
        df = pd.read_csv(file_path, sep='\t')
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None


def process_file(file_path):
    summarizer = Summarizer()
    
    if file_path.endswith('.csv'):
        df = read_csv(file_path)
        if df is not None:
            sprecher_prefix = 'IP_'
            transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
            transcript_data = transkript_to_string(transkript_list)
        else:
            print("Failed to read CSV file.")
            return
    else:
        print("Unsupported file format.")
        return

In [None]:
#     # Step 1: Generate the initial biography with chunking
#     initial_biography = summarizer.generate_biography(transcript_data)
#     print("Initial Biography : \n", initial_biography.strip())

#     # Step 2: Extend the biography with additional details, also using chunking
#     extended_biography = summarizer.extend_biography(initial_biography, transcript_data)
#     print("Extended Biography : \n", extended_biography.strip())

#     # Step 3: Refine the biography to 500 words and remove incomplete sentences
#     refined_biography = summarizer.refine_biography_to_500_words(extended_biography, transcript_data)
#     print("Refined Biography (500 words) : \n", refined_biography.strip())

# if __name__ == "__main__":
#     file_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0001_er_2024_04_23.csv"
#     process_file(file_path)

In [4]:
def save_text_to_pdf(text, pdf_path):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    
    # Add text to the PDF
    for line in text.split('\n'):
        pdf.multi_cell(0, 10, line)
    
    # Save the PDF
    pdf.output(pdf_path)

def process_all_files_in_directory(directory_path):
    summarizer = Summarizer()
    output_directory = os.path.join(directory_path, "output_pdfs")
    
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    # Iterate over all CSV files in the directory
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory_path, file_name)
            df = read_csv(file_path)
            
            if df is not None:
                sprecher_prefix = 'IP_'
                transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
                transcript_data = transkript_to_string(transkript_list)
                
                # Step 1: Generate the initial biography with chunking
                initial_biography = summarizer.generate_biography(transcript_data)
                
                # Step 2: Extend the biography with additional details, also using chunking
                extended_biography = summarizer.extend_biography(initial_biography, transcript_data)
                
                # Step 3: Refine the biography to 500 words and remove incomplete sentences
                refined_biography = summarizer.refine_biography_to_500_words(extended_biography, transcript_data)
                
                # Save the refined biography as a PDF
                output_pdf_path = os.path.join(output_directory, file_name.replace('.csv', '.pdf'))
                save_text_to_pdf(refined_biography.strip(), output_pdf_path)
                print(f"Processed and saved {file_name} as PDF.")
            else:
                print(f"Failed to read CSV file: {file_name}")
        else:
            print(f"Unsupported file format: {file_name}")

if __name__ == "__main__":
    directory_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/"
    process_all_files_in_directory(directory_path)

Unsupported file format: ADG0001 Bio.txt
Processed and saved adg0001_er_2024_04_23.csv as PDF.
Unsupported file format: ADG0002 Bio.txt
Processed and saved adg0002_er_2024_04_23.csv as PDF.
Unsupported file format: Output
Unsupported file format: output_pdfs
