In [1]:
# %pip install pandas
# %pip install nltk
# %pip install langchain-together
# %pip install fpdf

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from langchain_together import Together
import time
import os
from fpdf import FPDF

nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\EDWIN
[nltk_data]     SAMUEL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def extract_rows_with_sprecher(df, sprecher_prefix):
    df = df.dropna(subset=['Sprecher'])
    filtered_rows = df[df['Sprecher'].str.startswith(sprecher_prefix)]
    transkript_list = filtered_rows['Transkript'].tolist()
    return transkript_list

def transkript_to_string(transkript_list):
    return "\n".join(transkript_list)

In [4]:
def divide_into_chunks(text, max_words_per_chunk):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words_in_sentence = len(word_tokenize(sentence))
        if current_word_count + words_in_sentence > max_words_per_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = words_in_sentence
        else:
            current_chunk.append(sentence)
            current_word_count += words_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [5]:
class Summarizer:
    def __init__(self):
        self.llm = Together(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            temperature=0.1,
            max_tokens= 1024,
            top_k=1,
            together_api_key="0c7ca1d8efd796b972cf1a5593343213bd4ab46451a2cecbcac333fa6f02793f"
        )


    def invoke_with_retry(self, full_input, retries=3, retry_delay=30):
        for attempt in range(retries):
            try:
                output_summary = self.llm.invoke(full_input)
                return output_summary
            except Exception as e:
                if "524" in str(e) and attempt < retries - 1:
                    print(f"Error 524: Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(retry_delay)
                    retries += 1

                else:
                    raise

    def generate_biography(self, input_text):
        self.llm.temperature = 0.1
        self.llm.max_tokens = 1024
        self.llm.top_k = 1
        
        prompt = """

        I would like you to generate a detailed biography of an interviewee based on the following structured questions in German language. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life overall? Reflect on how these events impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, detailed, and presents a well-rounded view of the interviewee's life journey. Mention all dates - day month and year, don't exclude any. I repreat, mention all dates.       
        """
        
        chunks = divide_into_chunks(input_text, max_words_per_chunk = 70000)
        biography_parts = []
        
        for chunk in chunks:
            full_input = chunk + prompt
            output_summary = self.invoke_with_retry(full_input)
            biography_parts.append(output_summary.strip())

        full_biography = " ".join(biography_parts)
        return full_biography

    def extend_biography(self, partial_biography, input_text):
        self.llm.temperature = 0.1
        self.llm.max_tokens = 800 
        self.llm.top_k = 1
        
        prompt = f"""
        Hier ist der erste Teil der Biografie: {partial_biography}
        I would like you to generate a detailed biography of an interviewee based on the following structured questions in German language. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life overall? Reflect on how these events impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, detailed, and presents a well-rounded view of the interviewee's life journey. Mention all dates - day month and year, don't exclude any. I repreat, mention all dates.
        """

        chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)
        extended_biography_parts = []
        
        for chunk in chunks:
            full_input = chunk + prompt
            output_summary = self.invoke_with_retry(full_input)
            extended_biography_parts.append(output_summary.strip())

        full_extended_biography = " ".join(extended_biography_parts)
        return self.remove_incomplete_sentence(full_extended_biography)

    def refine_biography_to_500_words(self, extended_biography, input_text):
        self.llm.temperature = 0.1
        self.llm.max_tokens = 512
        self.llm.top_k = 1

        prompt = f"""
Hier ist der erste Teil der Biografie: {extended_biography}
        I would like you to generate a detailed biography of an interviewee based on the following structured questions in German language. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life overall? Reflect on how these events impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, detailed, and presents a well-rounded view of the interviewee's life journey. Mention all dates - day month and year, don't exclude any. I repreat, mention all dates.
                """
        
        chunks = divide_into_chunks(input_text, max_words_per_chunk=70000)
        refined_parts = []

        for chunk in chunks:
            full_input = chunk + prompt
            output_summary = self.invoke_with_retry(full_input)
            refined_parts.append(output_summary.strip())

        full_refined_biography = " ".join(refined_parts)
        return self.remove_incomplete_sentence(full_refined_biography)

    def remove_incomplete_sentence(self, biography):
        words = word_tokenize(biography)
        if len(words) <= 500:
            return biography
        
        truncated_words = words[:500]
        truncated_text = " ".join(truncated_words)
        last_full_stop_index = truncated_text.rfind('.')
        
        if last_full_stop_index != -1:
            return truncated_text[:last_full_stop_index + 1]
        else:
            return truncated_text

In [6]:
def read_csv(file_path):
    try:
        df = pd.read_csv(file_path, sep='\t')
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None


def process_file(file_path):
    summarizer = Summarizer()
    
    if file_path.endswith('.csv'):
        df = read_csv(file_path)
        if df is not None:
            # Added to handle both 'IP_' and 'INT_' prefixes
            transkript_list_ip = extract_rows_with_sprecher(df, 'IP_')  # Process rows with 'IP_' prefix
            transkript_list_int = extract_rows_with_sprecher(df, 'INT_')  # Process rows with 'INT_' prefix
        
            # Combine both transcript lists into one
            combined_transkript_list = transkript_list_ip + transkript_list_int
            transcript_data = transkript_to_string(combined_transkript_list)  # Convert combined list to string
        else:
            print("Failed to read CSV file.")
            return
    else:
        print("Unsupported file format.")
        return

In [7]:
#     # Step 1: Generate the initial biography with chunking
#     initial_biography = summarizer.generate_biography(transcript_data)
#     print("Initial Biography : \n", initial_biography.strip())

#     # Step 2: Extend the biography with additional details, also using chunking
#     extended_biography = summarizer.extend_biography(initial_biography, transcript_data)
#     print("Extended Biography : \n", extended_biography.strip())

#     # Step 3: Refine the biography to 500 words and remove incomplete sentences
#     refined_biography = summarizer.refine_biography_to_500_words(extended_biography, transcript_data)
#     print("Refined Biography (500 words) : \n", refined_biography.strip())

# if __name__ == "__main__":
#     file_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0001_er_2024_04_23.csv"
#     process_file(file_path)

In [8]:
def save_text_to_pdf(text, pdf_path):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    
    # Add text to the PDF
    for line in text.split('\n'):
        pdf.multi_cell(0, 10, line)
    
    # Save the PDF
    pdf.output(pdf_path)

def process_all_files_in_directory(directory_path):
    summarizer = Summarizer()
    output_directory = os.path.join(directory_path, "output_pdfs")
    
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    # Iterate over all CSV files in the directory
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory_path, file_name)
            df = read_csv(file_path)
            
            if df is not None:
                # Added to handle both 'IP_' and 'INT_' prefixes
                transkript_list_ip = extract_rows_with_sprecher(df, 'IP_')  # Process rows with 'IP_' prefix
                transkript_list_int = extract_rows_with_sprecher(df, 'INT_')  # Process rows with 'INT_' prefix
        
                # Combine both transcript lists into one
                combined_transkript_list = transkript_list_ip + transkript_list_int
                transcript_data = transkript_to_string(combined_transkript_list)
                
                # Step 1: Generate the initial biography with chunking
                initial_biography = summarizer.generate_biography(transcript_data)
                
                # Step 2: Extend the biography with additional details, also using chunking
                #extended_biography = summarizer.extend_biography(initial_biography, transcript_data)
                
                # Step 3: Refine the biography to 500 words and remove incomplete sentences
                #refined_biography = summarizer.refine_biography_to_500_words(extended_biography, transcript_data)
                
                # Save the refined biography as a PDF
                output_pdf_path = os.path.join(output_directory, file_name.replace('.csv', '.pdf'))
                save_text_to_pdf(initial_biography.strip(), output_pdf_path)
                print(f"Processed and saved {file_name} as PDF.")
            else:
                print(f"Failed to read CSV file: {file_name}")
        else:
            print(f"Unsupported file format: {file_name}")

if __name__ == "__main__":
    directory_path = "C:/Users/EDWIN SAMUEL/OneDrive/Desktop/Pipeline14/Final_Code/Flask_Python/WG_ [EXTERN]  Transcripts and Biographies/"
    #directory_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/"
    process_all_files_in_directory(directory_path)

Error reading CSV file: [Errno 13] Permission denied: 'C:/Users/EDWIN SAMUEL/OneDrive/Desktop/Pipeline14/WG_ [EXTERN]  Transcripts and Biographies/adg0001_er_2024_04_23.csv'
Failed to read CSV file: adg0001_er_2024_04_23.csv
Unsupported file format: output_pdfs
