Restart the terminal after running the two blocks below

In [1]:
# %pip install pandas
# %pip install nltk
# %pip install fpdf
# %pip install ipywidgets
# %pip install transformers==4.43.1
# %pip install vllm==0.5.3.post1
# %pip install torch
# %pip install IPython.display
# %pip install gc

In [2]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token="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")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token="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")

In [3]:
import pandas as pd
from fpdf import FPDF
from ipywidgets import FileUpload, Button
from IPython.display import display
from io import StringIO
import torch
import gc
import nltk
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
def read_csv(file):
    try:
        # Use BytesIO to read the file correctly
        file.stream.seek(0)
        file_content = file.read()  # Read file content
        df = pd.read_csv(BytesIO(file_content), sep='\t')  # Use BytesIO to pass the content
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

def read_docx(file):
    try:
        file.stream.seek(0)
        doc = Document(BytesIO(file.read()))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip() != '']
        return "\n".join(paragraphs)
    except Exception as e:
        print(f"Error reading DOCX file: {e}")
        return None

def read_pdf(file):
    try:
        file.stream.seek(0)
        reader = PdfReader(BytesIO(file.read()))
        text = []
        for page in reader.pages:
            text.append(page.extract_text())
        return "\n".join(filter(None, text))
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return None

In [5]:
def extract_rows_with_sprecher(df, sprecher_prefix):
    df = df.dropna(subset=['Sprecher'])
    filtered_rows = df[df['Sprecher'].str.startswith(sprecher_prefix)]
    transkript_list = filtered_rows['Transkript'].tolist()
    return transkript_list

def transkript_to_string(transkript_list):
    return "\n".join(transkript_list)

In [6]:
if torch.cuda.is_available():
    print("CUDA is available! Using GPU.")
    device = torch.device("cuda")
else:
    print("CUDA is not available. Using CPU.")
    device = torch.device("cpu")

CUDA is available! Using GPU.


In [7]:
def divide_into_chunks(text, max_words_per_chunk):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words_in_sentence = len(word_tokenize(sentence))
        if current_word_count + words_in_sentence > max_words_per_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = words_in_sentence
        else:
            current_chunk.append(sentence)
            current_word_count += words_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

#Use this if the above chunking generates a lot of load on memory

# def divide_into_chunks(text, max_words_per_chunk):
#     words = word_tokenize(text)
#     return [' '.join(words[i:i + max_words_per_chunk]) for i in range(0, len(words), max_words_per_chunk)]

In [8]:
class Summarizer:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer, self.model = self._load_model()  # Load model once in initialization
        self.temperature = 0.1
        self.max_tokens = 1024
        self.top_k = 1

    def _load_model(self):
        """Load the tokenizer and model once during initialization."""
        gc.collect()  # Clear memory
        print("Loading model...")
        token = "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"
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)
        model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token).to(self.device)
        print("Model and tokenizer loaded successfully.")
        return tokenizer, model

    def _generate(self, text, prompt):
        """Generate text using the loaded model."""
        inputs = self.tokenizer(text + prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],  # Add the attention mask
                max_new_tokens=self.max_tokens,
                temperature=self.temperature,
                top_k=self.top_k,
                num_return_sequences=1
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


    def generate_biography(self, input_text):
        
        self.temperature = 0.1
        self.max_tokens = 1024
        self.top_k = 1
        
        prompt = """

I would like you to generate biography of an interviewee based on the following structured questions in German language and  written in third-person-view, make sure it is in german language only. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events with years mentioned in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events with years mentioned during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events with years mentioned in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events with years mentioned during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events with years mentioned in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry  with years mentioned? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children with years mentioned? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life with years mentioned? Reflect on how these events with years mentioned impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, chronological, detailed, and presents a well-rounded view of the interviewee's life journey with years mentioned. Include years, don't forget any years mentioned in the interview.
   
           """
        
        chunks = divide_into_chunks(input_text, max_words_per_chunk=25000)
        biography_parts = [self._generate(chunk, prompt) for chunk in chunks]
        return " ".join(biography_parts)



    def extend_biography(self, partial_biography, input_text):
        self.temperature = 0.1
        self.max_tokens = 1024
        self.top_k = 1

        prompt = f"""
            Hier ist der erste Teil der Biografie: {partial_biography}
I would like you to generate biography of an interviewee based on the following structured questions in German language and  written in third-person-view, make sure it is in german language only. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events with years mentioned in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events with years mentioned during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events with years mentioned in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events with years mentioned during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events with years mentioned in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry  with years mentioned? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children with years mentioned? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life with years mentioned? Reflect on how these events with years mentioned impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, chronological, detailed, and presents a well-rounded view of the interviewee's life journey with years mentioned. Include years, don't forget any years mentioned in the interview.
   
            """

        chunks = divide_into_chunks(input_text, max_words_per_chunk=25000)
        biography_parts = [self._generate(chunk, prompt) for chunk in chunks]
        return " ".join(biography_parts)



    def refine_biography_to_500_words(self, extended_biography, input_text):

        self.temperature = 0.1
        self.max_tokens = 1024
        self.top_k = 1

        prompt = f"""

            Hier ist der erste Teil der Biografie: {partial_biography}
I would like you to generate biography of an interviewee based on the following structured questions in German language and  written in third-person-view, make sure it is in german language only. Please address each question thoroughly, ensuring that the narrative flows smoothly from one life stage to the next. The biography should include the following information:

Birth and Early Family Life:

When and where was the interviewee born? Include the date and location of birth.
Who are the interviewee's parents? Provide their names, backgrounds, and any relevant details about their lives.
Does the interviewee have any siblings? If so, provide details about them, including names and relationships.
Education:

Which school or schools did the interviewee attend? Mention the date, names of the institutions, locations, and any significant experiences or achievements during their education.
Career and Professional Life:

What profession did the interviewee learn or train for? Mention date and describe the nature of their training or education in this field.
Which jobs or professions has the interviewee practiced? Include details about the dates, roles, companies, or organizations they worked for, and any significant milestones or achievements in their career.
Life Events and Personal Milestones:

What were the formative or significant life events with years mentioned in the interviewee's childhood? Mention dates, Include any experiences that had a lasting impact.
What were the formative or significant life events with years mentioned during the interviewee's adolescence?  Mention dates, Describe how these events influenced their path in life.
What were the formative or significant life events with years mentioned in the interviewee's early adult years? Include details about any dates, transitions, challenges, or accomplishments during this period.
What were the formative or significant life events with years mentioned during the interviewee's adult years?  Mention dates, Describe key experiences that shaped their personal or professional life.
What were the formative or significant life events with years mentioned in the interviewee's late adult years? Highlight any dates, major changes, achievements, or reflections during this time.
Personal Life:

Did the interviewee marry  with years mentioned? If yes, provide details about their spouse,dates, including the name and any significant information about their relationship.
Does the interviewee have children with years mentioned? If so, provide details about their children, dates, including names and any significant life events related to them.
Significant Life Events:

What are the most significant life events that have shaped the interviewee's life with years mentioned? Reflect on how these events with years mentioned impacted their personal growth, relationships, or career, dates.
Please ensure the biography is coherent, chronological, detailed, and presents a well-rounded view of the interviewee's life journey with years mentioned. Include years, don't forget any years mentioned in the interview.
  

            """

        chunks = divide_into_chunks(input_text, max_words_per_chunk=25000)
        biography_parts = [self._generate(chunk, prompt) for chunk in chunks]
        return " ".join(biography_parts)



    def remove_incomplete_sentence(self, biography):
        words = nltk.word_tokenize(biography)
        if len(words) <= 800:
            return biography
        
        truncated_words = words[:800]
        truncated_text = " ".join(truncated_words)
        last_full_stop_index = truncated_text.rfind('.')
        
        if last_full_stop_index != -1:
            return truncated_text[:last_full_stop_index + 1]
        else:
            return truncated_text

In [9]:
# Function to save the generated biography as a PDF
def save_text_to_pdf(text, pdf_path):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    
    for line in text.split('\n'):
        pdf.multi_cell(0, 10, line)
    
    pdf.output(pdf_path)
    print(f"Biography saved as {pdf_path}")

In [10]:
# import os
# import pandas as pd
# from nltk.tokenize import sent_tokenize, word_tokenize

# # Function to handle reading CSV, DOCX, or PDF
# def process_file_and_update_status(file_path):
#     file_name = os.path.basename(file_path)  # Extract only the file name
#     try:
#         transcript_data = None

#         if file_name.endswith('.csv'):
#             # Specify the correct delimiter as tab (\t)
#             df = pd.read_csv(file_path, sep='\t', on_bad_lines='skip')
            
#             # Print the column names to check what's in the CSV
#             print("Available columns in the CSV: ", df.columns)

#             if 'Transkript' in df.columns:
#                 df['Transkript'] = df['Transkript'].fillna('').astype(str)
#                 transcript_data = "\n".join(df['Transkript'].tolist())
#             else:
#                 print(f"Error: 'Transkript' column not found in the CSV.")
#                 return
#         elif file_name.endswith('.docx'):
#             transcript_data = read_docx(file_path)
#         elif file_name.endswith('.pdf'):
#             transcript_data = read_pdf(file_path)
#         else:
#             print("Unsupported file format.")
#             return

#         if transcript_data:
#             summarizer = Summarizer()
#             biography = summarizer.generate_biography(transcript_data)
#             extend_biography = summarizer.extend_biography(biography)
#             final_biography = summarizer.remove_incomplete_sentence(extend_biography)

#             output_pdf_path = os.path.join('processed', file_name.rsplit('.', 1)[0] + '.pdf')
#             print(f"PDF saved at: {os.path.abspath(output_pdf_path)}")

#             save_text_to_pdf(final_biography, output_pdf_path)

#         else:
#             print("No transcript data found. Status: Failed.")

#     except Exception as e:
#         print(f"Error processing file {file_name}: {e}")

# # Example usage
# file_path = '/home/jovyan/adg0001_er_2024_04_23.csv'
# process_file_and_update_status(file_path)

Path in datalabs
/home/jovyan/

In [11]:
import torch

torch.cuda.empty_cache()

In [12]:
# Function to handle reading CSV, DOCX, or PDF
def process_file_and_update_status(file_path):
    file_name = os.path.basename(file_path)  # Extract only the file name
    try:
        transcript_data = None

        if file_name.endswith('.csv'):
            # Specify the correct delimiter as tab (\t)
            df = pd.read_csv(file_path, sep='\t', on_bad_lines='skip')
            
            # Print the column names to check what's in the CSV
            print("Available columns in the CSV: ", df.columns)

            if 'Transkript' in df.columns:
                df['Transkript'] = df['Transkript'].fillna('').astype(str)
                transcript_data = "\n".join(df['Transkript'].tolist())
            else:
                print(f"Error: 'Transkript' column not found in the CSV.")
                return
        elif file_name.endswith('.docx'):
            transcript_data = read_docx(file_path)
        elif file_name.endswith('.pdf'):
            transcript_data = read_pdf(file_path)
        else:
            print("Unsupported file format.")
            return

        # Debug Step 1: Print transcript_data
        if transcript_data:
            print(f"Transcript Data: {transcript_data[:500]}")  # Print first 500 characters to confirm
        else:
            print("No transcript data found.")
            return

        # Debug Step 2: Track biography generation
        summarizer = Summarizer()
        biography = summarizer.generate_biography(transcript_data)
        print(f"Generated Biography: {biography}")  # Print first 500 characters of biography

        # Extending biography
        extend_biography = summarizer.extend_biography(biography, transcript_data)
        
        # Removing incomplete sentences
        final_biography = summarizer.remove_incomplete_sentence(extend_biography, transcript_data)

        # Debug Step 3: Check final biography before saving to PDF
        print(f"Final Biography: {final_biography[:500]}")  # Print first 500 characters of final biography

        # Saving final biography to PDF
        output_pdf_path = os.path.join('processed', file_name.rsplit('.', 1)[0] + '.pdf')
        print(f"PDF will be saved at: {os.path.abspath(output_pdf_path)}")
        save_text_to_pdf(final_biography, output_pdf_path)

    except Exception as e:
        print(f"Error processing file {file_name}: {e}")

# Example usage
file_path = '/home/jovyan/adg0001_er_2024_04_23.csv'
process_file_and_update_status(file_path)

Available columns in the CSV:  Index(['Band', 'Timecode', 'Sprecher', 'Transkript', 'Übersetzung',
       'Hauptüberschrift', 'Zwischenüberschrift',
       'Hauptüberschrift (Übersetzung)', 'Zwischenüberschrift (Übersetzung)',
       'Registerverknüpfungen', 'Anmerkungen', 'Anmerkungen (Übersetzung)'],
      dtype='object')
Transcript Data: Können wir anfangen?
Also wäre schön, wenn Sie mit Kindheit beginnen würden.
Ich war das erste Enkelkind, einzige Enkelkind, lange Zeit und bin sehr verwöhnt worden, da ich ziemlich viel bei der Großmutter gewesen bin.
Ich bin gebürtig aus Hemer im Sauerland und bin 29.5.25 geboren.
Und, ja, Kindheit verlief eigentlich an und für sich normal, habe allerdings ziemlich unter Migräne zu leiden gehabt schon als Vorschulkind, was sich während der Schulzeit verschlechterte und verschlimmerte.
Ich hätt
Loading model...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model and tokenizer loaded successfully.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Biography: Können wir anfangen? Also wäre schön, wenn Sie mit Kindheit beginnen würden. Ich war das erste Enkelkind, einzige Enkelkind, lange Zeit und bin sehr verwöhnt worden, da ich ziemlich viel bei der Großmutter gewesen bin. Ich bin gebürtig aus Hemer im Sauerland und bin 29.5.25 geboren. Und, ja, Kindheit verlief eigentlich an und für sich normal, habe allerdings ziemlich unter Migräne zu leiden gehabt schon als Vorschulkind, was sich während der Schulzeit verschlechterte und verschlimmerte. Ich hätte gerne die höhere Schule besucht, durfte aber nicht, weil mir immer gesagt wurde, ich sei krank. Die Schulentlassung, also den Hauptschulabschluss, habe ich gemacht 1939. Und damals war das Pflichtjahr bzw. Haushaltsjahr, eh,  musste gema…, musste gemacht werden, ob man die Schule, äh, besuchte oder bevor man den Beruf ergreifen konnte, so. Und da meine Großeltern ziemlich, sagen wir mal, einen kleinen Dünkel hatten, durfte ich nicht in einen Einzelhaushalt, wie das üblich 