In [8]:
import pandas as pd
import nltk
import aiohttp
import asyncio
from nltk.tokenize import word_tokenize, sent_tokenize
from langchain_together import Together
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.utils import simpleSplit
from docx import Document
import os
import time

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asha4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Function to extract rows from a DataFrame where 'Sprecher' column starts with the specified prefix.
def extract_rows_with_sprecher(df, sprecher_prefix):
    df = df.dropna(subset=['Sprecher'])
    filtered_rows = df[df['Sprecher'].str.startswith(sprecher_prefix)]
    transkript_list = filtered_rows['Transkript'].tolist()
    return transkript_list

In [10]:
# Function to convert the 'Transkript' column to a single string.
def transkript_to_string(transkript_list):
    return "\n".join(transkript_list)

In [11]:
# Function to divide text into chunks close to a specified maximum number of words without cutting through sentences.
def divide_into_chunks(text, max_words_per_chunk):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words_in_sentence = len(word_tokenize(sentence))
        if current_word_count + words_in_sentence > max_words_per_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = words_in_sentence
        else:
            current_chunk.append(sentence)
            current_word_count += words_in_sentence

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [12]:
class Summarizer:
    def __init__(self):
        self.llm = Together(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            temperature=0.4,
            max_tokens= 2048,
            top_k=1,
            together_api_key="809750d4770635c394317b2afb1baefa7173070c968384d8ed26c9a0a1e72a10"
        )
        self.llm2 = Together(
            model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            temperature=0.4,
            max_tokens= 512,
            top_k=1,
            together_api_key="809750d4770635c394317b2afb1baefa7173070c968384d8ed26c9a0a1e72a10"
        )
        self.llm1 = Together(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            temperature=0.4,
            max_tokens= 300,
            top_k=1,
            together_api_key="809750d4770635c394317b2afb1baefa7173070c968384d8ed26c9a0a1e72a10"
        )

    def generate_biography(self, input_text):
        prompt = """
    Du bist ein deutsches Textzusammenfassungsmodell. Erstellen Sie eine prägnante Zusammenfassung des obigen Textes in deutscher Sprache innerhalb von 500 Wörtern. Konzentrieren Sie sich auf die wichtigsten Punkte und bewahren Sie Klarheit. Mention years within the biography. Generate only on data given to the bioraphy and do not give your own opinions as an add on to the biography.
        """
        full_input = input_text + prompt
        retry_delay = 30

        while True:
            try:
                output_summary = self.llm.invoke(full_input)
                return output_summary
            except Exception as e:
                if "rate limited" in str(e).lower():
                    print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
                    asyncio.sleep(retry_delay)
                else:
                    print(f"An error occurred: {e}")
                    raise

    def final_biography(self, input_text):
        prompt1 = """
Bitte erstellen Sie eine Biografie auf Basis des untenstehenden Textes in deutscher Sprache mit maximal 500 Wörtern. Generate only on data given and do not give your opinions.
        """
        full_input = input_text + prompt1
        retry_delay = 30

        while True:
            try:
                output_summary = self.llm1.invoke(full_input)
                return output_summary
            except Exception as e:
                if "rate limited" in str(e).lower():
                    print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
                    asyncio.sleep(retry_delay)
                else:
                    print(f"An error occurred: {e}")
                    raise

    def refine_biography(self, input_text):
        def truncate_to_word_limit(text, limit):
            words = word_tokenize(text)
            if len(words) > limit:
                words = words[:limit]
            return ' '.join(words)

        words = word_tokenize(input_text)

        if len(words) <= 500:
            return input_text

        sentences = sent_tokenize(input_text)
        refined_text = ""
        current_word_count = 0

        for sentence in sentences:
            sentence_word_count = len(word_tokenize(sentence))
            if current_word_count + sentence_word_count <= 500:
                refined_text += sentence + " "
                current_word_count += sentence_word_count
            else:
                break

        refined_text = refined_text.strip()
        refined_text = truncate_to_word_limit(refined_text, 500)

        return refined_text

    def final_2_biography(self, input_text):
        prompt = """
        Du bist ein deutsches Textzusammenfassungsmodell. Erstellen Sie eine prägnante Zusammenfassung des oben genannten Textes auf Deutsch innerhalb von 500 Wörtern und behalten Sie alle wichtigen Daten bei. Konzentrieren Sie sich auf die wichtigsten Punkte und bewahren Sie Klarheit.
        """
        full_input = input_text + prompt
        retry_delay = 30

        while True:
            try:
                output_summary = self.llm2.invoke(full_input)
                return output_summary
            except Exception as e:
                if "rate limited" in str(e).lower():
                    print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
                    asyncio.sleep(retry_delay)
                else:
                    print(f"An error occurred: {e}")
                    raise

In [13]:
# Function to generate a biography from text chunks.
async def generate_biography(summarizer, input_text):
    chunks = divide_into_chunks(input_text, 50000)
    summary = ""
    chunk_number = 1

    for chunk in chunks:
        print(f"Chunk {chunk_number}:")
        print(chunk)
        summary += summarizer.generate_biography(chunk) + "\n"
        chunk_number += 1
    return summary

# Function to read data from a CSV file.
async def read_csv(file_path):
    try:
        df = pd.read_csv(file_path, sep='\t')
        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

# Function to read data from a DOCX file.
async def read_docx(file_path):
    try:
        doc = Document(file_path)
        data = [p.text for p in doc.paragraphs if p.text]
        return " ".join(data)
    except Exception as e:
        print(f"Error reading DOCX file: {e}")
        return None

async def final_biography(summarizer, input_text):
    chunks = divide_into_chunks(input_text, 1000)
    summary = ""
    chunk_number = 1

    for chunk in chunks:
        summary += summarizer.final_biography(chunk) + "\n"
        chunk_number += 1
    return summary

def count_words(sentence):
    words = word_tokenize(sentence)
    return len(words)

async def final_2_biography(summarizer, input_text):
    chunks = divide_into_chunks(input_text, 50000)
    summary = ""
    chunk_number = 1

    for chunk in chunks:
        summary += summarizer.final_2_biography(chunk) + "\n"
        chunk_number += 1
    return summary

def remove_incomplete_sentence(biography):
    last_full_stop_index = biography.rfind('.')
    if last_full_stop_index != -1:
        return biography[:last_full_stop_index + 1]
    else:
        return biography

In [14]:
# Main function to process the input file and generate a biography.
async def process_file(file_path):
    summarizer = Summarizer()
    
    if file_path.endswith('.csv'):
        df = await read_csv(file_path)
        if df is not None:
            sprecher_prefix = 'IP_'
            transkript_list = extract_rows_with_sprecher(df, sprecher_prefix)
            transcript_data = transkript_to_string(transkript_list)
        else:
            print("Failed to read CSV file.")
            return
    elif file_path.endswith('.docx'):
        transcript_data = await read_docx(file_path)
        if not transcript_data:
            print("Failed to read DOCX file.")
            return
    else:
        print("Unsupported file format.")
        return

    biography = await generate_biography(summarizer, transcript_data)
    print("Biography : \n ", count_words(biography), biography.strip())

    x_biography = await final_biography(summarizer, biography)
    summary = remove_incomplete_sentence(x_biography)
    print("Summary : \n ", count_words(summary), summary.strip())

    if count_words(summary) <= 600:
        refined_biography = summarizer.refine_biography(summary)
        print("Refined : \n", count_words(refined_biography), refined_biography.strip())
    else:
        x = remove_incomplete_sentence(summary)
        x1_biography = await final_2_biography(summarizer, x)
        summary1 = remove_incomplete_sentence(x1_biography)
        print("Summary 1: ", count_words(summary1), summary1.strip())

if __name__ == "__main__":
    file_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0001_er_2024_04_23.csv"
    #file_path = "C:/Users/asha4/OneDrive - SRH/Case Study-1/Dennis- Files/WG_ [EXTERN]  Transcripts and Biographies/adg0002_er_2024_04_23.csv"
    process_file(file_path)

  process_file(file_path)
