<a href="https://colab.research.google.com/github/balavishnu266/Leveraging_LLM_for_Text_Summaries/blob/main/text_summarization_thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install striprtf

Collecting striprtf
  Downloading striprtf-0.0.26-py3-none-any.whl (6.9 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.26


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
from striprtf.striprtf import rtf_to_text

file_path = '/content/gdrive/My Drive/german_transcript.rtf'
with open(file_path, 'r') as file:
    rtf_content = file.read()
    text = rtf_to_text(rtf_content)
    print(text)

Interviewtranskript „WiSe22-Bar01“
Interviewcode: WiSe22-Bar01
Datum und Uhrzeit des Interviews: 28.01.2023
Dauer des Interviews (Minuten): 92
Interview durchgeführt von: Bar
Interview transkribiert von: Bar

Synonym der interviewten Person (Kürzel): Marie (M)
Geschlecht der interviewten Person: weiblich
Alter der interviewten Person: 20
Studiengang der interviewten Person: Lehramtsstudium
Fachsemesterzahl der interviewten Person: 3

Transkriptionsregeln: Transkript verbatim mit Sprechpausen, Interjektionen und Verzögerungslauten. Anmerkungen sind in eckigen Klammern gegeben. Auslassungen und längere Sprechpausen sind mit […] bzw. [13s] angezeigt. Notwendige Anonymisierungen wurden ebenfalls als Anmerkungen oder Auslassungen vorgenommen.
Sonstige Anmerkungen und Beobachtungen zur Interviewsituation: Bei Minute 19 gab es eine Unterbrechung des Interviews, aufgrund der Türklingel. Das Interview wurde nach paar Minuten weitergeführt. 
Synonyme: Im Folgenden werden der Interviewer mit I un

In [4]:
# Install spacy
!pip install spacy



In [5]:
# Download the German language model for spacy
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [6]:
# Now you can import spacy and use it for preprocessing
import spacy
from spacy.lang.de.stop_words import STOP_WORDS

In [7]:
# Load the German model
nlp = spacy.load("de_core_news_sm")

In [8]:
#Tokenization and Lemmatization
doc = nlp(text)
tokens_and_lemmas = [(token.text, token.lemma_) for token in doc]

In [9]:
#Handling umlauts and special characters
def replace_umlauts(text):
    replacements = {
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue'
    }
    for k, v in replacements.items():
        text = text.replace(k, v)
    return text

In [10]:
#Covering the Identities of People and places in the Interview
import spacy

nlp = spacy.load("de_core_news_sm")

def anonymize_text(text):
    doc = nlp(text)
    anonymized_text = text
    for ent in doc.ents:
        if ent.label_ in ["PER", "LOC", "ORG"]:
            anonymized_text = anonymized_text.replace(ent.text, f"[{ent.label_}]")
    return anonymized_text

In [11]:
!pip install language_tool_python

Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.7.1


In [12]:
import language_tool_python

def quality_filter(text, lang='de-DE'):
    tool = language_tool_python.LanguageTool(lang)
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

In [13]:
def structure_for_summarization(text):
    # Split the text into lines as each line represents a part of the dialogue
    lines = text.split('\n')

    structured_segments = []
    current_segment = {"question": "", "answer": ""}

    for line in lines:
        # Check if the line is from the interviewer (I) or the interviewee (M)
        if line.startswith('I:'):
            # If there's an ongoing segment, append it to the structured_segments list
            if current_segment["question"]:
                structured_segments.append(current_segment)
                current_segment = {"question": "", "answer": ""}
            # Add the question part to the current segment
            current_segment["question"] = line[3:]  # Exclude the 'I: ' part
        elif line.startswith('M:'):
            # Add the answer part to the current segment
            current_segment["answer"] += line[3:] + " "  # Exclude the 'M: ' part and add a space for readability

    # Append the last segment if it exists
    if current_segment["question"]:
        structured_segments.append(current_segment)

    return structured_segments

In [15]:
def remove_stopwords(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc if not token.is_stop])

In [16]:
#Preparing the text for Summarization
text = replace_umlauts(text)
text = anonymize_text(text)
text = quality_filter(text)
structured_text = structure_for_summarization(text)
final_text = [remove_stopwords(section) for section in structured_text]

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:03<00:00, 67.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmptinxn5j4.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


In [20]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.12.0-py3-none-any.whl (226 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/226.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m153.6/226.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.3-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-p