# Preprocessing

In [None]:
import ast

# TODO dynamic path handle
TRANSCRIPT_PATH = "./transcripts/JN3KPFbWCy8"

# Read every line of the transcript. The context manager guarantees the file
# handle is closed even if reading raises.
with open(TRANSCRIPT_PATH, "r") as f:
    data = f.readlines()

# Parse each line as a Python literal with ast.literal_eval (safe: it only
# accepts literals, never arbitrary expressions).
# Presumably every line is a dict with 'text', 'start' and 'duration' keys,
# since those are the columns the later cells read -- verify against the
# transcript files.
# Blank / whitespace-only lines (e.g. a trailing newline at end of file) are
# skipped because ast.literal_eval raises on empty input.
res = [ast.literal_eval(line) for line in data if line.strip()]

Dataframe conversion

In [None]:
import pandas as pd
# Build a DataFrame from the parsed transcript entries collected in `res`
# (one row per entry).
df = pd.DataFrame(res)

def get_length(row):
    """Return len() of the value stored under the 'text' key of `row`.

    `row` is anything indexable by 'text' (a DataFrame row when used with
    df.apply(..., axis=1), or a plain mapping).
    """
    text_value = row['text']
    return len(text_value)

# Compute the length of each row's text with get_length, store it in a new
# 'text_len' column, then display the frame.
text_lengths = df.apply(get_length, axis=1)
df['text_len'] = text_lengths

df

Merge on Condition

In [None]:

import pandas as pd
# Start again from the raw parsed entries so this cell can be re-run safely.
df = pd.DataFrame(res)

# Segments whose duration is at or below this threshold are treated as
# "short" and get folded into a neighbouring group (see below).
# TODO finetune with different ruleset
MAX_MERGE_DURATION = 8

# Calculate end time
df['end'] = df['start'] + df['duration']

# Flag the short segments. Despite the column name, a flagged row does NOT
# start a new group: it joins the group opened by the closest earlier
# unflagged row (flagged rows before the first unflagged row share group 0).
df['merge_with_next'] = df['duration'] <= MAX_MERGE_DURATION

# Create group identifier: every unflagged (long) row bumps the running
# counter, i.e. opens a new group; flagged rows inherit the current value.
df['group'] = (~df['merge_with_next']).cumsum()

# Merge rows within the same group
aggregated = df.groupby('group').agg({
    'text': ' '.join,
    'start': 'min',  # Take the earliest start time
    'end': 'max',  # Take the latest end time
}).reset_index(drop=True)

# Calculate new duration based on aggregated start and end times
aggregated['duration'] = aggregated['end'] - aggregated['start']

df = aggregated

# TODO (optional follow-up): drop very short merged texts, e.g. keep only rows
# where len(text) > 20, once the merge rule above has been tuned.

df

Named Entity Recognition (NER)

In [None]:

# Editing tokenizer
# https://spacy.io/usage/linguistic-features#native-tokenizers
# https://spacy.io/usage/linguistic-features#special-cases

# Sentence Segmentation
# https://spacy.io/usage/linguistic-features#sbd

# Training
# https://spacy.io/usage/training

# Model Constraints
# https://github.com/explosion/spaCy/issues/3052

import spacy

# One-time setup: install the model that spacy.load() is given below, e.g.
# $ python3 -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [None]:
# Entity label reference kept as a plain-text table: label name on the left,
# description on the right. Source document:
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
# NOTE(review): several labels are spelled out in full here (FACILITY,
# ORGANIZATION, LOCATION, "WORK OF ART"); confirm they match the ent.label_
# strings the loaded model emits before using this table for lookups.
labels_spacy = """
PERSON                        People, including fictional
NORP                          Nationalities or religious or political groups
FACILITY                      Buildings, airports, highways, bridges, etc.
ORGANIZATION                  Companies, agencies, institutions, etc.
GPE                           Countries, cities, states
LOCATION                      Non-GPE locations, mountain ranges, bodies of water
PRODUCT                       Vehicles, weapons, foods, etc. (Not services)
EVENT                         Named hurricanes, battles, wars, sports events, etc.
WORK OF ART                   Titles of books, songs, etc.
LAW                           Named documents made into laws
LANGUAGE                      Any named language
DATE                          Absolute or relative dates or periods
TIME                          Times smaller than a day
PERCENT                       Percentage (including “%”)
MONEY                         Monetary values, including unit
QUANTITY                      Measurements, as of weight or distance
ORDINAL                       “first”, “second”
CARDINAL                      Numerals that do not fall under another type
"""

# Maps an entity label to its human-readable description.
#
# The long-form keys ("FACILITY", "ORGANIZATION", "LOCATION", "WORK OF ART")
# are kept for backward compatibility. The short-form keys ("FAC", "ORG",
# "LOC", "WORK_OF_ART") are the spellings used in the label scheme of spaCy's
# English pipelines, so a lookup by ent.label_ does not raise KeyError for
# those entity types.
label_lookup_table = {
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "FAC": "Buildings, airports, highways, bridges, etc.",
    "ORGANIZATION": "Companies, agencies, institutions, etc.",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOCATION": "Non-GPE locations, mountain ranges, bodies of water",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Vehicles, weapons, foods, etc. (Not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK OF ART": "Titles of books, songs, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including “%”)",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "“first”, “second”",
    "CARDINAL": "Numerals that do not fall under another type"
}


In [None]:
# Section banner written to the cell output.
print("Entities of Interest")
# The triple-quoted string below is a bare expression statement, so nothing in
# it is executed; it only keeps a single-row example of reading the entities
# off a processed doc.
"""
# example: fetching entity on a single row
test = df['text'][1]
doc = nlp(test)

entities = []
for ent in doc.ents:
    entities.append((ent.text, ent.start_char, ent.end_char, ent.label_))
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
"""

def get_entity_values(data):
    """Run the module-level `nlp` pipeline on data['text'] and list its entities.

    Returns a list of (text, start_char, end_char, label_) tuples, one per
    item in the processed doc's `ents`, in the order `ents` yields them.
    """
    found = []
    for entity in nlp(data['text']).ents:
        found.append((entity.text, entity.start_char, entity.end_char, entity.label_))
    return found

# Row-wise entity extraction: each cell of 'entities' holds the list returned
# by get_entity_values for that row. The column is then displayed.
entities_per_row = df.apply(get_entity_values, axis=1)
df['entities'] = entities_per_row

df['entities']

Output

In [None]:
# Write the selected columns to out.csv without the index column.
output_columns = ['text', 'start', 'end', 'duration', 'entities']
df.to_csv('out.csv', columns=output_columns, index=False)

Text Summary Model

In [None]:
from transformers import pipeline

#TODO cache this!!!
# Summarization pipeline object; summarize() calls it once per row.
pipe = pipeline("summarization", model="Falconsai/text_summarization")
# Alternative model, currently disabled:
# pipe = pipeline("summarization", model="facebook/bart-large-cnn")

In [None]:
def summarize(row):
    """Pass row['text'] to the module-level `pipe` and return its output as-is.

    NOTE(review): min_length and max_length are both 100, which leaves the
    generator no room to produce a shorter summary -- confirm this is intended.
    NOTE(review): the raw pipeline output is returned unmodified; confirm
    whether callers expect that structure or just the summary string.
    """
    # max_len -> 15
    generation_options = {'max_length': 100, 'min_length': 100, 'do_sample': False}
    return pipe(row['text'], **generation_options)

# Call summarize on every row and store each call's result in 'summary'.
row_summaries = df.apply(summarize, axis=1)
df['summary'] = row_summaries