# Text Preprocessing

In [None]:
import os

### === Helpers === ###
def perror(msg):
    """Report an error by printing ``msg`` behind an ``error: `` prefix.

    The line is written with ``print``. ``msg`` must be a string because it
    is concatenated to the prefix.
    """
    line = "error: " + msg
    print(line)

def touch(path):
    """Create ``path`` if it is missing and set its timestamps to now.

    Opening in append mode creates the file without truncating existing
    content; ``os.utime(path, None)`` then sets the access and modified
    times to the current time.
    """
    # The ``with`` block closes the handle on exit, so no explicit close().
    with open(path, 'a'):
        os.utime(path, None)

def mkdir(path):
    """Create directory ``path``, including any missing parent directories.

    Returns:
        ``path`` when the directory was created. When something already
        exists at ``path``, prints a notice and returns ``None``.
    """
    try:
        # Attempt the creation directly instead of testing os.path.exists()
        # first: a directory that appears between the check and the creation
        # is then reported like any other existing path instead of raising.
        os.makedirs(path)
    except FileExistsError:
        print("path already exists")
        return None
    return path

## Transcript

In [None]:
# TODO DYNAMIC PATH HANDLE
INPUT_FILE = "test1.mp4"

# NOTE(review): INPUT_DIR is defined here but is not joined with INPUT_FILE
# in this cell — confirm where the input file is expected to live.
INPUT_DIR = './input_data'
OUTPUT_DIR = './cache/' # target output for preprocessing is cache

# The cache file reuses the input's base name with a .json extension
root, extention  = os.path.splitext(INPUT_FILE)
# os.path.join adds a separator only when needed, so the result stays correct
# even if OUTPUT_DIR is later written without its trailing slash.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, root + ".json")

mkdir(OUTPUT_DIR)

In [None]:
import whisper_timestamped as whisper
import json

def get_transcript(input_file, output_file, model_name="base", language="en"):
    """Transcribe ``input_file`` with whisper and save the result as JSON.

    Args:
        input_file: Path of the media file passed to ``whisper.load_audio``.
        output_file: Path the transcription result is written to (UTF-8 JSON).
        model_name: Model name passed to ``whisper.load_model``.
            Defaults to ``"base"``.
        language: Language code passed to ``whisper.transcribe``.
            Defaults to ``"en"``.

    Returns:
        The result returned by ``whisper.transcribe``.
    """
    audio = whisper.load_audio(input_file)
    model = whisper.load_model(model_name)

    result = whisper.transcribe(model, audio, language=language)

    # ensure_ascii=False keeps non-ASCII characters as-is in the file,
    # which is why the file is opened explicitly as UTF-8.
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(result, file, indent=2, ensure_ascii=False)

    return result

# Generate transcript if it does not exist in cache
if not os.path.isfile(OUTPUT_FILE):
    try:
        transcript = get_transcript(INPUT_FILE, OUTPUT_FILE)
    except Exception as exc:
        # Report the cause, then re-raise: continuing would leave
        # `transcript` undefined (or stale from an earlier run).
        perror(f"unable to generate transcript: {exc}")
        raise
else:
    try:
        # The cache is written as UTF-8 JSON, so read it back the same way
        # instead of relying on the platform default encoding.
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            transcript = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes
        perror(f"unable to load data from cache: {exc}")
        raise

#### Dataframe

In [None]:
import pandas as pd
# Build the working table from the transcript result.
# NOTE(review): presumably 'segments' is a list and the other keys are
# scalars, giving one row per segment — confirm against the cached JSON.
df = pd.DataFrame(data=transcript)

def get_text(row):
    """Return the ``text`` field of the segment stored in ``row['segments']``."""
    segment = row['segments']
    return segment['text']

def get_length(row):
    """Return the character count of the row's segment text."""
    segment_text = row['segments']['text']
    return len(segment_text)

def get_start(row):
    """Return the ``start`` field of the segment stored in ``row['segments']``."""
    segment = row['segments']
    return segment['start']

def get_end(row):
    """Return the ``end`` field of the segment stored in ``row['segments']``."""
    segment = row['segments']
    return segment['end']

def get_duration(row):
    """Return the segment length, ``|end - start|``, rounded to two decimals."""
    segment = row['segments']
    elapsed = segment['end'] - segment['start']
    return round(abs(elapsed), 2)

# Derive one flat column per segment attribute, evaluated row by row.
# The tuple keeps the columns in the order they are added to the frame.
_segment_columns = (
    ('text', get_text),
    ('text_len', get_length),
    ('start', get_start),
    ('end', get_end),
    ('duration', get_duration),
)
for _column, _extract in _segment_columns:
    df[_column] = df.apply(_extract, axis=1)
df

## Sentiment

Model: [SamLowe/roberta-base-go_emotions](https://huggingface.co/SamLowe/roberta-base-go_emotions) 

[ONNX Variant](https://huggingface.co/SamLowe/roberta-base-go_emotions-onnx)

Model trained from [roberta-base](https://huggingface.co/roberta-base) on the [go_emotions](https://huggingface.co/datasets/go_emotions) dataset for multi-label classification.


In [None]:
from transformers import pipeline

# Text-classification pipeline for the go_emotions model.
# top_k=None requests a score for every label rather than only the best one.
SENTIMENT_MODEL = "SamLowe/roberta-base-go_emotions"
sentiment_pipe = pipeline(
    task="text-classification",
    model=SENTIMENT_MODEL,
    top_k=None,
)

#### Add sentiment analysis to dataframe

In [None]:
def get_sentiment(row):
    """Classify ``row['text']`` and return the per-label results.

    The pipeline output is indexed with ``[0]`` to take the list of label
    dicts belonging to this single text.
    """
    predictions = sentiment_pipe(row['text'])
    return predictions[0]

# One classifier call per row; each cell holds that row's list of label dicts
df['sentiment'] = df.apply(get_sentiment, axis='columns')

def unpack_sentiment(row):
    """Expand ``row['sentiment']`` into one ``<label>_sentiment`` entry per label.

    Args:
        row: Mapping-like row (e.g. a pandas Series) whose ``'sentiment'``
            entry is a list of dicts with ``'label'`` and ``'score'`` keys.

    Returns:
        A copy of ``row`` with a ``<label>_sentiment`` entry holding each
        score. When ``row['sentiment']`` is empty or falsy, ``row`` itself is
        returned unchanged.
    """
    sentiment = row['sentiment']
    if not sentiment:
        return row

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    expanded = row.copy()
    for label_dict in sentiment:
        expanded[label_dict['label'] + "_sentiment"] = label_dict['score']
    return expanded

# Expand each row's sentiment list into one "<label>_sentiment" column per label
df = df.apply(unpack_sentiment, axis='columns')

df

## Entity Recognition

In [None]:
import spacy

# Install the model loaded below before running this cell:
# $ python3 -m spacy download en_core_web_lg
# Smaller alternative model: spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

Labels

In [None]:
# Entity types and their descriptions, kept as a plain-text reference table.
# Source: https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
# NOTE(review): spaCy's English models appear to report FAC, ORG, LOC and
# WORK_OF_ART for the entries spelled FACILITY, ORGANIZATION, LOCATION and
# WORK OF ART below — confirm before matching these names against ent.label_.
labels_spacy = """
PERSON                        People, including fictional
NORP                          Nationalities or religious or political groups
FACILITY                      Buildings, airports, highways, bridges, etc.
ORGANIZATION                  Companies, agencies, institutions, etc.
GPE                           Countries, cities, states
LOCATION                      Non-GPE locations, mountain ranges, bodies of water
PRODUCT                       Vehicles, weapons, foods, etc. (Not services)
EVENT                         Named hurricanes, battles, wars, sports events, etc.
WORK OF ART                   Titles of books, songs, etc.
LAW                           Named documents made into laws
LANGUAGE                      Any named language
DATE                          Absolute or relative dates or periods
TIME                          Times smaller than a day
PERCENT                       Percentage (including “%”)
MONEY                         Monetary values, including unit
QUANTITY                      Measurements, as of weight or distance
ORDINAL                       “first”, “second”
CARDINAL                      Numerals that do not fall under another type
"""

# Maps an entity label to its human-readable description.
# Keys cover the OntoNotes spellings as well as the label strings spaCy
# reports in ``ent.label_`` (FAC, ORG, LOC, WORK_OF_ART), so looking up a
# recognised entity's label does not raise KeyError.
label_lookup_table = {
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "ORGANIZATION": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOCATION": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Vehicles, weapons, foods, etc. (Not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK OF ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including “%”)",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "“first”, “second”",
    "CARDINAL": "Numerals that do not fall under another type",
    # spaCy spellings of the four labels whose names differ from the above
    "FAC": "Buildings, airports, highways, bridges, etc.",
    "ORG": "Companies, agencies, institutions, etc.",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "WORK_OF_ART": "Titles of books, songs, etc.",
}

#### Add entities to dataframe

In [None]:
# TODO design decision: handling empty columns, keep or toss

def get_entity_values(row):
    """Run the spaCy pipeline on ``row['text']`` and collect its entities.

    Returns:
        A list of ``(text, start_char, end_char, label)`` tuples, one per
        entity in ``doc.ents``.
    """
    entities = []
    for ent in nlp(row['text']).ents:
        entities.append((ent.text, ent.start_char, ent.end_char, ent.label_))
    return entities

# Attach each row's list of entity tuples as a new column
df['entities'] = df.apply(get_entity_values, axis='columns')

df

#### Final Cleanup

In [None]:
# Drop language and segments in a single call
df = df.drop(columns=['language', 'segments'])

df

### Export

In [None]:
# Write the processed table into the cache directory without the row index.
# os.path.join keeps the path valid whether or not OUTPUT_DIR ends with a slash.
df.to_csv(os.path.join(OUTPUT_DIR, 'out_text_preprocessing.csv'), index=False)

Note: a summary might be irrelevant until the video is clipped at the end of the pipeline?