# Text Preprocessing

Example output dataset
| Text | start | end | duration | text_len | sentiment | entities |
| ---- | ----- | --- | -------- | -------- | --------- | -------- |
| foobar | 0.08 | | 5.360 | 6 | foo_sentiment | foo_entities | 
| barfoobar | 2.04 | | 6.830 | 9 | foo_sentiment | foo_entities |
| heehoo | 21.098 | | 2.10 | 6 | foo_sentiment | foo_entities |

note: summary might be something that is irrelevent until clipped at the end of the pipeline?

In [12]:
import os
### === Error Handles === ###
def perror(msg):
    print("error: " + msg)

### === Helpers === ###
# Create file and set timestamp
def touch(path):
    with open(path, 'a') as f:
        os.utime(path, None) # set access and modified times
        f.close()

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
        return path

    print("path already exists")
    return

## Transcript

In [13]:

# TODO DYNAMIC PATH HANDLE
INPUT_FILE = "test1.mp4"

INPUT_DIR = './input_data'
OUTPUT_DIR = './cache/' # target output for preoprocessing is cache

root, extention  = os.path.splitext(INPUT_FILE)
# Specify the file path where you want to save the JSON data
OUTPUT_FILE = OUTPUT_DIR + root + ".json"

mkdir(OUTPUT_DIR)

path already exists


In [14]:
import whisper_timestamped as whisper # 
import json

def get_transcript(input_file, output_file):
    audio = whisper.load_audio(input_file)
    model = whisper.load_model("base")

    result = whisper.transcribe(model, audio, language="en")

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(result, file, indent=2, ensure_ascii=False)

    return result

# Generate transcript if it does not exist in cache
if not os.path.isfile(OUTPUT_FILE):
    try:
        transcript = get_transcript(INPUT_FILE, OUTPUT_FILE)
    except:
        perror("unable to generate transcript")
else: 
    try:
        with open(OUTPUT_FILE, 'r') as f:
            transcript = f.read()
            transcript = json.loads(transcript)
    except:
        perror("unable to load data from cache")

#### Dataframe

In [15]:
import pandas as pd
df = pd.DataFrame(transcript)

def get_text(row):
    return row['segments']['text']

def get_length(row):
    return len(row['segments']['text'])

def get_start(row):
    return row['segments']['start']

def get_end(row):
    return row['segments']['end']

df['text'] = df.apply(get_text,axis=1)
df['text_len'] = df.apply(get_length,axis=1)
df['start'] = df.apply(get_start,axis=1)
df['end'] = df.apply(get_end,axis=1)

df

Unnamed: 0,text,segments,language,text_len,start,end
0,"Well, that's a super powerful idea of generat...","{'id': 0, 'seek': 0, 'start': 3.06, 'end': 11....",en,96,3.06,11.52
1,for you.,"{'id': 1, 'seek': 0, 'start': 12.26, 'end': 12...",en,9,12.26,12.86
2,Yeah.,"{'id': 2, 'seek': 0, 'start': 13.78, 'end': 13...",en,6,13.78,13.92
3,That works to find the best customer for your...,"{'id': 3, 'seek': 0, 'start': 13.92, 'end': 17...",en,53,13.92,17.70
4,"I mean, to me, advertisement went done well.","{'id': 4, 'seek': 0, 'start': 18.16, 'end': 20...",en,45,18.16,20.64
...,...,...,...,...,...,...
228,then we forget how to do all the stuff that t...,"{'id': 228, 'seek': 85400, 'start': 854.0, 'en...",en,54,854.00,855.94
229,It's a weird trade off.,"{'id': 229, 'seek': 85400, 'start': 858.16, 'e...",en,24,858.16,859.46
230,Yeah.,"{'id': 230, 'seek': 85400, 'start': 859.54, 'e...",en,6,859.54,859.76
231,I agree.,"{'id': 231, 'seek': 85400, 'start': 860.24, 'e...",en,9,860.24,860.70


## Sentiment

Model: [SamLowe/roberta-base-go_emotions](https://huggingface.co/SamLowe/roberta-base-go_emotions) 

[ONNX Variant](https://huggingface.co/SamLowe/roberta-base-go_emotions-onnx)

Model trained from [roberta-base](https://huggingface.co/roberta-base) on the [go_emotions](https://huggingface.co/datasets/go_emotions) dataset for multi-label classification.


In [16]:
from transformers import pipeline

sentiment_pipe = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

#### Add sentiment analysis to dataframe

In [17]:
def get_sentiment(row):
    return sentiment_pipe(row['text'])[0] # produces a list of dicts for each label
df['sentiment'] = df.apply(get_sentiment, axis=1)

df

Unnamed: 0,text,segments,language,text_len,start,end,sentiment
0,"Well, that's a super powerful idea of generat...","{'id': 0, 'seek': 0, 'start': 3.06, 'end': 11....",en,96,3.06,11.52,"[{'label': 'admiration', 'score': 0.7806823253..."
1,for you.,"{'id': 1, 'seek': 0, 'start': 12.26, 'end': 12...",en,9,12.26,12.86,"[{'label': 'neutral', 'score': 0.9666472077369..."
2,Yeah.,"{'id': 2, 'seek': 0, 'start': 13.78, 'end': 13...",en,6,13.78,13.92,"[{'label': 'neutral', 'score': 0.9317912459373..."
3,That works to find the best customer for your...,"{'id': 3, 'seek': 0, 'start': 13.92, 'end': 17...",en,53,13.92,17.70,"[{'label': 'neutral', 'score': 0.7166603803634..."
4,"I mean, to me, advertisement went done well.","{'id': 4, 'seek': 0, 'start': 18.16, 'end': 20...",en,45,18.16,20.64,"[{'label': 'admiration', 'score': 0.8606323599..."
...,...,...,...,...,...,...,...
228,then we forget how to do all the stuff that t...,"{'id': 228, 'seek': 85400, 'start': 854.0, 'en...",en,54,854.00,855.94,"[{'label': 'neutral', 'score': 0.8300427794456..."
229,It's a weird trade off.,"{'id': 229, 'seek': 85400, 'start': 858.16, 'e...",en,24,858.16,859.46,"[{'label': 'disgust', 'score': 0.4792686104774..."
230,Yeah.,"{'id': 230, 'seek': 85400, 'start': 859.54, 'e...",en,6,859.54,859.76,"[{'label': 'neutral', 'score': 0.9317912459373..."
231,I agree.,"{'id': 231, 'seek': 85400, 'start': 860.24, 'e...",en,9,860.24,860.70,"[{'label': 'approval', 'score': 0.922422111034..."


## Entity Recognition

In [19]:
import spacy

# Run this:
# $ python3 -m spacy download en
nlp = spacy.load("en_core_web_lg")

In [20]:
# Label
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
labels_spacy = """
PERSON                        People, including fictional
NORP                          Nationalities or religious or political groups
FACILITY                      Buildings, airports, highways, bridges, etc.
ORGANIZATION                  Companies, agencies, institutions, etc.
GPE                           Countries, cities, states
LOCATION                      Non-GPE locations, mountain ranges, bodies of water
PRODUCT                       Vehicles, weapons, foods, etc. (Not services)
EVENT                         Named hurricanes, battles, wars, sports events, etc.
WORK OF ART                   Titles of books, songs, etc.
LAW                           Named documents made into laws
LANGUAGE                      Any named language
DATE                          Absolute or relative dates or periods
TIME                          Times smaller than a day
PERCENT                       Percentage (including “%”)
MONEY                         Monetary values, including unit
QUANTITY                      Measurements, as of weight or distance
ORDINAL                       “first”, “second”
CARDINAL                      Numerals that do not fall under another type
"""

label_lookup_table = {
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "ORGANIZATION": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOCATION": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Vehicles, weapons, foods, etc. (Not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK OF ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including “%”)",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "“first”, “second”",
    "CARDINAL": "Numerals that do not fall under another type"
}

#### Final Cleanup

In [18]:
# Drop language and segments
df = df.drop(columns='language')
df = df.drop(columns='segments')

df

Unnamed: 0,text,text_len,start,end,sentiment
0,"Well, that's a super powerful idea of generat...",96,3.06,11.52,"[{'label': 'admiration', 'score': 0.7806823253..."
1,for you.,9,12.26,12.86,"[{'label': 'neutral', 'score': 0.9666472077369..."
2,Yeah.,6,13.78,13.92,"[{'label': 'neutral', 'score': 0.9317912459373..."
3,That works to find the best customer for your...,53,13.92,17.70,"[{'label': 'neutral', 'score': 0.7166603803634..."
4,"I mean, to me, advertisement went done well.",45,18.16,20.64,"[{'label': 'admiration', 'score': 0.8606323599..."
...,...,...,...,...,...
228,then we forget how to do all the stuff that t...,54,854.00,855.94,"[{'label': 'neutral', 'score': 0.8300427794456..."
229,It's a weird trade off.,24,858.16,859.46,"[{'label': 'disgust', 'score': 0.4792686104774..."
230,Yeah.,6,859.54,859.76,"[{'label': 'neutral', 'score': 0.9317912459373..."
231,I agree.,9,860.24,860.70,"[{'label': 'approval', 'score': 0.922422111034..."


### Export

In [None]:
df.to_csv(OUTPUT_DIR + 'out_text_preprocessing.csv', columns=['text', 'start', 'end', 'duration', 'sentiment', 'entities'], index=False) 