## This notebook preprocesses transcripts so that they are easier to use with downstream models and to annotate
- Author: Bowen Yi


In [43]:
import pandas as pd
import re
import pycld2 as cld2  # to detect foreign language in transcript
import cld3

In [23]:
# Read only the columns used in this notebook: the transcript text, the
# enclosure value, the output path, and the ten category columns.
category_columns = ['category' + str(n) for n in range(1, 11)]
df = pd.read_csv(
    "pol_trans.csv",
    usecols=['transcript', 'enclosure', 'potentialOutPath'] + category_columns,
)


In [24]:
# Discard rows with no transcript; preprocess_transcript passes each
# transcript to re.sub, which needs a string.
df = df.dropna(subset=['transcript'])


In [25]:
# Rows and columns left after dropping rows with a missing transcript.
df.shape


(7813, 13)

In [26]:
# Also discard rows without an enclosure value (it becomes the 'url' column
# of the final frame).
df = df.dropna(subset=['enclosure'])


In [27]:
# Rows and columns left after dropping rows with a missing enclosure.
df.shape


(7791, 13)

In [28]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible.
# The index is not reset here, so rows keep their original index labels.
df = df.sample(frac=1, random_state=1)


In [29]:
def preprocess_transcript(text, words_to_remove=200, lower_bound_word_count=400):
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\*.*?\*', '', text)
    text = text.replace('#', '')
    text = text.encode("ascii", "ignore").decode()
    
    if len(text.split()) <= lower_bound_word_count:
        return ""
    
    # This is for chunking transcripts later. We only chunk at the end of a sentence
    sentences = re.split(r'(?<=[.!?]) ', text)
    sentences = [sentence + ' ' for sentence in sentences]
    
    # When no sentences are found, or the text doesn't contain proper sentence-ending punctuation, ignore this transcript
    if len(sentences) <= 1:
        return ""
    
    # Remove the first and last ~200 words from the transcript
    start_index = 0
    word_count = 0
    for i, sentence in enumerate(sentences):
        word_count += len(sentence.split())
        if word_count >= words_to_remove:
            start_index = i
            break
    
    # Remove the last ~200 words from the transcript
    end_index = len(sentences) - 1
    word_count = 0
    for i in range(len(sentences) - 1, -1, -1):
        word_count += len(sentences[i].split())
        if word_count >= words_to_remove:
            end_index = i
            break
    
    if end_index <= start_index or start_index == 0 or end_index == len(sentences) - 1:
        return ""

    filtered_sentences = sentences[start_index + 1:end_index]
    return filtered_sentences



In [30]:
# Clean and trim every transcript. Each cell becomes a list of sentence
# strings, or "" when preprocess_transcript rejected the transcript.
df['processed_text'] = df['transcript'].apply(preprocess_transcript)



# Remove short transcripts


In [38]:
def too_short(text, lower_bound=200):
    """Tell whether the sentences in ``text`` total at most ``lower_bound`` words.

    Args:
        text: Iterable of sentence strings. An empty iterable (including the
            empty string) counts as too short.
        lower_bound: Word count that must be exceeded for the text to be
            considered long enough.

    Returns:
        False as soon as the running word count exceeds ``lower_bound``;
        True if the sentences run out first.
    """
    words_seen = 0
    for piece in text:
        words_seen += len(piece.split())
        if words_seen > lower_bound:
            break
    else:
        # Ran out of sentences without ever exceeding the bound.
        return True
    return False


In [39]:
# Flag rows whose processed text has at most 200 words (the default bound).
# Rejected transcripts (processed_text == "") are flagged as well, since
# too_short returns True when there is nothing to iterate over.
df["too_short"] = df["processed_text"].apply(too_short)



In [40]:
# Count the rows flagged as too short.
df[df["too_short"]==True].shape


(1506, 15)

In [41]:
# Keep only the rows that were not flagged as too short.
df = df[df["too_short"]==False]

In [42]:
# Rows and columns left after removing short transcripts.
df.shape

(6285, 15)

## Remove non-English transcripts

In [44]:
def is_eng(trans):
    """Report whether cld2 detects English, and only English, in ``trans``.

    Args:
        trans: Iterable of sentence strings. They are concatenated as-is
            (no separator is inserted) before detection.

    Returns:
        True only when the first entry of cld2's ``details`` is "ENGLISH"
        and the second and third entries are both "Unknown"; False
        otherwise.
    """
    joined = "".join(trans)

    # Only the details tuple is used; the other return values are ignored.
    # The call itself is left exactly as it was so detection results cannot
    # change.
    _is_reliable, _bytes_found, details, _vectors = cld2.detect(joined, returnVectors=True)

    return (
        details[0][0] == "ENGLISH"
        and details[1][0] == "Unknown"
        and details[2][0] == "Unknown"
    )
    

In [45]:
# Flag rows whose processed sentences are detected as English only.
df["is_eng"] = df['processed_text'].apply(is_eng)



In [49]:
# Keep the English rows. is_eng returns booleans, and True compares equal
# to 1, so this selects the rows flagged True.
df = df[df["is_eng"]==1]


In [51]:
# The helper flags and the raw transcript are no longer needed once the
# rows have been filtered.
df = df.drop(columns=['is_eng', 'transcript', 'too_short'])


In [52]:
# Confirm which columns remain after the drop.
df.columns

Index(['potentialOutPath', 'enclosure', 'category1', 'category2', 'category3',
       'category4', 'category5', 'category6', 'category7', 'category8',
       'category9', 'category10', 'processed_text'],
      dtype='object')

In [53]:
# Inspect one processed transcript. The index was never reset after the
# shuffle, so [0] looks up the row labelled 0 rather than the first row.
df.processed_text[0]

['First, just absolutely appalled, just reading about what happened in the killing of George Floyd, what happened in terms of how the police interacted with them. ',
 'I think I was just appalled. ',
 "The other word I'd use is upset because this isn't a new issue. ",
 'This has happened for generations, but in our time, in our contemporary time, you could talk about killing of Michael Brown. ',
 'You could talk about the killing of Eric Garner. ',
 "You can list these names because they've been burned to America's national consciousness as a reminder of the systematic racism that is present, unfortunately, in this country. ",
 'Particularly, in our police institutions. ',
 'That was my immediate reaction. ',
 'Micah, I assume you have watched the video. ',
 'Unfortunately, I have. ',
 "I think one thing that I've seen a lot and I'm going to pass along this message for anyone who hasn't heard it, don't share this video. ",
 "This is an incredibly traumatic thing that's being shared. ",

## Convert processed_text into string

In [55]:
def convert_processed_to_str(text):
    """Concatenate an iterable of sentence strings into a single string.

    Args:
        text: Iterable of strings. No separator is inserted between them.

    Returns:
        The concatenation of all items, or "" for an empty iterable.
    """
    return ''.join(text)
    

In [56]:
# Join each row's sentence list into one string; this becomes the
# 'transcript_annotate' column of the final frame.
df['stringed_text'] = df['processed_text'].apply(convert_processed_to_str) 


In [58]:
# Confirm that 'stringed_text' was added.
df.columns

Index(['potentialOutPath', 'enclosure', 'category1', 'category2', 'category3',
       'category4', 'category5', 'category6', 'category7', 'category8',
       'category9', 'category10', 'processed_text', 'stringed_text'],
      dtype='object')

## Reorganize the dataframe columns

In [60]:
# Output column name -> column of `df` it is taken from. The order of this
# mapping is the column order of the resulting frame. The 'human_label' and
# 'note' columns are not part of it; they are added to df_final afterwards.
final_to_source = {
    'transcript_to_chunk': 'processed_text',
    'path': 'potentialOutPath',
    'url': 'enclosure',
    'cat1': 'category1',
    'cat2': 'category2',
    'cat3': 'category3',
    'cat4': 'category4',
    'cat5': 'category5',
    'cat6': 'category6',
    'cat7': 'category7',
    'cat8': 'category8',
    'cat9': 'category9',
    'cat10': 'category10',
    'transcript_annotate': 'stringed_text',
}

organized_data = {
    final_name: df[source_name]
    for final_name, source_name in final_to_source.items()
}

df_final = pd.DataFrame(organized_data)


In [61]:
# Add empty columns for annotators to fill in.
for blank_column in ('human_label', 'note'):
    df_final[blank_column] = ''


In [62]:
# Preview the first row of the reorganized frame.
df_final.head(1)

Unnamed: 0,transcript_to_chunk,path,url,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,transcript_annotate,human_label,note
5113,[So from the state and local perspective facin...,/chrt.fm/7a/httpschrt.fmtrackD3F8DGtraffic.meg...,https://chrt.fm/track/D3F8DG/traffic.megaphone...,news,politics,government,,,,,,,,So from the state and local perspective facing...,,


In [67]:
# Draw a reproducible one-sixth random sample for annotation and renumber
# the rows from 0, discarding the old index labels.
df_final = df_final.sample(frac=1/6, random_state=1).reset_index(drop=True)

In [69]:
# Write the annotation sample to disk without the index column.
# NOTE(review): 'transcript_to_chunk' holds Python lists; in the CSV they
# are presumably stored as their string form and will need parsing when the
# file is read back - confirm with the downstream reader.
# NOTE(review): the file is written before the blank-transcript filter
# further down, so that filter does not affect the saved CSV.
df_final.to_csv('transcripts_to_annotate.csv',index=False)

In [71]:
# Size of the annotation sample.
df_final.shape

(1021, 16)

In [72]:
# Final column layout of the annotation sample.
df_final.columns

Index(['transcript_to_chunk', 'path', 'url', 'cat1', 'cat2', 'cat3', 'cat4',
       'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'transcript_annotate',
       'human_label', 'note'],
      dtype='object')

In [73]:
# Drop rows whose annotation text is empty or whitespace only.
# NOTE(review): this runs after to_csv has already written the file, so it
# only changes the in-memory frame; move it before the save if the CSV is
# meant to be filtered too.
df_final = df_final[df_final['transcript_annotate'].str.strip() != '']

In [74]:
# Size after the blank-transcript filter.
df_final.shape

(1021, 16)