In [1]:
#run if working dir currently same dir as notebook

In [1]:
cd ../asr/data/asr_dataset

/project/graziul/ra/wdolan/lang-of-pol/asr/data/asr_dataset


## Data Loading
author: Eric Chandler

In [1]:
# Uncomment if the cells aren't printing stuff
# import logging
# root_logger = logging.getLogger()
# root_logger.setLevel(logging.DEBUG)
# root_logger.addHandler(logging.StreamHandler())

In [2]:
from asr_dataset.police import BpcETL, AmbiguityStrategy
from asr_dataset.constants import DataSizeUnit, Cluster

cluster = Cluster['RCC']
etl = BpcETL(cluster, filter_inaudible=False, filter_uncertain=False, filter_numeric=False, ambiguity=AmbiguityStrategy.ALL)
# This should NOT throw errors about PySoundFile etc. 
# Sometimes it works sometimes it doesn't... I really hate midway :(
data = etl.etl()

## Transcript Labeling
author: Eric Chandler

In [3]:
# Label data as good/bad
data = data.assign(inaudible = data['text'].str.contains('|'.join(etl.BAD_WORDS), regex=True, case=False),
                    uncertain = lambda x: ~x['inaudible'] & x['text'].str.contains('\[.+\]', regex=True),
                    clean = lambda x: ~x['inaudible'] & ~x['uncertain'])

f"{data['inaudible'].sum()} inaudible and {data['uncertain'].sum()} uncertain and {data['clean'].sum()} clean"

'7907 inaudible and 10099 uncertain and 41013 clean'

In [4]:
labeler = {'clean': 0, 'uncertain': 1, 'inaudible': 2}
unlabeler = {0: 'clean', 1:'uncertain', 2:'inaudible'}

In [5]:
import numpy as np
import pandas as pd

In [6]:
data['label'] = pd.Series(np.zeros(len(data)))
data.loc[data['inaudible'], 'label'] = labeler['inaudible']
data.loc[data['clean'], 'label'] = labeler['clean']
data.loc[data['uncertain'], 'label'] = labeler['uncertain']

## Content analysis
william dolan

In [7]:
def clean_string(string):
    return string.replace('[','').replace(']','').replace('<X>', '')

groups = data.groupby('label')
clean = groups.get_group(0.0).copy()
uncertain = groups.get_group(1.0).copy()
inaudible = groups.get_group(2.0).copy()

uncertain['text'] = uncertain['text'].apply(clean_string)
inaudible['text'] = inaudible['text'].apply(clean_string)

In [8]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

stopwords = set(STOPWORDS)
stopwords.update(["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE", "TEN"])
stopwords.update(["TWENTY", "THIRTY", "FORTY", "FIFTY", "SIXTY", "SEVENTY", "EIGHTY", "NINETY", "FOURTY"])
stopwords.update(["ELEVEN", "TWELVE", "THIRTEEN", "FOURTEEN", "FIFTEEN", 
                  "SIXTEEN", "SEVENTEEN", "EIGHTEEN", "NINETEEN"])

In [9]:
clean_text = "".join(text for text in clean['text'])
uncertain_text = "".join(text for text in uncertain['text'])
inaudible_text = "".join(text for text in inaudible['text'])

clean_wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(clean_text)
uncertain_wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(uncertain_text)
inaudible_wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(inaudible_text)

In [11]:
#uncomment to visualize wordcloud
#plt.imshow(clean_wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()

In [12]:
#uncomment to visualize wordcloud
#plt.imshow(uncertain_wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()

In [13]:
#uncomment to visualize wordcloud
#plt.imshow(inaudible_wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()

In [14]:
masculine_words = ["MAN", "HE", "HIM", "HIS", "MALE", "SON", "FATHER", "HE'S", "HE'LL", "HIMSELF",
                   "HE'D", "BOYFRIEND", "UNCLE", "MASCULINE", "BOY", "MEN", "BOYS", "MALES"]
feminine_words = ["WOMAN", "SHE", "HER", "HERS", "FEMALE", "DAUGHTER", "MOTHER", "SHE'S", "SHE'LL", "HERSELF",
                  "SHE'D", "GIRLFRIEND", "AUNT", "FEMININE", "GIRL", "WOMEN", "GIRLS", "FEMALES"]
gendered_words = masculine_words + feminine_words

#TODO get placenames from csv?
place_words = ["LOCATION", "AT", "IN"]

In [15]:
def topic_freq(corp_string, topic):
    topic_word_count = 0
    corp_words = corp_string.split()
    for word in corp_words:
        if word in topic:
            topic_word_count = topic_word_count + 1
    return (topic_word_count / len(corp_words))


#TODO get placenames from csv?
def place_freq(corp_string):
    place_word

print("confident gendered words freq: ", topic_freq(clean_text, gendered_words))
print("uncertain gendered words freq: ", topic_freq(uncertain_text, gendered_words))
print("inaudible gendered words freq: ", topic_freq(inaudible_text, gendered_words))
print("confident place words freq: ", topic_freq(clean_text, place_words))
print("uncertain place words freq: ", topic_freq(uncertain_text, place_words))
print("inaudible place words freq: ", topic_freq(inaudible_text, place_words))

confident gendered words freq:  0.01677860238395649
uncertain gendered words freq:  0.019337135943306542
inaudible gendered words freq:  0.019256245268735806
confident place words freq:  0.01925503318887487
uncertain place words freq:  0.025332728372655777
inaudible place words freq:  0.028269303557910675
