In [52]:
import pandas as pd
import numpy as np 
from gensim import models
from lib import weat
from lib import weat_main
import os
import scipy
import matplotlib.pyplot as plt
import importlib
from scipy import stats
import seaborn as sns
import fasttext
import fasttext.util
from sklearn.model_selection import train_test_split
import wave

In [None]:
# Locations of the CMU Haitian speech inputs and of the DeepSpeech export target.
# Every path sits under the same project data directory, so that prefix is written once.
DATA_ROOT = '/Users/adimaini/Documents/GW/Research/CODE.nosync/WEAT-WEFAT/data/'
CMU_ROOT = DATA_ROOT + 'cmu_haitian_speech/'

# transcript listing handed to process_cmu
TXT_FILE = CMU_ROOT + 'etc/txt.done.data'
# directory prefix (trailing slash included) prepended to each wav file name
PATH_TO_WAV = CMU_ROOT + 'wav/'
# prefix (trailing slash included) prepended to the exported CSV file names
EXPORT_PATH = DATA_ROOT + 'mozilla_deepspeech/cmu/'

### Read the CMU speech transcripts

In [9]:
def process_cmu(txt_file):
    '''Load the CMU speech transcript listing into a shuffled DataFrame.

    The listing is used to create the transcripts for AWS Transcribe.

    Parameters
    ----------
    txt_file : path of the transcript listing. Each line is split on the
        double-quote character; presumably lines look like `( <id> "<text>" )`
        (TODO confirm against the data file).

    Returns
    -------
    DataFrame with the columns 'id' and 'text', rows shuffled with a fixed
    seed (random_state=5) so the order is reproducible.
    '''
    # splitting on the double quote yields three fields; the last one (column 2) is dropped
    raw_fields = pd.read_csv(txt_file, sep='"', header=None)
    frame = raw_fields.drop(columns=[2])
    frame.columns = ['id', 'text']

    # remove the leading parenthesis and the surrounding blanks from the id,
    # and the surrounding blanks from the text
    frame['id'] = [value.lstrip('( ').strip() for value in frame['id']]
    frame['text'] = [value.lstrip().strip() for value in frame['text']]

    # fixed-seed shuffle: gives random speakers for the accuracy sub sample
    return frame.sample(frac=1, random_state=5)

In [23]:
def cmu_mozilla(transcript, path_to_wav):
    '''Reshape the CMU transcript table into the layout used to train Mozilla DeepSpeech.

    Parameters
    ----------
    transcript : DataFrame with the columns 'id' and 'text'. It is NOT modified;
        the renaming happens on a copy so the caller can keep using the
        original 'id' / 'text' columns afterwards.
    path_to_wav : string prepended verbatim to every id to build the wav path,
        so it has to end with the path separator.

    Returns
    -------
    New DataFrame with the columns 'wav_filename', 'wav_filesize' and
    'transcript' (in that order), keeping the row order and index of the input.

    Raises
    ------
    OSError if one of the wav files cannot be found (from os.path.getsize).
    '''
    # rename() without inplace returns a new frame, leaving the caller's frame untouched
    result = transcript.rename(columns={'id': 'wav_filename', 'text': 'transcript'})
    result['wav_filename'] = path_to_wav + result['wav_filename'].astype(str) + '.wav'
    # size in bytes of each audio file, looked up on disk
    result['wav_filesize'] = [os.path.getsize(file) for file in result['wav_filename']]

    col_order = ['wav_filename', 'wav_filesize', 'transcript']
    return result[col_order]

In [66]:
def test_train_val_split(df, export_path): 
    '''split the transcript df into 3 CSVs for test, train, and validation'''

    train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

    # exports the csv's 
    for index, file in enumerate([train, validate, test]): 
        file_path = export_path + str(index) + '.csv'
        file.to_csv(file_path)
        
    return train, test, validate

In [67]:
# Process the CMU transcripts for DeepSpeech and export the files:
# process_cmu loads the shuffled transcript table, cmu_mozilla builds the
# wav_filename / wav_filesize / transcript table, and test_train_val_split
# writes the CSV files under EXPORT_PATH and returns the three subsets.
transcript = process_cmu(TXT_FILE)
transcript_mo = cmu_mozilla(transcript, PATH_TO_WAV)
train, test, val = test_train_val_split(transcript_mo, EXPORT_PATH)

### Read the CMU audio files and order them in the same fashion as the transcript data

In [69]:
# List the names of the entries found in the CMU wav directory.
# NOTE(review): os.listdir returns the names in arbitrary order, so this line on its
# own does not put the audio files in the same order as the transcript data.
dir_file = os.listdir(path=PATH_TO_WAV)

In [70]:
dir_file

['dyfw_n03.wav',
 'defw_a02.wav',
 'ewmw_i15.wav',
 'ELMW_4K15.wav',
 'BWFW_3G21.wav',
 'CYMW_2I21.wav',
 'ewmw_i01.wav',
 'ELMW_4K01.wav',
 'CVFW_2S18.wav',
 'dyfw_n17.wav',
 'defw_a16.wav',
 'BHFW_JJ16.wav',
 'BFMS_4J17.wav',
 'ECMN_3A03.wav',
 'ERFS_4O22.wav',
 'cjfw_k06.wav',
 'BOFW_3Z11.wav',
 'BOFW_3Z05.wav',
 'BFMS_4J03.wav',
 'ECMN_3A17.wav',
 'cjfw_k12.wav',
 'BHFW_JJ02.wav',
 'CVFW_2S24.wav',
 'BWFW_3G09.wav',
 'CYMW_2I09.wav',
 'ERMN_2I02.wav',
 'CKMW_2N01.wav',
 'fcmx_x20.wav',
 'CTFW_3Q04.wav',
 'CAMW_3N19.wav',
 'DDFN_2N21.wav',
 'new-U16.wav',
 'CSMW_3P13.wav',
 'EHFW_3H13.wav',
 'MRAB0002_g14.wav',
 'CSMW_3P07.wav',
 'EHFW_3H07.wav',
 'new-U02.wav',
 'DBMW_3L19.wav',
 'CTFW_3Q10.wav',
 'EQME_4N08.wav',
 'ERMN_2I16.wav',
 'BGFW_MM24.wav',
 'CKMW_2N15.wav',
 'EQME_4N20.wav',
 'bemw_a22.wav',
 'CAMW_3N25.wav',
 'DDFN_2N09.wav',
 'BGFW_MM18.wav',
 'fcmx_x08.wav',
 'DBMW_3L25.wav',
 'EYMW_2Q09.wav',
 'ffmn_y07.wav',
 'EMMW_4J19.wav',
 'BIFW_3i13.wav',
 'dxmw_p13.wav',
 'BIFW

In [7]:
# Write one combined .wav file out of the first `no_samples` transcripts. To match the
# duration of the radio transcription only a subset of the transcripts is used at first.
# NOTE(review): an earlier note spoke of 200 transcripts (31 minutes) while no_samples
# is 180 -- confirm which figure is the intended one.
# Expects `transcript` to still carry the 'id' column produced by process_cmu.

outfile = 'data/cmu_haitian_speech/aws/cmu_combined.wav'
no_samples = 180

# collect [params, raw frames] for every selected recording; the `with` block
# closes each input file even when reading fails
data = []
for t_id in transcript.id[:no_samples]:
    # PATH_TO_WAV is the notebook-level constant; no lowercase path_to_wav exists at this level
    file = PATH_TO_WAV + t_id + '.wav'
    with wave.open(file, 'rb') as w:
        data.append([w.getparams(), w.readframes(w.getnframes())])

# the output reuses the parameters of the first recording
# (assumes every recording shares the same channels / sample width / rate -- TODO confirm)
with wave.open(outfile, 'wb') as output:
    output.setparams(data[0][0])
    for _, frames in data:
        output.writeframes(frames)

In [8]:
# Join the text of the first `no_samples` transcripts into one space-separated string.
text = ' '.join(transcript[:no_samples].text.values)

In [9]:
text

'Konsa tou, sa nou wè koulye a, se tankou yon pòtrè nap gade yon jan twoub twoub nan yon glas. Avantaj baraj Pelig la t ap bay peyizan Latibonit yo chans pou jwenn dlo pandan tout sezon an. Kouman ou ka aji ak moun tankou si yo te pwason nan lanmè, tankou koulèv nan raje ki pa gen mèt? Parabòl ja lajan an Peyi Wa ki nan syèl la, se tankou yon ja lajan ki te anba tè nan yon jaden. Aprè sa, li pale avèk sa ki sou bò gòch li yo, li di: wete kò nou sou mwen, nou menm ki gen madichon. Epi li rete konsa li wè gwo lafimen ap moute sòti nan tè a tankou lafimen k ap leve nan yon gwo founo dife. Epi li rete konsa li wè gwo lafimen ap moute sòti nan tè a tankou lafimen k ap leve nan yon gwo founo dife. Se pou l tounen dènye klas esklav frè l yo. Lè sa a, se wa Akaz, pitit gason Jotam, pitit pitit gason Ozyas, ki t ap gouvène nan peyi Jida. Se li ki sèvi m ranpa, se li ki tout delivrans mwen, se li ki pwoteksyon mwen, se anba zèl li mwen kache. Dirijan politik lavalas yo tankou bon koutye malfektè

In [10]:
import json

In [11]:
aws_file = '/Users/adimaini/Documents/GW/Research/CODE.nosync/WEAT-WEFAT/data/cmu_haitian_speech/aws/asrOutput.json'

# Parse the JSON file inside a `with` block so the handle is closed as soon as
# loading finishes (or fails); the encoding is stated explicitly so that reading
# the accented characters does not depend on the platform default.
with open(aws_file, encoding='utf-8') as f:
    aws_output = json.load(f)

In [12]:
# Pull the transcript string of the first entry in the 'transcripts' list of the results.
aws_transcript = aws_output['results']['transcripts'][0]['transcript']

In [134]:
!pip install jiwer

Defaulting to user installation because normal site-packages is not writeable
Collecting jiwer
  Downloading jiwer-2.2.0-py3-none-any.whl (13 kB)
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 858 kB/s eta 0:00:01
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25ldone
[?25h  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-macosx_10_14_x86_64.whl size=81520 sha256=e4784d8f9d1791bbbff98460ad0c502217f1e9e506a05a7b4c0c30aaab046a63
  Stored in directory: /Users/adimaini/Library/Caches/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein, jiwer
Successfully installed jiwer-2.2.0 python-Levenshtein-0.12.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m 

In [13]:
# Word error rate between the joined CMU transcript text and the AWS transcript.
# NOTE(review): both strings are passed exactly as they are (punctuation and casing
# included) -- confirm this is the intended comparison before interpreting the score.
from jiwer import wer
error = wer(text, aws_transcript)

In [14]:
error

0.988116458704694

In [30]:
len(text)

15812

In [31]:
text

'Konsa tou, sa nou wè koulye a, se tankou yon pòtrè nap gade yon jan twoub twoub nan yon glas. Avantaj baraj Pelig la t ap bay peyizan Latibonit yo chans pou jwenn dlo pandan tout sezon an. Kouman ou ka aji ak moun tankou si yo te pwason nan lanmè, tankou koulèv nan raje ki pa gen mèt? Parabòl ja lajan an Peyi Wa ki nan syèl la, se tankou yon ja lajan ki te anba tè nan yon jaden. Aprè sa, li pale avèk sa ki sou bò gòch li yo, li di: wete kò nou sou mwen, nou menm ki gen madichon. Epi li rete konsa li wè gwo lafimen ap moute sòti nan tè a tankou lafimen k ap leve nan yon gwo founo dife. Epi li rete konsa li wè gwo lafimen ap moute sòti nan tè a tankou lafimen k ap leve nan yon gwo founo dife. Se pou l tounen dènye klas esklav frè l yo. Lè sa a, se wa Akaz, pitit gason Jotam, pitit pitit gason Ozyas, ki t ap gouvène nan peyi Jida. Se li ki sèvi m ranpa, se li ki tout delivrans mwen, se li ki pwoteksyon mwen, se anba zèl li mwen kache. Dirijan politik lavalas yo tankou bon koutye malfektè

In [15]:
aws_transcript

"Sato, ça nous est que Houllier, c'est un couillon poterie. La brigade ayant jeune troupe tout non avantage, bar espiègle à tabac Et puis Zola, Thibault, Nico Chance. Plus besoin de l'eau pour la toutes saisons comme en UK a agi à comment on s'y déploie. Son armée d'un beaucoup les visages équipages Ah après ça lit pas aller avec ça qui sous bogoss Lio Midi où étaient couenne sous moins nous mêmes qu'il aime à Dijon. Et puis il était con ça le we go la fumée a pu noter saute si noter d'un coup la fumée qu'appeler Wayne en gros faune aux effets. Et puis il était concerne les oui voilà filmé à monter soi Tina There than Colas filmé câblée Wedding of How c'est pour donner des des classes risquent la foule Lézards c'est waaa cas petit gars songent à me petit à petit garçon Ozias, quitte à gouverner un pays, c'est lui qui savait moins pas, c'est lui qui tout délie. Passe moi c'est l'équipe au Texas, au moins c'est en bas Et les mots cachés Division politique Lavalas d'un coup, Bon coup Thie

In [17]:
# Load the Bellezza translations table; its 'Haitian Creole word' column is what
# gets looked up in the transcript text.
df = pd.read_csv('/Users/adimaini/Documents/GW/Research/CODE.nosync/WEAT-WEFAT/data/Bellezza Translations')

In [29]:
# Count how many of the Bellezza Haitian Creole words occur as a token of `text`.
# The text is split once, outside the loop, and kept in a set so each lookup is cheap.
# Tokens come from a plain split on spaces, so they keep their punctuation and casing
# (e.g. 'tou,'): a word only counts when it matches a token exactly.
text_tokens = set(text.split(' '))
count = sum(word in text_tokens for word in df['Haitian Creole word'])
count

69

In [23]:
len(aws_transcript.split(' '))

3255