# Imports

In [54]:
import json
from pandas.io.json import json_normalize

import os
import regex as re
import string
import itertools

import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

import nltk
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

import jiwer
from jiwer import wer

import pickle

from sklearn.ensemble import RandomForestClassifier
#nltk.download('words')
import warnings
warnings.filterwarnings("ignore")

# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth', -1)

In [None]:
# own functions, placed in functions.py.
from functions import *

# Import dataframe with original data

In [56]:
# Load the raw results frame and keep an untouched snapshot (`df_original`)
# so later cells can compare row counts before and after cleaning.
# NOTE: unpickling executes arbitrary code - only load pickle files from a trusted source.
df_original = pd.read_pickle("./data/df_raw.pkl")

# `df` is the working copy that the cleaning cells below overwrite.
df = df_original.copy()

# Last expression of the cell so the (rows, columns) shape is actually displayed.
df.shape

# Clean reference texts and hypothesis texts
 - initial volume: 546.592
 - after basic cleaning of ref and hyp: 402.992
 - after dropping Switchboard: 298.896

In [18]:
# `re` is the third-party `regex` package here, re-imported in this cell as before.
# NOTE(review): `from functions import *` in an earlier cell could rebind `re` -
# confirm before hoisting this import to the top cell.
import regex as re

#------------------------------------
#       BASIC CLEANING
#------------------------------------

# lowercase both reference and hypothesis
df['reference.text'] = df['reference.text'].str.lower()
df['hypothesis.text'] = df['hypothesis.text'].str.lower()

# replace t_v_, t_v_s, i_d_ with tv, tvs, id
# 't_v_s' needs no rule of its own: replacing 't_v_' with 'tv' already turns it into 'tvs'.
# regex=False makes the literal match explicit instead of relying on the pandas default.
df['reference.text'] = df['reference.text'].str.replace('t_v_', 'tv', regex=False)
df['reference.text'] = df['reference.text'].str.replace('i_d_', 'id', regex=False)

# remove from reference: :, ", {, }, [, ], $
remove_chars_list = [':', '"', '{', '}', '[', ']', '$']
pattern_remove = '|'.join(['({})'.format(re.escape(c)) for c in remove_chars_list])
# regex=True is required: the pattern is an alternation, and with regex=False
# (the default since pandas 2.0) it would be searched as one literal string and
# nothing would be removed.
df['reference.text'] = df['reference.text'].str.replace(pattern_remove, '', regex=True)

# remove leading, trailing, multiple spaces
# (raw string: '\s' in a normal string literal is an invalid escape sequence)
df['reference.text'] = df['reference.text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
df['hypothesis.text'] = df['hypothesis.text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# remove sentences with <5 words
# After the whitespace normalisation above words are separated by exactly one
# space, so at least 4 spaces means at least 5 words.
df = df[df['reference.text'].apply(lambda x: x.count(" ") >= 4)]


#------------------------------------
#       DROP SWITCHBOARD
#------------------------------------

# Keep every row whose corpus is anything other than 'switchboard_segmented'.
is_switchboard = df['corpus'] == 'switchboard_segmented'
df = df[~is_switchboard]


#----------------------------------------------
#       DROP TEXTS WITH MORE THAN 1 SPEAKER
#----------------------------------------------

# Occurrences of each distinct reference text, as a two-column frame.
ref_texts_counts = (
    df['reference.text']
    .value_counts()
    .rename_axis('reference.text')
    .reset_index(name='counts')
)

# Keep texts that occur at most 16 times.
# NOTE(review): 16 presumably equals the number of configurations, i.e. one
# row per configuration for a text read by a single speaker - confirm.
occurs_at_most_16 = ref_texts_counts['counts'].le(16)
unique_refs_1_speaker = ref_texts_counts.loc[occurs_at_most_16, 'reference.text'].unique()

df = df[df['reference.text'].isin(unique_refs_1_speaker)]

#----------------------------------------------
#       KEEP COLUMNS OF INTEREST
#----------------------------------------------

# Identifiers and metadata, the two transcripts, and the originally reported WER.
keep_columns = [
    'identifier', 'speaker_id', 'file', 'corpus', 'configuration', 'machine',
    'reference.text', 'hypothesis.text',
    'scoring.wer',
]
df = df.loc[:, keep_columns]

In [37]:
# Same column selection as the cleaning cell, plus 'recomputed_wer'.
# 'recomputed_wer' is only created by the "Recompute WER" cell further down, so
# on a top-to-bottom run it does not exist yet at this point. Select only the
# listed columns that are present instead of failing with a KeyError; once the
# column exists, re-running this cell keeps it in the listed position.
keep_columns = ['identifier', 'speaker_id', 'file', 'corpus', 'configuration', 'machine', 'reference.text', 'hypothesis.text', 'scoring.wer', 'recomputed_wer']
df = df[[col for col in keep_columns if col in df.columns]]

# Where did we lose data during cleaning?

In [39]:
# Rows per corpus in the raw snapshot and in the cleaned frame.
df_before_cleaning = df_original['corpus'].value_counts()
df_after_cleaning = df['corpus'].value_counts()

# Side-by-side table; a corpus missing from one side shows up as NaN.
df_comparison = pd.concat(
    [df_before_cleaning, df_after_cleaning],
    axis=1,
    sort=False,
)
df_comparison.columns = ['before', 'after']

# Fraction of each corpus' rows removed by cleaning, rounded to 2 decimals.
rows_removed = df_comparison['before'] - df_comparison['after']
df_comparison['lost'] = (rows_removed / df_comparison['before']).round(2)

df_comparison.sort_values(by='before', ascending=False)

Unnamed: 0,before,after,lost
rt_segmented_h,101344,48160.0,0.52
ami_segmented_h,73008,31616.0,0.57
switchboard_segmented,65680,,
commonvoice,63920,20336.0,0.68
st,61472,54912.0,0.11
librispeech_other,47024,45024.0,0.04
voxforge,46864,33200.0,0.29
librispeech_clean,41920,40544.0,0.03
timit,26880,7664.0,0.71
tedlium_segmented,18480,17440.0,0.06


# Recompute WER after the cleaning
 - the difference between **mean WER** and **mean recomputed WER** is negligible

In [31]:
# Recompute WER on the cleaned reference / hypothesis texts.
# jiwer is imported in this cell exactly as before.
# NOTE(review): `from functions import *` in an earlier cell could rebind `wer` -
# confirm before hoisting these imports to the top cell.
import jiwer
from jiwer import wer


def _row_wer(row):
    """Return jiwer's WER between one row's reference and hypothesis text."""
    return wer(row['reference.text'], row['hypothesis.text'])


df['recomputed_wer'] = df.apply(_row_wer, axis=1)

In [34]:
# Per configuration: number of rows, mean reported WER and mean recomputed WER,
# sorted ascending by the mean reported WER.
(
    df.groupby(['configuration'])
    .agg(
        count=('scoring.wer', 'size'),
        mean_wer=('scoring.wer', 'mean'),
        mean_recomputed_wer=('recomputed_wer', 'mean'),
    )
    .reset_index()
    .sort_values('mean_wer')
)

Unnamed: 0,configuration,count,mean_wer,mean_recomputed_wer
13,6_glvve,18681,0.120366,0.120182
15,7_macaglge,18681,0.123906,0.123716
14,7_macaglg,18681,0.125429,0.12524
9,5_ae,18681,0.142596,0.14244
8,5_a,18681,0.142694,0.142538
7,4_iaebglebg,18681,0.206337,0.205847
6,4_iabglbg,18681,0.207524,0.207036
10,6_glvd,18681,0.221292,0.221125
11,6_glvde,18681,0.222916,0.222744
4,3_mdagmlg,18681,0.259587,0.259502


In [41]:
# Per machine: number of rows, mean reported WER and mean recomputed WER,
# sorted ascending by the mean reported WER.
(
    df.groupby(['machine'])
    .agg(
        count=('scoring.wer', 'size'),
        mean_wer=('scoring.wer', 'mean'),
        mean_recomputed_wer=('recomputed_wer', 'mean'),
    )
    .reset_index()
    .sort_values('mean_wer')
)

Unnamed: 0,machine,count,mean_wer,mean_recomputed_wer
6,7,37362,0.124668,0.124478
4,5,37362,0.142645,0.142489
3,4,37362,0.206931,0.206441
5,6,74724,0.217417,0.217247
2,3,37362,0.262976,0.262889
1,2,37362,0.267763,0.267513
0,1,37362,0.451071,0.450892


# Order data by mean wer by configuration (over all corpora)

In [48]:
# Rank the configurations by their mean recomputed WER over all corpora
# (lowest first).
mean_wer_agg_recalc = (
    df.groupby(['configuration'])
    .agg(
        count=('recomputed_wer', 'size'),
        mean_recomputed_wer=('recomputed_wer', 'mean'),
    )
    .reset_index()
    .sort_values('mean_recomputed_wer')
)

# Configuration labels in ranked order.
order = mean_wer_agg_recalc['configuration'].values

# Turn the column into a categorical whose category order is that ranking, so
# sorting by it puts the rows in ranked-configuration order.
df['configuration'] = pd.Categorical(df['configuration'], categories=order)
df = df.sort_values(by='configuration')

# Export clean and enriched data frame

In [51]:
# Persist the cleaned frame (including the recomputed WER column and the
# configuration ordering) for later use.
clean_pickle_path = "./data/df_clean_newwer.pkl"
df.to_pickle(clean_pickle_path)