In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import regex as re
import warnings
warnings.filterwarnings("ignore")
import jiwer
from jiwer import wer

from functions import *

# Loading data

In [2]:
model = pd.read_pickle("./Data/assessment_model.pkl") 
voting =  pd.read_pickle("./Data/assessment_voting.pkl") 
voting = voting[voting['reference.text'].isin(model['reference.text'].unique().tolist())]
assessment_df = pd.merge(voting, model, on='reference.text').drop(['machine_wer', 'best_machine_sentence', 'best_machine'], axis=1)
assessment_df['model_class'] = np.where((assessment_df['model_wer'] < assessment_df['min_wer']) , 'better', 
                               np.where((assessment_df['model_wer'] > assessment_df['min_wer']) , 'worse', 'same'))
assessment_df.to_pickle("./Data/assessment_comparison.pkl")

In [3]:
print(assessment_df.shape)
assessment_df.head()

(2802, 10)


Unnamed: 0,reference.text,min_wer,best_config,best_hyp,voting_result,voting_wer,voting_class,model_hypothesis,model_wer,model_class
0,it was produced on one hand by the natural gro...,0.0,[3_mdagrmlg],[it was produced on one hand by the natural gr...,it was produced on one hand by the natural gro...,0.1,worse,it was produced on one hand by the natural gro...,0.1,worse
1,the upper classes know that an army of fifty t...,0.03,"[5_ae, 5_a]",[the upper classes know that an army of fifty ...,yet the classes know that an army of fifty tho...,0.135135,worse,the vacantly classes know that an army of fift...,0.216216,worse
2,but the fatal significance of universal milita...,0.0,"[5_ae, 5_a]",[but the fatal significance of universal milit...,but the fatal significance of universal milita...,0.037037,worse,but the identify of universal military service...,0.222222,worse
3,but since this is not the case and on the cont...,0.08,"[5_ae, 5_a, 3_mdagmlg]",[but since this is not the case and on the con...,but since this is not the case and on the cont...,0.052632,better,but since this is not the case and on the cont...,0.105263,worse
4,we know now that threats and punishments canno...,0.08,[3_mdagrmlg],[we now know that threats and punishments cann...,we now know that threats and punishments canno...,0.125,worse,we now know that threats and punishment cannot...,0.125,worse


In [4]:
assessment_df['voting_class'].value_counts()

same      1271
worse     1255
better     276
Name: voting_class, dtype: int64

In [5]:
assessment_df['model_class'].value_counts()

worse     2078
same       644
better      80
Name: model_class, dtype: int64

In [6]:
model_win = assessment_df[assessment_df['voting_wer'] > assessment_df['model_wer']].shape[0]
voting_win = assessment_df[assessment_df['voting_wer'] < assessment_df['model_wer']].shape[0]
print('Voting better than model: ', voting_win)
print('Model better than voting: ', model_win)

Voting better than model:  1627
Model better than voting:  168


In [7]:
print('WER mean machines: ', assessment_df['min_wer'].mean())
print('WER mean voting  : ', assessment_df['voting_wer'].mean())
print('WER mean model   : ', assessment_df['model_wer'].mean())

WER mean machines:  0.09467166309778731
WER mean voting  :  0.14165723781358147
WER mean model   :  0.22380283015956318


# Test prediction Analysis

In [8]:
train = pd.read_pickle("./Data/alignment_with_ref.pkl") 
test = pd.read_pickle("./Data/alignment_test.pkl") 

le, col = word_encoding(train, test)
train[col] = train[col].apply(lambda x :le.transform(x))

In [9]:
test_df = pd.read_pickle("./Data/Test_predictions_final(full)_pickl.pkl")

In [10]:
thresh = 50
test_df_transformed = prediction_data_preparation(test_df, le, thresh)

100%|█████████████████████████████████████████████████████████████████████████| 43316/43316 [00:02<00:00, 20502.77it/s]


In [11]:
sentences = test_df['reference.text'].unique().tolist()
predictions_sentences, selected_sentences = sentence_probability(sentences, test_df_transformed)

100%|██████████████████████████████████████████████████████████████████████████████| 2802/2802 [00:35<00:00, 78.19it/s]


In [12]:
# example rows where model performs better then best ASR machines
model_better_than_min_wer = assessment_df[assessment_df['model_wer'] < assessment_df['min_wer']].index.tolist()

In [20]:
example_sentence = 60
probability_analysis(sentences, predictions_sentences, selected_sentences, example_sentence, assessment_df)

Ground truth: 
the rector did not ask for a catechism to hear the lesson from

Best Machine: 
the rector did not ask for a catechism to hear the lesson from
-----------------------------------------------------------------------------
WER Best Machine:    0.0
-----------------------------------------------------------------------------

Voting: 
director did not ask for a catechism to hear the lesson from
-----------------------------------------------------------------------------
WER Voting:          0.15384615384615385
-----------------------------------------------------------------------------

Model prediction: 
desegregation did not ask for a to hear the lesson from
-----------------------------------------------------------------------------
WER model:          0.23076923076923078
-----------------------------------------------------------------------------

50% treshold: 
_ did not ask for a to hear the lesson from
--------------------------------------------------------------

In [17]:
#test_df_transformed[test_df_transformed['reference.text'] == 'to the fervent latter day saint a temple is not simply a church building a house for religious assembly']

# Creating comparison dataframe

In [18]:
truth = []
best_machine = []
machine_wer = []
model_prediction = []
model_wer = []
thresh = []
thresh_wer = []

for x in tqdm(range(len(sentences))):
    best_machine.append(assessment_df[assessment_df['reference.text'] == sentences[x]]['best_hyp'].tolist()[0][0])    
    truth.append(sentences[x])
    machine_wer.append(wer(sentences[x], best_machine))
    model_prediction.append(predictions_sentences[x])
    model_wer.append(wer(sentences[x], predictions_sentences[x]))
    thresh.append(selected_sentences[x])
    thresh_wer.append(wer(sentences[x], selected_sentences[x]))

compare = pd.DataFrame()
compare['truth'] = truth
compare['machine'] = best_machine
compare['machine_wer'] = machine_wer
compare['model_prediction'] = model_prediction
compare['model_wer'] = model_wer
compare['selected'] = thresh
compare['selected_wer'] = thresh_wer

100%|██████████████████████████████████████████████████████████████████████████████| 2802/2802 [43:24<00:00,  1.08it/s]


In [19]:
#compare.to_pickle(path + "/assessment_model(with_thres).pkl")

In [17]:
compare = pd.read_pickle("./Data/assessment_model(with_thres).pkl") 
compare.head()

Unnamed: 0,truth,machine,machine_wer,model_prediction,model_wer,selected,selected_wer
0,it was produced on one hand by the natural gro...,it was produced on one hand by the natural gro...,0.0,it was produced on one hand by the natural gro...,0.1,it was produced on one hand by the natural gro...,0.15
1,the upper classes know that an army of fifty t...,the upper classes know that an army of fifty t...,0.567568,the vacantly classes know that an army of fift...,0.216216,the _ _ know that an army of fifty thousand wi...,0.27027
2,but the fatal significance of universal milita...,but the fatal significance of universal milita...,2.111111,but the identify of universal military service...,0.222222,but the _ of universal military service as the...,0.222222
3,but since this is not the case and on the cont...,but since this is not the case and on the cont...,2.289474,but since this is not the case and on the cont...,0.105263,_ _ this is not the case and on the contrary m...,0.157895
4,we know now that threats and punishments canno...,we now know that threats and punishments canno...,5.125,we now know that threats and punishment cannot...,0.125,we now know that _ and _ _ diminish their numb...,0.291667


In [18]:
print('WER mean machines: ', assessment_df['min_wer'].mean())
print('WER mean voting  : ', assessment_df['voting_wer'].mean())
print('WER mean model   : ', assessment_df['model_wer'].mean())
print('WER mean model   : ', compare.selected_wer.mean())

WER mean machines:  0.09467166309778731
WER mean voting  :  0.14165723781358147
WER mean model   :  0.22380283015956318
WER mean model   :  0.30564245666530276
