## Pipeline for Modeling Speech Recognition Error using NLP and RNN Architecture

To summarize the modeling pipeline, here are the major steps:

1. Data Mining: Speech Recognition of Clean and Noisy Audio
2. Data Cleaning: Textual Processing using NLP
3. Data Wrangling: Speech Error Detection
4. Data Selection: Filtering Error Detect Samples
5. Data Preparation: Subsampling and One-Hot Encoding
6. Prediction Modelling: RNN and Evaluation
7. Output Prediction 

In [0]:
######################################################################################################
################     Data Mining: Speech Recognition of Clean and Noisy Audio     ####################
######################################################################################################

# Function for transcribing clean speech
def audio_transcript(loc):
  """Return the recognizer's transcript for the audio file at ``loc``.

  The file is opened with ``sr.AudioFile``, read in full through the
  module-level recognizer ``r``, and the recording is handed to
  ``r.recognize_google`` while the file is still open.

  Args:
    loc: path of the audio file to transcribe.

  Returns:
    Whatever ``r.recognize_google`` returns for the recording.
  """
  with sr.AudioFile(loc) as source:
    recording = r.record(source)
    transcript = r.recognize_google(recording)
    return transcript


# Function for transcribing noisy speech (returns every alternative transcript)
def audio_transcript_all(loc):
  """Return all candidate transcripts the recognizer offers for ``loc``.

  The recording is sent to ``r.recognize_google`` with ``show_all=True``.
  A falsy response yields an empty list; otherwise the ``'transcript'``
  field of every entry under the response's ``'alternative'`` key is
  collected, in the order given.

  Args:
    loc: path of the audio file to transcribe.

  Returns:
    A list of transcript strings (possibly empty).
  """
  with sr.AudioFile(loc) as source:
    recording = r.record(source)
    response = r.recognize_google(recording, show_all=True)
    if response:
      return [candidate['transcript'] for candidate in response['alternative']]
    return []

# Clean and Noise audio clip speech to text conversion
# For every file name in trainset_df the clean clip is transcribed into the
# 'clean_speech' column and up to five alternative transcripts of the noisy
# clip are stored in 'noisy_speech_Out1' .. 'noisy_speech_Out5'.
cnt = 0
for cnt, wavfil in enumerate(trainset_df['file_name'], start=1):
  # Boolean mask selecting the row(s) that belong to this file name
  cond = trainset_df['file_name'] == wavfil
  cln_soundfile = '/content/gdrive/My Drive/clean_trainset/clean_trainset_28spk_wav/'+str(wavfil)+'.wav'
  noi_soundfile = '/content/gdrive/My Drive/noisy_trainset/noisy_trainset_28spk_wav/'+str(wavfil)+'.wav'

  # Transcribe the clean clip once and reuse the result: every call to
  # audio_transcript re-reads the file and runs the recognizer again.
  clean_mat = audio_transcript(cln_soundfile)
  if not clean_mat:
    print ("Empty Clean Speech Found")
  else:
    trainset_df.loc[cond,'clean_speech'] = clean_mat

  noise_mat = audio_transcript_all(noi_soundfile)
  # Only the first five alternatives are kept, one per output column
  for cnt1, elem in enumerate(noise_mat[:5]):
    col = 'noisy_speech_Out'+str(cnt1+1)
    trainset_df.loc[cond,col] = elem
  print("Number of Speech trancripted to text {}".format(cnt))

# Rows whose clean clip produced no transcript (bare expression: displayed
# as the cell output when run in a notebook)
trainset_df[trainset_df['clean_speech'].isnull()]

# Keep only the rows that have a clean transcript and renumber them:
# reset_index() moves the old index into an 'index' column, which is then
# discarded.
cond_clean = trainset_df['clean_speech'].notnull()
trainset_edit_df = (
    trainset_df[cond_clean]
    .reset_index()
    .drop(['index'], axis=1)
)
trainset_edit_df

######################################################################################################
################     Data Cleaning: Textual Processing using NLP     #################################
######################################################################################################

# Number-to-words converter: `p.number_to_words` is used further down to spell
# out digit tokens. The `!pip` line is an IPython/Colab shell escape, so this
# cell only runs inside a notebook kernel, not as a plain Python script.
!pip install inflect
import inflect
p = inflect.engine()

# Load the spaCy English pipeline; `nlp` is the tokenizer used by every
# text-processing step below.
# NOTE(review): 'en' is a shortcut model name - presumably this targets
# spaCy 2.x; newer releases need the full package name (e.g. 'en_core_web_sm').
# Confirm against the installed spaCy version.
nlp = spacy.load('en')

# all the processing work is done below, so it may take a while
def _normalize_transcript(raw_text):
  """Return ``raw_text`` as lower-case words separated by single spaces.

  Apostrophes are removed and '/' is spoken as ' by ' before the text is
  tokenized with spaCy. Punctuation and whitespace tokens are skipped,
  alphabetic tokens are lower-cased, digit tokens are spelled out with
  inflect, and any other token (neither purely alphabetic nor purely
  digits) is dropped.

  Args:
    raw_text: the transcript string to normalise.

  Returns:
    The normalised transcript as a single string.
  """
  cleaned = raw_text.replace("\'","").replace("/"," by ")
  words = []
  for token in nlp(cleaned):
    if token.is_punct or token.is_space:
      continue
    if token.is_alpha:
      words.append(token.lower_)
    elif token.is_digit:
      # Hand inflect the token's text rather than the spaCy Token object
      words.append(p.number_to_words(token.text))
  return " ".join(words)


# One pass per column: each row is normalised exactly once instead of
# re-scanning the whole column for every row.
noise_output_df['clean_speech_edit'] = noise_output_df['clean_speech'].apply(_normalize_transcript)
noise_output_df['noisy_speech_edit'] = noise_output_df['noisy_speech'].apply(_normalize_transcript)

######################################################################################################
################     Data Wrangling: Speech Error Detect                             #################
######################################################################################################

def word_mismatch2(mat1,mat2, nb):
  """Mask the words of ``mat1`` that also occur nearby in ``mat2``.

  The word at position ``i`` of ``mat1`` is looked up in the window
  ``mat2[i - nb : i + nb + 1]`` (clipped to the bounds of ``mat2``). A word
  found there is replaced by the placeholder ``'xxxxxx'``; a word that is
  not found is kept as-is and counted as a mismatch.

  Args:
    mat1: sequence of words to check.
    mat2: sequence of words to compare against.
    nb: number of positions searched on each side of ``i``.

  Returns:
    A dict with ``'sent'`` (the masked words joined by spaces, or
    ``'xxxxxx'`` when that string is empty) and ``'num_word'`` (the number
    of mismatched words).
  """
  masked_words = []
  misnum = 0
  for pos, word in enumerate(mat1):
    # Slice ends are exclusive, hence the +1: without it the window would
    # reach nb words back but only nb-1 words forward.
    window = mat2[max(0, pos - nb):pos + nb + 1]
    if word in window:
      masked_words.append('xxxxxx')
    else:
      masked_words.append(word)
      misnum += 1

  outp = " ".join(masked_words)
  if not outp:
    outp = 'xxxxxx'
  return {'sent': outp, 'num_word' : misnum}

# clean / noisy edit
# Compare the normalised clean and noisy transcripts of every row in both
# directions. Each direction is computed once per row and the results are
# written back as whole columns, so the assignment does not depend on the
# frame's index labels.
clean_detect_rows = []
noisy_detect_rows = []
for clean_text, noisy_text in zip(noise_output_df['clean_speech_edit'],
                                  noise_output_df['noisy_speech_edit']):
  a_mat = [token.lower_ for token in nlp(clean_text)]
  b_mat = [token.lower_ for token in nlp(noisy_text)]
  clean_detect_rows.append(word_mismatch2(a_mat,b_mat,2))
  noisy_detect_rows.append(word_mismatch2(b_mat,a_mat,2))

noise_output_df['clean_speech_edit_detect'] = [res['sent'] for res in clean_detect_rows]
noise_output_df['noisy_speech_edit_detect'] = [res['sent'] for res in noisy_detect_rows]
noise_output_df['clean_speech_edit_detect_num'] = [res['num_word'] for res in clean_detect_rows]
noise_output_df['noisy_speech_edit_detect_num'] = [res['num_word'] for res in noisy_detect_rows]

######################################################################################################
################     Data Selection: Filtering Error Detect Samples                  #################
######################################################################################################

# Choose instances where only one word was errored
# NOTE(review): the mask is built from noise_output_df but applied to
# trainset_noise_df - presumably both frames share the same index; confirm
# where trainset_noise_df is created.
single_noisy_error = noise_output_df['noisy_speech_edit_detect_num'] == 1
single_clean_error = noise_output_df['clean_speech_edit_detect_num'] == 1
trainset_noise_df = (
    trainset_noise_df[single_noisy_error & single_clean_error]
    .reset_index()
    .drop(['index'], axis=1)
)

######################################################################################################
################     Data Prep: Subsampling and Hot Encoding         #################################
######################################################################################################

def input_split(text, veclen):
  """Split ``text`` into overlapping three-word windows.

  The lower-cased spaCy tokens of ``text`` are padded with a '#' marker on
  both ends. For every position below ``veclen`` that is not the final
  marker and whose token is not '#', the previous, current and next tokens
  are joined with spaces into one string.

  Args:
    text: sentence to split.
    veclen: upper bound (exclusive) on the padded positions visited.

  Returns:
    A list of "prev cur next" strings, one per visited token.
  """
  padded = ["#", *(token.lower_ for token in nlp(text)), "#"]
  stop = min(veclen, len(padded) - 1)
  return [
      " ".join(padded[centre - 1:centre + 2])
      for centre in range(stop)
      if padded[centre] != '#'
  ]

def output_mat_split(text, origtext, veclen):
  """Build one 4-element target vector per token of the masked sentence.

  ``text`` is a sentence in which matched words have been replaced by
  'xxxxxx', so any remaining real word marks an error. The lower-cased
  tokens are padded with '#' on both ends, and for every visited token the
  (previous, current, next) window is encoded as:

    index 0 - no error word in the window,
    index 1 - error word in the previous slot,
    index 2 - error word in the current slot,
    index 3 - error word in the next slot.

  When several slots hold an error word the weight is shared equally
  between them (1/2 each, or 1/3 each).

  Args:
    text: masked sentence ('xxxxxx' in place of matched words).
    origtext: unused; kept so existing callers keep working.
    veclen: upper bound (exclusive) on the padded positions visited.

  Returns:
    A list of 4-element lists, one per visited token.
  """
  third = 1/3.0
  # Key: (error in previous slot, error in current slot, error in next slot)
  targets = {
      (False, False, False): [1,0,0,0],
      (False, True,  False): [0,0,1,0],
      (True,  False, False): [0,1,0,0],
      (False, False, True):  [0,0,0,1],
      (False, True,  True):  [0.0,0.0,0.5,0.5],
      (True,  False, True):  [0.0,0.5,0.0,0.5],
      (True,  True,  False): [0.0,0.5,0.5,0.0],
      (True,  True,  True):  [0.0,third,third,third],
  }
  no_error = ('xxxxxx', '#')

  nlp_text = ["#"] + [token.lower_ for token in nlp(text)] + ["#"]
  outmat = []
  for ind in range(min(veclen, len(nlp_text) - 1)):
    if nlp_text[ind] == '#':
      continue
    key = (
        nlp_text[ind-1] not in no_error,
        nlp_text[ind] != 'xxxxxx',
        nlp_text[ind+1] not in no_error,
    )
    # Copy so callers never share (and mutate) the table's lists
    outmat.append(list(targets[key]))
  return outmat

# Preparing three string inputs for Neural Network using Subsampling
# Windows are gathered in a plain list and converted to an array once;
# calling np.append inside the loop would copy the whole array every time.
nn_input_rows = []
for orig_text in trainset_noise_df['noisy_speech_edit']:
  nn_input_rows.extend(input_split(orig_text,100))
nn_input_mat = np.array(nn_input_rows)

# Preparing dummy array output for Neural Network using Subsampling
nn_output_rows = []
for orig_text, detect_text in zip(trainset_noise_df['noisy_speech_edit'],
                                  trainset_noise_df['noisy_speech_edit_detect']):
  nn_output_rows.extend(output_mat_split(detect_text,orig_text,100))
# Always a 2-D float array with 4 columns, even for zero or one sample
nn_output_loc_mat = np.array(nn_output_rows, dtype=float).reshape(-1, 4)

from keras.preprocessing import text
from keras.preprocessing import sequence
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000

# The filter list does not contain '#', so the '#' padding marker used in the
# three-word windows stays a token. The backslash is written as '\\' so the
# literal has no invalid escape sequence; the filter characters are unchanged.
tokenizer_inp = text.Tokenizer(num_words=MAX_NB_WORDS, filters='!"$%&()*+,-/:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer_inp.fit_on_texts(nn_input_mat)
word_index_inp = tokenizer_inp.word_index
print('Found %s unique tokens.' % len(word_index_inp))

# Every input is a window of three words
MAX_SEQUENCE_LENGTH = 3
X = tokenizer_inp.texts_to_sequences(nn_input_mat)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

X = X.astype('float')
# asarray guarantees Y has a .shape even if the targets are a plain list
Y = np.asarray(nn_output_loc_mat)

print('Shape of data X tensor:', X.shape)
print('Shape of data Y tensor:', Y.shape)

######################################################################################################
################       Prediction Modelling : RNN Modelling and Evaluation            ################
###################################################################################################### 

import tensorflow as tf
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, SpatialDropout1D
from keras import optimizers

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 6000
# This is fixed.
EMBEDDING_DIM = 16
# Number of Epoch to Test
epochs = 10
# Batch Size of NN
batch_size = 320

# The embedding table must cover every index the tokenizer can emit;
# len(word_index_inp) + 1 is an upper bound on those indices, so the table is
# widened beyond MAX_NB_WORDS only when the vocabulary needs it.
embedding_input_dim = max(MAX_NB_WORDS, len(word_index_inp) + 1)

# Defining the NN Model Architecture (Learning Rate : 0.01)
model_p1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(embedding_input_dim, EMBEDDING_DIM, input_length=X.shape[1]),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(4, activation='softmax')
])
# Keep the original spelling as an alias: the model used to be bound to
# `model_pl` while the fit call below referred to `model_p1`.
model_pl = model_p1

# Pass the configured optimizer object to compile(); the string 'adam' would
# build a fresh optimizer with the default learning rate instead of 0.01.
adam = tf.keras.optimizers.Adam(learning_rate=0.01)
model_p1.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model_p1.summary()

# Fitting the Model on the current data
history_p1 = model_p1.fit(X, Y, batch_size=batch_size, epochs=epochs, shuffle = True, validation_split=0.1, verbose=1)

######################################################################################################
################     Output Prediction                               #################################
######################################################################################################

def pred_conv(input,nnmodel):
  """Print, for each sentence, the words the model flags as errors.

  Every sentence is tokenized with spaCy, padded with '#' on both ends and
  cut into three-word windows (previous, current, next). The windows are
  encoded with ``tokenizer_inp`` and classified by ``nnmodel``; the
  predicted class says which slot of the window holds the error word
  (1 = previous, 2 = current, 3 = next, 0 = none). Each word is reported
  with the number of windows that pointed at it divided by 3, rounded to
  two decimals. The '#' padding marker is never reported.

  Args:
    input: iterable of sentences (strings).
    nnmodel: model whose ``predict`` returns one 4-class score row per window.

  Returns:
    None; one line per sentence is printed.
  """
  from collections import Counter

  MAX_SEQUENCE_LENGTH = 3
  for text in input:
    nlp_text = ["#"] + [token.lower_ for token in nlp(text)] + ["#"]
    inpmat = [" ".join(nlp_text[ind-1:ind+2]) for ind in range(1,len(nlp_text)-1)]

    # A sentence without tokens gives no windows; skip the model call
    if not inpmat:
      print(text,"\t---------->\t",["NO ERROR DETECTED"])
      continue

    # Sequence tokenizing
    token_inpmat = tokenizer_inp.texts_to_sequences(inpmat)
    token_inpmat = sequence.pad_sequences(token_inpmat, maxlen=MAX_SEQUENCE_LENGTH)
    prb_outmat = nnmodel.predict(token_inpmat)
    outmat = [np.argmax(submat) for submat in prb_outmat]

    # Window `pos` is centred on nlp_text[pos + 1], so class rt (1, 2 or 3)
    # points at nlp_text[pos + rt - 1].
    cnt_word = []
    for pos, rt in enumerate(outmat):
      if rt in (1, 2, 3):
        cnt_word.append(nlp_text[pos + rt - 1])

    # Count the votes per word, ignoring the padding marker
    votes = Counter(wrd for wrd in cnt_word if wrd != '#')
    fnl_mat = [{wrd : round(num/3,2)} for wrd, num in sorted(votes.items())]
    if not fnl_mat:
      fnl_mat = ["NO ERROR DETECTED"]
    print(text,"\t---------->\t",fnl_mat)
  return
