# Table Of Contents<a class="anchor" id="zero-bullet"></a>:
* [Imports](#first-bullet)
* [Global Settings](#second-bullet)
* [Import Validation Data](#third-bullet)
* [Create LHS](#fourth-bullet)
* [Analysis](#fifth-bullet)
* [Analyze Weights](#sixth-bullet)

## Imports<a class="anchor" id="first-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np
import time

from model_parser import create_model
import torch
from scipy.stats import expon
import torchvision.transforms.functional as F
from torch.utils.data import DataLoader

import json

from utils import *

  warn(f"Failed to load image Python extension: {e}")
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


# Global Settings<a class="anchor" id="second-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

In [2]:
# Init Tacotron2

run_opts = {"device": "cuda","data_parallel_count": -1,"data_parallel_backend": False,"distributed_launch": False,"distributed_backend": "nccl","jit_module_keys": None}

tacotron2 = Tacotron2_modifyed.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts", run_opts=run_opts)

## Import validation dataset<a class="anchor" id="third-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

We use different labels for RHS_1 and RHS_2 for simplicity of the training. Thus, the validation dataset has both labels. Please specity the labels of which RHS you want to work with

In [3]:
# Chose what RHS you want to import and work with
RHS = 1

In [4]:
root = fr'C:\Users\Oleg\Documents\Masterarbeit\STT\RHS_{RHS}'

In [5]:
val_df = pd.read_csv('validation_dataset.csv')

In [6]:
val_df.head()

Unnamed: 0,original,mistake_1,mistake_2,mistake_3,mistake_4,label_2,label_1
0,Sigglekow,Siggelkow,Sigglecow,Siglekow,Sigelkow,8144,830
1,Rozwaski,Rozwoski,Rozwasky,Roswaski,Rozwaski,7569,769
2,Maingain,Mainchain,Maingane,Mainagan,Maingainn,5441,570
3,Doqaj,Dokaaj,Doqja,Docaaj,Dohqaj,2309,252
4,Tamômura,Tamomura,Tomamura,Tamoemura,Tamooura,8710,878


In [7]:
with open(root + r'\labels_dict.json', 'r', encoding='utf-8') as ld:
     label_dict = json.loads(ld.read())

Inspect labels dictionary

In [8]:
label_dict

{'Sigglekow': 830,
 'Rozwaski': 769,
 'Maingain': 570,
 'Doqaj': 252,
 'Tamômura': 878,
 'Carmignac': 151,
 'Mirny': 614,
 'Guyer-Nobles': 368,
 'Streator': 858,
 'Ciubara': 179,
 'Colpeyn': 186,
 'Mariyappan': 581,
 'Wildchild': 966,
 'Goriachkovsky': 353,
 'Barodka': 65,
 'Discupta': 244,
 'Suntisook': 869,
 'Villégier': 946,
 'Kalafi': 443,
 'Batwara': 71,
 'Gootrad': 352,
 'Chetri': 166,
 'Zvanova': 995,
 'Sbiee': 796,
 'Varetskiy': 930,
 'Johnston-Henry': 434,
 'Hurpy': 405,
 'Bilheimer': 99,
 'Grantski': 356,
 'Ahrary': 10,
 'Muliere': 631,
 'Krasilovska': 495,
 'Granau': 355,
 'Fayol': 299,
 'Mudzhyri': 629,
 'Fitzmeyer': 312,
 'Omeiza': 672,
 "N'bokolo": 639,
 'Celma': 157,
 'Röhre': 776,
 'Ezinga': 291,
 'Ritziu-Ilka': 751,
 'Cumetti': 207,
 'Shaodong': 821,
 'Cheledinas': 164,
 'Galynsky': 330,
 'Gellrich': 335,
 'Prösler': 725,
 'Kamalinia': 446,
 'Hiniger': 390,
 'Stroegerer': 859,
 'Tvrdá': 912,
 'Variyath': 932,
 'Hoeps': 392,
 'Tavlaridou': 885,
 'Bittlestone': 102,
 'An

## Create LHS<a class="anchor" id="fourth-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

Transform the validation data into spectrograms

In [24]:
# create a list of misspelling and an according list of labels
misspellings = list(val_df[['mistake_1', 'mistake_2', 'mistake_3', 'mistake_4']].values.flatten())
misspellings_labels  = np.repeat(val_df[f'label_{RHS}'], 4)

In [26]:
# num samples (1000 * 4)
num_misspellings = len(misspellings)
num_misspellings

4000

In [27]:
# create spectrograms for misspellings
start = time.time()

# the smaller is the batch size, the better the produced mel spectrograms are and the more time it takes.
# Since the batch_size for generation were chosen to be 5, it's the optimal size for validation.
# Note: Producing ~4000 spectrograms with batch_size 5 takes ~100 seconds
batch_size = 5

name_batches = [misspellings[i * batch_size: (i + 1) * batch_size] for i in range(int(num_misspellings / batch_size))]

mel_outputs_postnet, texts = [], []

for n in name_batches:
    mel_output, text = tacotron2.encode_batch(n)
    mel_outputs_postnet += mel_output.cpu()
    texts += text
    
stop = time.time()

print(f'Creation of {num_misspellings} mel spectrograms took us {stop - start} seconds')

Creation of 4000 mel spectrograms took us 93.2628390789032 seconds


In [73]:
# drop nulls from each spectrogram
mel_unified = [drop_nulls(mel.cpu().squeeze()) for mel in mel_outputs_postnet]

In [74]:
# resize each spectrogram to fixed size, that was used during training
mel_unified = [F.resize(mel.unsqueeze(0).unsqueeze(1), [80, 100]) for mel in mel_unified]

In [75]:
# stack the spectrograms into one tensor
mel_unified = torch.stack(mel_unified).squeeze(1)

In [76]:
# inspect the outcome shape
mel_unified.shape

torch.Size([4000, 1, 80, 100])

In [77]:
# put the spectrograms into DataLoader and specify batch size
mel_unified = DataLoader(mel_unified, batch_size=32)

# Analysis<a class="anchor" id="fifth-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

### Initialize the desired model

In [9]:
device = torch.device('cuda') # or CPU

In [10]:
# Choose the desired model for validation
# Num labels: 1.000 : Model num
# SCNN var.1 (ReLU) : 1
# SCNN var.1 (Sigm) : 2
# SCNN var.2 (ReLU) : 3
# SCNN var.2 (Sigm) : 4

# Num labels: 10.000
# SCNN var.3 (ReLU) : 5
# SCNN var.3 (Sigm) : 6
# SCNN var.4 (ReLU) : 7
# SCNN var.4 (Sigm) : 8

model_num = 6

Import the model

In [11]:
model = import_model(root, model_num)

Forward method imported from: C:\Users\Oleg\Documents\Masterarbeit\STT\models\architecture_6
State dict imported for best loss model


Rebase to the desired device

In [71]:
model.to(device)

CNN(
  (conv_layers): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Sigmoid()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Sigmoid()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Sigmoid()
    (11): Dropout(p=0.25, inplace=False)
  )
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc_layers): Sequential(
    (0): Linear(in_features=512, ou

In [84]:
# Specify the threshold t (t <= num_labels)
t = 1000

In [78]:
# create predictions using the initialized model in evaluation mode
outputs = []
model.eval()
with torch.no_grad():
    for data in mel_unified:
        data = data.to(device)
        # data = data.permute(0, 3, 1, 2)

        # Forward pass
        output = model(data)
        _, predicted = torch.topk(output.data, t, dim=1)
        outputs.append(predicted.cpu())
predicted = torch.cat(outputs, dim=0)
to_predict = [t[2] for t in texts]

In [80]:
# find the classification distribution and the number of 'bad classifications'
found_dict = {name: -100 for name in to_predict}
for prediction, name in zip(predicted.numpy(), to_predict):
    label_to_predict = val_df[val_df.isin([name]).any(axis=1)][f'label_{RHS}'].values[0]
    # print(f"LF: {name}: {label_to_predict}")
    try:
        found = list(prediction).index(label_to_predict)
        # print(f'Found on pos {found}')
    except ValueError:
        found = -1
        # print('Not found.')
    found_dict[name] = found
    
# Uncomment to show the distribution histogram
# sns.set_style("whitegrid")
# sns.set(rc={"figure.figsize": (30, 10)})
# sns.histplot(x=list(found_dict.values()))

Number of the exact matches, i.e. threshold t = 0

In [82]:
exact_matches = sum([1 for name, pos in found_dict.items() if pos == 0])
print(f'{exact_matches} exact matches - {num_misspellings - exact_matches} missmatches  - {((num_misspellings-exact_matches)*100)/num_misspellings:.2f} %')

2926 exact matches - 1074 missmatches  - 26.85 %


In [85]:
nbins = 50
fig = go.Figure()
fig.add_trace(go.Histogram(x=list(found_dict.values()), nbinsx=nbins))
fig.update_layout(
    # title_text='Sampled Results', # title of plot
    xaxis_title_text='Pos. found', # xaxis label
    yaxis_title_text=f'Aggregate ({int(max(list(found_dict.values()))/nbins)} per bin)', # yaxis label
    # bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.01 # gap between bars of the same location coordinates
)

Max displaced label

In [91]:
max(found_dict.values())

914

Inspect the cluster under t = 300

In [88]:
t = 300
cluster = [v for k, v in found_dict.items() if v <= t]

In [90]:
num_not_found = sum(np.array(list(found_dict.values())) > t)
print(f'Not found ratio = {num_not_found / len(found_dict) * 100:.2f}% <<===>> {num_not_found} names')

Not found ratio = 0.53% <<===>> 21 names


In [92]:
fig = px.histogram(x=cluster, nbins=nbins)
fig.update_layout(
    # title_text='Sampled Results', # title of plot
    xaxis_title_text='Pos. found', # xaxis label
    yaxis_title_text=f'Aggregate ({int(max(cluster)/nbins)} per bin)', # yaxis label
    # bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.01 # gap between bars of the same location coordinates
)

In [160]:
# Save the Figure
fig.write_image('var1 relu.png')

In [94]:
# Fit the matching distribution to exponential distribution

distribution = np.array(list(found_dict.values()))
fit_params = expon.fit(distribution)
fit_params

(-1.0, 7.99)

fit_params are (bias, expected_value + bias)

In [95]:
# Calculate the expected value, variance, skewnes and kurtosis
mean, var, skew, kurt = expon.stats(moments='mvsk', loc=fit_params[0], scale=fit_params[1])

In [98]:
mean

array(6.99)

Show the fitness curve

In [99]:
x = np.linspace(fit_params[0], max(cluster), max(cluster))
fig = px.histogram(x=cluster, histnorm='probability density', nbins=nbins)
fig.add_trace(go.Line(x=x, y=expon.pdf(x, loc=fit_params[0], scale=fit_params[1]), name='Expon. distr.'))

fig.update_layout(
    # title_text='Sampled Results', # title of plot
    xaxis_title_text='Pos. found', # xaxis label
    yaxis_title_text=f'Probability density', # yaxis label
    # bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.01 # gap between bars of the same location coordinates
)


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [172]:
fig.write_image('expon_distr.png')

Analyse the labels, that weren't predicted correctly

In [100]:
unpredicted = {i: {'original':kv[0], 'misspelled': val_df[val_df.isin([kv[0]]).any(axis=1)]['original'].values[0]} for i, kv in enumerate(found_dict.items()) if kv[1] == -1}

In [101]:
unpredicted

{872: {'original': 'Hyrkkoo', 'misspelled': 'Hyrkkö'},
 874: {'original': 'Hyrko', 'misspelled': 'Hyrkkö'},
 1584: {'original': 'Oztunc', 'misspelled': 'Öztunc'},
 3101: {'original': 'Boihemm', 'misspelled': 'Boihem'},
 3349: {'original': 'Mlado…цi„?', 'misspelled': 'Mladosic'},
 3427: {'original': 'Bartkovaa', 'misspelled': 'Bártková'},
 3592: {'original': 'Waukeee', 'misspelled': 'Wauke'},
 3814: {'original': 'Hruzik', 'misspelled': 'Hruzík'},
 3815: {'original': 'Hruzzik', 'misspelled': 'Hruzík'}}

# Analyze weights<a class="anchor" id="sixth-bullet"></a>

[Back to the Table of Contents](#zero-bullet)

In [105]:
fig = make_subplots(rows=8, cols=8)

In [111]:
model_parameters = dict(model.named_parameters())

In [113]:
list(model_parameters.keys())

['conv_layers.0.weight',
 'conv_layers.0.bias',
 'conv_layers.1.weight',
 'conv_layers.1.bias',
 'conv_layers.4.weight',
 'conv_layers.4.bias',
 'conv_layers.5.weight',
 'conv_layers.5.bias',
 'conv_layers.8.weight',
 'conv_layers.8.bias',
 'conv_layers.9.weight',
 'conv_layers.9.bias',
 'lstm.weight_ih_l0',
 'lstm.weight_hh_l0',
 'lstm.bias_ih_l0',
 'lstm.bias_hh_l0',
 'lstm.weight_ih_l0_reverse',
 'lstm.weight_hh_l0_reverse',
 'lstm.bias_ih_l0_reverse',
 'lstm.bias_hh_l0_reverse',
 'lstm.weight_ih_l1',
 'lstm.weight_hh_l1',
 'lstm.bias_ih_l1',
 'lstm.bias_hh_l1',
 'lstm.weight_ih_l1_reverse',
 'lstm.weight_hh_l1_reverse',
 'lstm.bias_ih_l1_reverse',
 'lstm.bias_hh_l1_reverse',
 'fc_layers.0.weight',
 'fc_layers.0.bias',
 'fc_layers.1.weight',
 'fc_layers.1.bias',
 'fc_layers.3.weight',
 'fc_layers.3.bias']

In [119]:
# chose the weights (implemented for convolution layers only)
layer = 'conv_layers.0.weight'

In [125]:

# Create an empty list to store the Heatmap objects
heatmaps = []
weights = model_parameters[layer]

num_rows = 8
num_columns = int(np.ceil(weights.shape[0]/8))

# Generate the Heatmap objects without plotting them
for i in weights:
    kernel = i.detach().cpu().squeeze().numpy()
    heatmap = go.Heatmap(z=kernel, colorscale='Blues', coloraxis = "coloraxis")
    heatmaps.append(heatmap)

# Plot the heatmaps without colorbar
fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=['Filter {}'.format(i+1) for i in range(weights.shape[0])])

for i, heatmap in enumerate(heatmaps):
    fig.add_trace(heatmap, row=int(i % num_rows)+1, col=int(i / num_rows)+1)
    fig.update_yaxes(autorange=True, row=int(i % num_rows)+1, col=int(i / num_rows)+1)
# Configure the layout and display the figure
fig.update_layout(
    margin=dict(t=20, r=200, b=10, l=10),
    width=700,
    height=700,
    autosize=False,
)
# fig.update_yaxes(autorange="reversed")

# Adjust the size of subplot titles
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=10)  # Adjust the font size of the titles

# Add a separate colorbar to the layout

fig.show()




In [137]:
fig.write_image('kernels.png')