# Vocab & Tokenization & Padding

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import json

# Input CSVs; both are read for their 'transcript' column.
train_csv_path = 'C:/AIC/new_Start/train.csv'
adapt_csv_path = 'C:/AIC/new_Start/adapt.csv'

train_df = pd.read_csv(train_csv_path)
adapt_df = pd.read_csv(adapt_csv_path)

# Stack both splits so the vocabulary is fitted on every transcript.
combined_df = pd.concat([train_df, adapt_df])


def _wrap_transcripts(transcripts):
    """Return the transcripts with NaN -> '', cast to str and wrapped in start/end markers."""
    as_text = transcripts.fillna('').astype(str)
    return as_text.apply(lambda text: '<start> ' + text + ' <end>')


combined_df['transcript'] = _wrap_transcripts(combined_df['transcript'])
train_df['transcript'] = _wrap_transcripts(train_df['transcript'])
adapt_df['transcript'] = _wrap_transcripts(adapt_df['transcript'])

# Build the vocabulary. NOTE: the recorded cell output shows the markers stored
# as 'start' / 'end' (ids 2 and 3), i.e. without the angle brackets.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(combined_df['transcript'])
word_index = tokenizer.word_index

# Persist the word -> id mapping so later stages can rebuild the vocabulary.
vocab_path = 'C:/AIC/LAST_ONE/vocab'
os.makedirs(vocab_path, exist_ok=True)
word_index_file = os.path.join(vocab_path, 'word_index.json')
with open(word_index_file, 'w') as handle:
    handle.write(json.dumps(word_index))

# Quick sanity check of the mapping
print("Word index (first 10):", list(word_index.items())[:10])

# Transcripts -> lists of token ids
train_sequences = tokenizer.texts_to_sequences(train_df['transcript'])
adapt_sequences = tokenizer.texts_to_sequences(adapt_df['transcript'])

# Both splits are padded to the longest sequence found in either of them.
longest_train = max(map(len, train_sequences))
longest_adapt = max(map(len, adapt_sequences))
max_length = max(longest_train, longest_adapt)

# Zeros are appended after the tokens ('post') up to max_length.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **pad_options)
adapt_padded = pad_sequences(adapt_sequences, **pad_options)

# Report the resulting array shapes
print("Train sequences shape:", train_padded.shape)
print("Adapt sequences shape:", adapt_padded.shape)

# Write the padded arrays and the padding length next to each other.
preprocessed_data_path = 'C:/AIC/LAST_ONE/preprocessed/data'
os.makedirs(preprocessed_data_path, exist_ok=True)

for file_name, payload in (
    ('train_padded.npy', train_padded),
    ('adapt_padded.npy', adapt_padded),
    ('max_length.npy', max_length),
):
    np.save(os.path.join(preprocessed_data_path, file_name), payload)


Word index (first 10): [('<OOV>', 1), ('start', 2), ('end', 3), ('في', 4), ('و', 5), ('أنا', 6), ('يعني', 7), ('من', 8), ('اللي', 9), ('إن', 10)]
Train sequences shape: (50715, 94)
Adapt sequences shape: (2199, 94)


# Feature Extraction

In [None]:
import librosa
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Directories holding the .wav files of each split
train_audio_path = 'C:/AIC/new_Start/train/'
adapt_audio_path = 'C:/AIC/new_Start/adapt/'

# CSV files listing the audio ids of each split (read from their 'audio' column below)
train_csv_path = 'C:/AIC/new_Start/train.csv'
adapt_csv_path = 'C:/AIC/new_Start/adapt.csv'

train_df, adapt_df = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, adapt_csv_path)
)

# Define a function to extract MFCC features from an audio file
def extract_features(file_path, n_mfcc=13):
    """Summarise one audio file as a single vector of averaged MFCCs.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        The MFCC matrix averaged over axis 1 (presumably the frame axis, which
        would give one value per coefficient -- confirm against librosa docs).
    """
    # sr=None is forwarded to librosa.load; the rate it reports is reused below.
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(coefficients, axis=1)

# One feature vector per training clip; the clip id plus '.wav' gives the file name.
train_features = np.array([
    extract_features(os.path.join(train_audio_path, wav_id + '.wav'))
    for wav_id in tqdm(train_df['audio'])
])

# Same procedure for the adaptation clips.
adapt_features = np.array([
    extract_features(os.path.join(adapt_audio_path, wav_id + '.wav'))
    for wav_id in tqdm(adapt_df['audio'])
])

# Report the shapes of both feature matrices
print("Train features shape:", train_features.shape)
print("Adapt features shape:", adapt_features.shape)

# Store the matrices so the audio does not have to be processed again.
preprocessed_data_path = 'C:/AIC/LAST_ONE/preprocessed/data'
os.makedirs(preprocessed_data_path, exist_ok=True)

for file_name, matrix in (
    ('train_features.npy', train_features),
    ('adapt_features.npy', adapt_features),
):
    np.save(os.path.join(preprocessed_data_path, file_name), matrix)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Input, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import os
import json
from tensorflow.keras.optimizers import Adam


# Custom Layers
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input, split into
    ``num_heads`` heads of size ``embed_dim // num_heads``, attended
    independently and merged back through a final dense projection.
    No attention mask is applied.

    Args:
        embed_dim: Width of the projections and of the layer output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        # Fail early: otherwise the reshape in `separate_heads` breaks at run time
        # with a much less helpful message.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.projection_dim = embed_dim // num_heads

        self.query_dense = Dense(embed_dim)
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)
        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(softmax(Q K^T / sqrt(d_k)) V, attention weights)``."""
        score = tf.matmul(query, key, transpose_b=True)
        # Cast to the score dtype (not a fixed float32) so the division also
        # works when the layer computes in another float type.
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, seq, heads, head_dim) and move heads to axis 1."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` and return a tensor of width ``embed_dim``."""
        batch_size = tf.shape(inputs)[0]
        query = self.separate_heads(self.query_dense(inputs), batch_size)
        key = self.separate_heads(self.key_dense(inputs), batch_size)
        value = self.separate_heads(self.value_dense(inputs), batch_size)
        attention, _ = self.attention(query, key, value)
        # Undo the head split: heads go back next to head_dim, then get merged.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super(MultiHeadSelfAttention, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
        })
        return config

class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention block followed by a position-wise feed-forward network.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        embed_dim: Width of the block input and output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after attention, inside and after the FFN.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept only so `get_config` can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dropout(rate), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block.

        Args:
            inputs: Tensor whose last dimension is ``embed_dim``.
            training: Dropout switch. ``None`` (the default) leaves the decision
                to Keras, so dropout is only active while fitting.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Forward the flag explicitly so the Dropout inside the FFN follows it too.
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position table can address.
        vocab_size: Number of token ids the token table can address.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Kept only so `get_config` can report the constructor arguments.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # Positions are derived from the actual length of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast over the leading (batch) axis.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

# Read the four preprocessed arrays from the Kaggle dataset directory.
train_padded, adapt_padded, train_features, adapt_features = (
    np.load(array_file)
    for array_file in (
        '/kaggle/input/newdata/train_padded.npy',
        '/kaggle/input/newdata/adapt_padded.npy',
        '/kaggle/input/newdata/train_features.npy',
        '/kaggle/input/newdata/adapt_features.npy',
    )
)

# Insert a length-1 axis at position 1 (used as the time dimension by the encoder input).
train_features = train_features[:, np.newaxis]
adapt_features = adapt_features[:, np.newaxis]

# Vocabulary mapping (word -> id). `vocab_path` is not used further down in
# this cell; the JSON is read from the Kaggle dataset directory instead.
vocab_path = 'C:/AIC/LAST_ONE/vocab'
with open('/kaggle/input/newdata/word_index.json', 'r') as handle:
    word_index = json.loads(handle.read())

# One extra id on top of the vocabulary for the padding token.
vocab_size = 1 + len(word_index)
embedding_dim = 512  # Adjust embedding size as needed
# The padded sequences all share this length.
max_length = train_padded.shape[1]

# Hyper-parameters of the transformer
embed_dim = 512             # width of every embedding / hidden representation
num_heads = 8               # attention heads per block
ff_dim = 1024               # hidden width of the feed-forward sub-layer
num_transformer_blocks = 6  # blocks stacked in each of encoder and decoder
dropout_rate = 0.2

# Encoder Model
# NOTE(review): `encoder_outputs` is never fed into the decoder stack below, so
# the model output depends on `decoder_inputs` only and the audio features are
# ignored. Connecting the two (e.g. via cross-attention) is an architecture
# change and is intentionally left out of this fix.
encoder_inputs = Input(shape=(train_features.shape[1], train_features.shape[2]))  # (batch_size, 1, feature_dim)
encoder_dense = Dense(embed_dim)(encoder_inputs)
encoder_outputs = Dropout(dropout_rate)(encoder_dense)
for _ in range(num_transformer_blocks):
    encoder_transformer = TransformerBlock(embed_dim, num_heads, ff_dim, rate=dropout_rate)
    # training=None (instead of a hard-coded True) lets Keras decide at run
    # time: dropout is applied while fitting and switched off for validation
    # and predict(). With training=True dropout would stay on permanently.
    encoder_outputs = encoder_transformer(encoder_outputs, training=None)

# Decoder Model
# NOTE(review): the decoder blocks use plain self-attention without a causal
# mask, so every position can attend to later tokens of `decoder_inputs`.
decoder_inputs = Input(shape=(max_length,))
embedding_layer = TokenAndPositionEmbedding(max_length, vocab_size, embed_dim)
x = embedding_layer(decoder_inputs)
x = Dropout(dropout_rate)(x)
for _ in range(num_transformer_blocks):
    decoder_transformer = TransformerBlock(embed_dim, num_heads, ff_dim, rate=dropout_rate)
    # See the encoder loop: the training flag is left to Keras.
    x = decoder_transformer(x, training=None)
x = Dropout(dropout_rate)(x)
# Per-position probability distribution over the vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(x)

# Complete Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

# Compile the model; the sparse loss takes integer token ids as targets.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare decoder target data (shifted by one timestep): target[t] is the token
# at position t + 1 of the decoder input; the last position stays 0.
# `max_length` equals train_padded.shape[1], so one slice assignment replaces
# the per-token Python loop and produces the same array.
decoder_target_data = np.zeros((train_padded.shape[0], max_length, 1), dtype="float32")
decoder_target_data[:, :-1, 0] = train_padded[:, 1:]

# Split the data into training and validation sets (80% / 20%, fixed seed so
# the split is reproducible). All three arrays are split with the same indices.
X_train_encoder, X_val_encoder, X_train_decoder, X_val_decoder, y_train, y_val = train_test_split(
    train_features, train_padded, decoder_target_data, test_size=0.2, random_state=42)

# Train the model
# Note: The checkpoint callback has been commented out. You can uncomment and configure as needed.
# checkpoint_path = "C:/AIC/LAST_ONE/checkpoints/cp-{epoch:04d}.ckpt"
# checkpoint_dir = os.path.dirname(checkpoint_path)
# os.makedirs(checkpoint_dir, exist_ok=True)
# checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, save_best_only=True, monitor='val_loss', mode='min', verbose=1)

In [None]:
# The model takes two inputs: encoder features first, decoder token ids second.
fit_inputs = [X_train_encoder, X_train_decoder]
validation_inputs = [X_val_encoder, X_val_decoder]

model.fit(
    x=fit_inputs,
    y=y_train,
    validation_data=(validation_inputs, y_val),
    batch_size=32,  # Adjust batch size as needed
    epochs=3,  # Adjust epochs as needed
    verbose=1,
    # callbacks=[checkpoint_callback],  # Add callbacks if needed
)

# Save the trained model


In [None]:
# Write the trained model to a .keras file under /kaggle/working.
model.save('/kaggle/working/transformer_model.keras')

# Inference

In [None]:
# Assumes the trained model is available in the variable `model`.

# Pick one sample from the adaptation set and add a batch axis of size 1.
sample_index = 5  # Change this index to select different samples
input_encoder = np.expand_dims(adapt_features[sample_index], axis=0)
input_decoder = np.expand_dims(adapt_padded[sample_index], axis=0)

# Predict on the sample: one vocabulary distribution per decoder position.
predictions = model.predict([input_encoder, input_decoder])

# id -> word lookup built from the word -> id vocabulary
reverse_word_index = {index: word for word, index in word_index.items()}


def decode_token_ids(token_ids):
    """Translate token ids into words, skipping ids missing from the vocabulary.

    Ids without a vocabulary entry (presumably only the padding id 0) are
    dropped instead of being rendered as a placeholder.
    """
    words = []
    for token_id in token_ids:
        word = reverse_word_index.get(int(token_id))
        if word is not None:
            words.append(word)
    return words


# Most likely token at each position of the single sample in the batch
decoded_prediction = decode_token_ids(np.argmax(predictions[0], axis=-1))

# The tokens that were fed to the decoder
decoded_expected_output = decode_token_ids(input_decoder[0])

# Print the results in a structured format
print("Sample Input (Encoder):")
# Show the features that were actually fed to the model (adaptation set);
# the previous code printed the training-set row with the same index.
print(input_encoder[0])

print("\nExpected Output (Decoder Input):")
print(decoded_expected_output)

print("\nPredicted Output (Decoder Output):")
print(decoded_prediction)


# Load Model

In [None]:
# Load the saved model; the custom layer classes must be passed so Keras can
# rebuild them. `load_model` is reached through `tf.keras.models` because the
# bare name is not imported anywhere in this notebook.
# The path is the location the notebook saved to; change it if the file moved.
model_path = '/kaggle/working/transformer_model.keras'
loaded_model = tf.keras.models.load_model(model_path, custom_objects={
    'MultiHeadSelfAttention': MultiHeadSelfAttention,
    'TransformerBlock': TransformerBlock,
    'TokenAndPositionEmbedding': TokenAndPositionEmbedding
})