# 🦅**Go to NN**

In [None]:
# Input essential libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import os, re, csv, math, codecs

# Plot styling shared by every figure in the notebook.
sns.set_style("whitegrid")

# Seed every source of randomness, not just NumPy: Keras weight
# initialisation, dropout masks and batch shuffling draw from the Python and
# backend generators, so seeding NumPy alone leaves training runs
# non-reproducible.
SEED = 50
np.random.seed(SEED)
keras.utils.set_random_seed(SEED)

# Upper bound on the vocabulary kept by the Keras tokenizer.
MAX_NB_WORDS = 10000


In [None]:
# Mount Google Drive at /content/drive; the data and embedding paths used
# below all live under that mount point. This import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Imports for loading the pre-trained FastText word vectors (gensim)
import gensim
from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors

In [None]:
# Load the pre-trained vectors stored in Facebook's FastText binary format.
# NOTE(review): the file name suggests 300-d Amharic Common Crawl vectors — confirm.
lm = load_facebook_vectors('/content/drive/MyDrive/cc.am.300.bin.gz')

In [None]:
# Report how many entries the loaded vector store exposes via len().
n_vectors = len(lm)
print('found %s word vectors' % n_vectors)

In [None]:
# Load the pre-split train / test CSV files (first row is the header).
train_df = pd.read_csv('/content/drive/MyDrive/New_Approach/train_90.csv', sep=',', header=0)
test_df = pd.read_csv('/content/drive/MyDrive/New_Approach/test_10.csv', sep=',', header=0)

# Missing text must be handled in BOTH splits: the length statistics and the
# tokenisation below call string methods on every row and would fail on NaN.
# Only the text column is filled for train so that a missing label is not
# silently turned into the string '_NA_'.
train_df['text'] = train_df['text'].fillna('_NA_')
test_df = test_df.fillna('_NA_')

print("train set: ", train_df.shape[0])
print("test set: ", test_df.shape[0])

# Selecting with a list keeps the targets 2-D: shape (n_samples, 1).
label_names = ["label"]
y_train = train_df[label_names].values
y_test = test_df[label_names].values

In [None]:
# Number of space-separated tokens in each training document.
train_df['doc_len'] = train_df['text'].map(lambda text: len(text.split(" ")))

# Padding length: mean plus one standard deviation of the training lengths,
# rounded to the nearest integer.
train_len_mean = train_df['doc_len'].mean()
train_len_std = train_df['doc_len'].std()
max_seq_len = np.round(train_len_mean + train_len_std).astype(int)

# Length histogram with the chosen padding length marked as a dashed line.
fig, ax = plt.subplots()
sns.histplot(train_df['doc_len'], kde=True, color='b', label='doc_len', ax=ax)
ax.axvline(x=max_seq_len, color='k', linestyle='--', label='max_len')
ax.set_title('text length')
ax.legend()
plt.show()

In [None]:
# Visualise the length distribution of the test documents.
test_df['doc_len'] = test_df['text'].apply(lambda words: len(words.split(" ")))

# max_seq_len is deliberately NOT recomputed here. It is the padding length
# used by pad_sequences and by the models' Embedding layers, so it must stay
# derived from the training split only; recomputing it from the test split
# would leak test-set statistics into preprocessing. The dashed line shows
# where the training-derived cut-off falls on the test distribution.
sns.histplot(test_df['doc_len'], kde=True, color='b', label='doc_len')
plt.axvline(x=max_seq_len, color='k', linestyle='--', label='max_len')
plt.title('text length'); plt.legend()
plt.show()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# NLTK regex tokenizer that keeps runs of word characters and drops
# everything else. It has its own name so that `tokenizer` always refers to
# the Keras Tokenizer that later cells use.
regexp_tokenizer = RegexpTokenizer(r'\w+')

raw_docs_train = train_df['text'].tolist()
raw_docs_test = test_df['text'].tolist()
num_classes = len(label_names)

# Normalise every document to space-joined word tokens.
processed_docs_train = [
    " ".join(regexp_tokenizer.tokenize(doc)) for doc in tqdm(raw_docs_train)
]
processed_docs_test = [
    " ".join(regexp_tokenizer.tokenize(doc)) for doc in tqdm(raw_docs_test)
]

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
# Fit the vocabulary on the training documents only, so the test split stays
# unseen. Test words that are not in the vocabulary are dropped by
# texts_to_sequences, which is what also happens to new text at inference.
tokenizer.fit_on_texts(processed_docs_train)
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

# Pad / truncate every sequence to max_seq_len; zeros are appended at the end.
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len, padding='post')
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len, padding='post')

In [None]:
# Training hyper-parameters.
batch_size, num_epochs = 64, 20

# Model hyper-parameters.
num_filters, embed_dim, weight_decay = 64, 300, 1e-4

In [None]:
# Build the embedding table: row i holds the pre-trained vector of the word
# whose tokenizer id is i.
words_not_found = []

# Keras word ids start at 1 (0 is the padding id), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows. Without the +1 the
# highest word id falls outside the table whenever the vocabulary is smaller
# than MAX_NB_WORDS.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        # The tokenizer never emits ids >= num_words, so no row is needed.
        continue
    try:
        embedding_vector = lm.get_vector(word)
    except KeyError:
        # NOTE(review): some gensim versions raise KeyError instead of
        # returning a vector when no character n-gram is available — confirm
        # against the installed version.
        embedding_vector = None
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        # Words without a vector keep their all-zero row.
        words_not_found.append(word)

# Count all-zero rows, skipping row 0, which is the padding id and is zero by
# design rather than a missing word.
null_rows = int(np.sum(~embedding_matrix[1:].any(axis=1)))
print('number of null word embeddings: %d' % null_rows)

## ✔ **CNN**

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from keras import optimizers, regularizers

# CNN classifier: pre-trained (trainable) embeddings -> two 1-D convolutions
# with pooling -> small regularised dense layer -> sigmoid output.
cnn_layers = [
    Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=True),
    Conv1D(32, 5, activation='relu', padding='same'),
    MaxPooling1D(2),
    Conv1D(16, 5, activation='relu', padding='same'),
    GlobalMaxPooling1D(),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
model_1 = Sequential(cnn_layers)

# Adam with an explicit learning rate; binary cross-entropy for the single sigmoid unit.
optimizer = optimizers.Adam(learning_rate=0.0001)
model_1.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Early stopping on validation loss: an epoch counts as an improvement only
# if val_loss drops by at least 0.01, and training stops after 4 epochs
# without one.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    min_delta=0.01,
    verbose=1,
)
callbacks_list = [early_stopping]

In [None]:
# Train the CNN. The epoch budget comes from `num_epochs` (hyper-parameter
# cell) instead of a second hard-coded 20, so it is changed in one place.
# The last 10% of the training arrays are held out for validation.
hist1 = model_1.fit(
    word_seq_train, y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks_list,
)

Note: this model was fine-tuned a little.

In [None]:
# CNN: training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(hist1.history['loss'], lw=2.0, color='b', label='train_loss')
ax.plot(hist1.history['val_loss'], lw=2.0, color='r', label='val_loss')
ax.set_title('Amharic Idiom Recognition')
ax.set_xlabel('Epochs')
ax.set_ylabel('Cross-Entropy Loss')
ax.legend(loc='upper right')
plt.show()

In [None]:
# CNN: training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(hist1.history['accuracy'], lw=2.0, color='b', label='train_accuracy')
ax.plot(hist1.history['val_accuracy'], lw=2.0, color='r', label='val_accuracy')
ax.set_title('Amharic Idiom Recognition ')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='upper left')
plt.show()

## ✔ **LSTM**

In [None]:
# Stacked LSTM classifier on frozen pre-trained embeddings (trainable=False),
# with dropout between the stages and a regularised dense head.
lstm_layers = [
    Embedding(nb_words, embed_dim, weights=[embedding_matrix], trainable=False),
    LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True),
    Dropout(0.3),
    LSTM(32, dropout=0.3, recurrent_dropout=0.3),
    Dropout(0.3),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
]
model_2 = Sequential(lstm_layers)
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the LSTM. The epoch budget comes from `num_epochs` (hyper-parameter
# cell) instead of a second hard-coded 20, so it is changed in one place.
hist2 = model_2.fit(
    word_seq_train, y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks_list,
)

In [None]:
# LSTM: training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(hist2.history['loss'], lw=2.0, color='b', label='train_loss')
ax.plot(hist2.history['val_loss'], lw=2.0, color='r', label='val_loss')
ax.set_title('Amharic Idiom Recognition')
ax.set_xlabel('Epochs')
ax.set_ylabel('Cross-Entropy Loss')
ax.legend(loc='upper right')
plt.show()

In [None]:
# LSTM: training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(hist2.history['accuracy'], lw=2.0, color='b', label='train_accuracy')
ax.plot(hist2.history['val_accuracy'], lw=2.0, color='r', label='val_accuracy')
ax.set_title('Amharic Idiom Recognition ')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='upper left')
plt.show()

## ✔ **Bi-LSTM**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Two stacked bidirectional LSTMs on trainable pre-trained embeddings,
# followed by a regularised dense layer and a sigmoid output.
bilstm_layers = [
    Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=True),
    Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.3)),
    Dropout(0.4),
    Bidirectional(LSTM(32, recurrent_dropout=0.3)),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
model_3 = Sequential(bilstm_layers)
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the Bi-LSTM. The epoch budget comes from `num_epochs`
# (hyper-parameter cell) instead of a second hard-coded 20.
hist3 = model_3.fit(
    word_seq_train, y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks_list,
)

In [None]:
# Bi-LSTM: training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(hist3.history['loss'], lw=2.0, color='b', label='train_loss')
ax.plot(hist3.history['val_loss'], lw=2.0, color='r', label='val_loss')
ax.set_title('Amharic Idiom Recognition')
ax.set_xlabel('Epochs')
ax.set_ylabel('Cross-Entropy Loss')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Bi-LSTM: training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(hist3.history['accuracy'], lw=2.0, color='b', label='train_accuracy')
ax.plot(hist3.history['val_accuracy'], lw=2.0, color='r', label='val_accuracy')
ax.set_title('Amharic Idiom Recognition ')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='upper left')
plt.show()

## ✔ **GRU**

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Dropout
from keras import optimizers
from keras.callbacks import EarlyStopping

# Two stacked GRU layers on trainable pre-trained embeddings, followed by a
# regularised dense layer and a sigmoid output.
gru_layers = [
    Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=True),
    GRU(32, return_sequences=True, recurrent_dropout=0.4),
    Dropout(0.5),
    GRU(16, recurrent_dropout=0.4),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.03)),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
]
model_4 = Sequential(gru_layers)

# Adam with an explicit learning rate; binary cross-entropy for the single sigmoid unit.
optimizer = optimizers.Adam(learning_rate=0.0001)
model_4.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


In [None]:

# Early stopping on validation loss with a patience of 3 epochs; the weights
# of the best epoch are restored when it triggers.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

In [None]:
# Train the GRU with early stopping, like the other three models. The
# callback had been commented out here, which left the `early_stopping`
# object defined just above unused and made the GRU the only model trained
# for the full epoch budget. The epoch budget comes from `num_epochs`.
hist4 = model_4.fit(
    word_seq_train, y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# GRU: training vs. validation loss per epoch.
# The figure title previously read 'Idiom Classifcation' (misspelled).
fig, ax = plt.subplots()
ax.plot(hist4.history['loss'], lw=2.0, color='b', label='train_loss')
ax.plot(hist4.history['val_loss'], lw=2.0, color='r', label='val_loss')
ax.set_title('Idiom Classification')
ax.set_xlabel('Epochs')
ax.set_ylabel('Cross-Entropy Loss')
ax.legend(loc='upper right')
plt.show()

In [None]:
# GRU: training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(hist4.history['accuracy'], lw=2.0, color='b', label='train_accuracy')
ax.plot(hist4.history['val_accuracy'], lw=2.0, color='r', label='val_accuracy')
ax.set_title('Idiom Classification ')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='upper left')
plt.show()

## ***📊Evaluation Result***

In [None]:
import numpy as np
import tensorflow as tf

# Number of rows in the models' embedding tables. Taken from `nb_words`, the
# value the Embedding layers were built with, instead of a hand-copied
# constant that goes stale whenever the data or tokenizer changes.
vocab_size = nb_words

# Clamp any token id that falls outside the embedding table. This is done in
# place because the later evaluation cells reuse `word_seq_test`.
# NOTE: vocab_size - 1 is the id of a regular vocabulary word, not a
# dedicated <UNK> token; the tokenizer was created without an oov_token.
out_of_bounds_indices = np.where(word_seq_test >= vocab_size)
if out_of_bounds_indices[0].size > 0:
    word_seq_test[out_of_bounds_indices] = vocab_size - 1

# Convert word_seq_test to a TensorFlow tensor with a defined shape.
word_seq_test_tensor = tf.constant(word_seq_test)

# Test-set loss and accuracy of the CNN.
model_1.evaluate(word_seq_test_tensor, y_test)

In [None]:
# Hard 0/1 predictions of the CNN: probability strictly above 0.5 -> 1.
y_pred = (model_1.predict(word_seq_test) > .5).astype(int)

In [None]:
# Flatten the predictions to one dimension and display them.
y_pred = y_pred.reshape(-1)
y_pred

In [None]:
# Flatten the true labels to one dimension and display them.
y_test = y_test.reshape(-1)
y_test

In [None]:
from sklearn import metrics

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and the support column counts predicted
# labels instead of true labels.
print(metrics.classification_report(y_test, y_pred))

In [None]:
from sklearn import metrics
import tensorflow as tf

# Cross-entropy is only meaningful on predicted probabilities. Computing it
# on thresholded 0/1 labels, as before, measures nothing useful. `y_pred` is
# also no longer overwritten with a TensorFlow tensor, so it stays a NumPy
# array of hard labels for the sklearn metrics and for later cells.
y_true_flat = np.ravel(y_test)
y_proba_flat = np.ravel(model_1.predict(word_seq_test))
bce = tf.keras.losses.binary_crossentropy(y_true_flat.astype("float32"), y_proba_flat)

print("model Loss:", float(bce))
print("model Accuracy:", metrics.accuracy_score(y_true_flat, y_pred))
print("model Precision:", metrics.precision_score(y_true_flat, y_pred))
print("model Recall:", metrics.recall_score(y_true_flat, y_pred))
print("model F1-score:", metrics.f1_score(y_true_flat, y_pred))

In [None]:
from sklearn.metrics import roc_curve, auc
# Predicted probabilities from the CNN, then the ROC curve points and the
# area under that curve.
y_pred_proba = model_1.predict(word_seq_test)
roc_points = roc_curve(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_points
roc_auc = auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve points and the area under the curve for the CNN probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# ROC curve with the chance diagonal drawn as a dashed reference line.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

# Report the AUC as a number as well.
print("Area Under the Curve (AUC):", roc_auc)

In [None]:
# Labels with one trailing axis added, shared by all four evaluations.
y_test_expanded = tf.expand_dims(y_test, axis=-1)

# Test-set loss and accuracy for each architecture, in training order.
loss_cnn, accuracy_cnn = model_1.evaluate(word_seq_test, y_test_expanded)
print("CNN - Loss: {}, Accuracy: {}".format(loss_cnn, accuracy_cnn))

loss_lstm, accuracy_lstm = model_2.evaluate(word_seq_test, y_test_expanded)
print("LSTM - Loss: {}, Accuracy: {}".format(loss_lstm, accuracy_lstm))

loss_bilstm, accuracy_bilstm = model_3.evaluate(word_seq_test, y_test_expanded)
print("Bi-LSTM - Loss: {}, Accuracy: {}".format(loss_bilstm, accuracy_bilstm))

loss_gru, accuracy_gru = model_4.evaluate(word_seq_test, y_test_expanded)
print("GRU - Loss: {}, Accuracy: {}".format(loss_gru, accuracy_gru))

In [None]:
# One row per model: name, test loss, test accuracy.
results = pd.DataFrame(
    [
        ('CNN', loss_cnn, accuracy_cnn),
        ('LSTM', loss_lstm, accuracy_lstm),
        ('Bi-LSTM', loss_bilstm, accuracy_bilstm),
        ('GRU', loss_gru, accuracy_gru),
    ],
    columns=['Model', 'Loss', 'Accuracy'],
)

print(results)

In [None]:
import matplotlib.pyplot as plt

# Test accuracy per model.
fig, ax = plt.subplots()
ax.bar(results['Model'], results['Accuracy'])
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy Comparison of Models')
plt.show()

# Test loss per model, in a separate figure.
fig, ax = plt.subplots()
ax.bar(results['Model'], results['Loss'])
ax.set_xlabel('Model')
ax.set_ylabel('Loss')
ax.set_title('Loss Comparison of Models')
plt.show()

In [None]:
import numpy as np

# Width of each bar; the two bars of a model sit side by side.
barWidth = 0.25

# Bar positions: accuracy at r, loss at r + barWidth.
r1 = np.arange(len(results['Accuracy']))
r2 = [x + barWidth for x in r1]

# Draw the accuracy and loss bars for every model.
plt.bar(r1, results['Accuracy'], color='blue', width=barWidth, edgecolor='white', label='Accuracy')
plt.bar(r2, results['Loss'], color='red', width=barWidth, edgecolor='white', label='Loss')

# Bars are centred on their x position, so a pair spans
# [r - barWidth/2, r + 3*barWidth/2] and its centre is r + barWidth/2.
# The tick was previously placed at r + barWidth, i.e. under the loss bar.
plt.xlabel('Model', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(results['Accuracy']))], results['Model'])

# Legend, title and display.
plt.legend()
plt.title('Accuracy and Loss Comparison of Models')
plt.show()

In [None]:
# Recompute the CNN's hard 0/1 predictions (probability > 0.5) as a flat array.
cnn_probabilities = model_1.predict(word_seq_test)
y_pred = (cnn_probabilities > 0.5).astype(int).ravel()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the CNN's hard predictions against the true labels.
class_labels = ['Class 0', 'Class 1']
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(display_labels=class_labels, confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
new_text = ["ሊፍት ልስጥህ"]

# Apply the same `\w+` word extraction that the training documents went
# through before the Keras tokenizer saw them. Without it, a word with
# punctuation attached that the Keras tokenizer does not strip by itself
# would not match any vocabulary entry and would be dropped.
# NOTE(review): this is expected to matter for Ethiopic punctuation such as
# "።", which is not in the Keras default filter list — confirm.
word_extractor = RegexpTokenizer(r'\w+')
new_text_clean = [" ".join(word_extractor.tokenize(doc)) for doc in new_text]

# Map words to ids and pad to the length the models were trained with.
new_text_tokens = tokenizer.texts_to_sequences(new_text_clean)
new_text_padded = sequence.pad_sequences(new_text_tokens, maxlen=max_seq_len, padding='post')

In [None]:
# Run the CNN on the padded new text and print its raw sigmoid outputs.
predictions = model_1.predict(new_text_padded)
print(predictions)

In [None]:
# Threshold the probabilities at 0.5 to get hard 0/1 labels.
predicted_labels = np.where(predictions > 0.5, 1, 0)
print(predicted_labels)