# Step 1: Train the BiLSTM NER model

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to read the dataset
def read_data(file_path):
    """Read a whitespace-separated ``word tag`` corpus into sentences.

    Every non-blank line must hold exactly two whitespace-separated fields:
    a word followed by its tag. One or more blank lines end the current
    sentence. A final sentence that is not followed by a blank line is
    still returned.

    Args:
        file_path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
            The message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise become part of the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_no, raw_line in enumerate(f, start=1):
            fields = raw_line.split()
            if not fields:
                # Blank line: close the sentence collected so far (if any).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}, line {line_no}: expected '<word> <tag>' "
                    f"but found {len(fields)} field(s): {raw_line.strip()!r}"
                )
            word, tag = fields
            sentence.append((word, tag))
    if sentence:  # The file did not end with a blank line.
        sentences.append(sentence)
    return sentences

# Read the dataset
sentences = read_data('wlina_bd.txt')

# Build the vocabularies. They are sorted so that the word/tag -> index
# mapping is the same on every run: the iteration order of a set of strings
# can differ between interpreter sessions, and a model saved in one session
# would then be fed differently numbered words in the next.
words = sorted({word for sentence in sentences for word, _ in sentence})
tags = sorted({tag for sentence in sentences for _, tag in sentence})

# "ENDPAD" takes the last word index and serves as the padding value.
words.append("ENDPAD")
n_words = len(words)
n_tags = len(tags)

# Encode the words and tags
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

# Prepare the data for the model: every sentence becomes a fixed-length row of
# indices. Words are padded with the "ENDPAD" index, tags with the "O" index.
# NOTE(review): sentences longer than max_len are cut by pad_sequences; its
# default truncation side is "pre" (tokens are dropped from the start) --
# confirm this is intended for this corpus.
max_len = 50  # Adjust as needed
X = [[word2idx[word] for word, _ in sentence] for sentence in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)

y = [[tag2idx[tag] for _, tag in sentence] for sentence in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

# Split into training and test sets (10% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Build the BiLSTM model without CRF
# Seed the Python, NumPy and TensorFlow random generators so that repeated
# runs of the notebook start from the same initial weights.
tf.keras.utils.set_random_seed(42)

# Each stage gets its own name; the original bound the first tensor to `input`,
# which shadowed the builtin for the rest of the kernel session.
token_ids = Input(shape=(max_len,))
embedded = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(token_ids)
embedded = Dropout(0.1)(embedded)
encoded = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embedded)
# One softmax over the n_tags classes per time step.
tag_probs = TimeDistributed(Dense(n_tags, activation="softmax"))(encoded)

model = Model(token_ids, tag_probs)

# Compile the model with sparse categorical crossentropy loss
# (the targets are integer tag indices, not one-hot vectors).
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit on the training split, holding out 10% of it for validation.
# The integer tag arrays get a trailing axis of size 1 before being passed in.
# NOTE(review): padded positions carry the "O" label and no mask is applied,
# so the reported accuracy presumably includes padding -- confirm before
# quoting it as a per-token score.
y_train_sparse = y_train[..., np.newaxis]
history = model.fit(
    X_train,
    y_train_sparse,
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    verbose=1,
)

# Score the held-out test split and report loss and accuracy.
y_test_sparse = y_test[..., np.newaxis]
test_loss, test_accuracy = model.evaluate(X_test, y_test_sparse)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.052900832146406174
Test Accuracy: 0.9892509579658508


  saving_api.save_model(


# Step 2: Save the BiLSTM model (no CRF layer)

In [13]:
# Persist the trained model in both formats that the following cells inspect:
# the native Keras archive and a legacy HDF5 file. Without the second call the
# HDF5 inspection cell depends on a file that no cell in this notebook writes.
model.save("bilstm_ner_model.keras")
model.save("bilstm_ner_model.h5")

In [15]:
import h5py

# Open the saved HDF5 (.h5) file read-only and list its top-level entries
with h5py.File('bilstm_ner_model.h5', mode='r') as archive:
    print("HDF5 file contents:")
    for entry_name in archive.keys():
        print(entry_name)

HDF5 file contents:
model_weights
optimizer_weights


In [16]:
from tensorflow.keras.models import load_model

# Reload the saved Keras (.keras) file and print its layer summary
keras_model_path = 'bilstm_ner_model.keras'
model_keras = load_model(keras_model_path)
print("Keras model summary:")
model_keras.summary()

Keras model summary:
Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 50)]              0         
                                                                 
 embedding_11 (Embedding)    (None, 50, 50)            616750    
                                                                 
 dropout_11 (Dropout)        (None, 50, 50)            0         
                                                                 
 bidirectional_11 (Bidirect  (None, 50, 200)           120800    
 ional)                                                          
                                                                 
 time_distributed_11 (TimeD  (None, 50, 11)            2211      
 istributed)                                                     
                                                                 
Total params: 739761 (2.82 MB)
Traina

# Test the saved Keras model

In [19]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model
model = load_model('bilstm_ner_model.keras')

# Kurdish Sorani text example (replace this with your actual text)
kurdish_text = "مەهدی ئۆزدەمیر رایگەیاند لە هەرێمی کوردستان و بەریتانیا و ئێران و تورکیا پاشان پارتی دیموکراتی کوردستان"

# Preprocess the input text.
# The tokens and the padded input use their own names (`tokens`, `X_input`) so
# that the vocabulary list `words` and the held-out `X_test` created by the
# training cell are not overwritten.
tokens = kurdish_text.split()  # Simple split by whitespace
pad_idx = word2idx["ENDPAD"]
# NOTE: the vocabulary has no dedicated unknown-word entry, so a token that was
# never seen in training is fed to the model as the padding index.
token_ids = [word2idx.get(token, pad_idx) for token in tokens]
# Pad/truncate to the training length. Truncating at the end ("post") keeps
# position i of the prediction aligned with tokens[i] when the text is longer
# than max_len.
X_input = pad_sequences([token_ids], maxlen=max_len, padding="post", truncating="post", value=pad_idx)

# Predict the tags: take the most probable tag index at every position
y_pred = model.predict(X_input)
predicted_ids = np.argmax(y_pred, axis=-1)[0]

# Convert indices to tags with a reverse lookup table built once
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
predicted_tags = [idx2tag[int(idx)] for idx in predicted_ids]

# Print the results (zip stops at the last real token, so padding is skipped)
for word, tag in zip(tokens, predicted_tags):
    print(f"{word}: {tag}")

مەهدی: B-PER
ئۆزدەمیر: I-PER
رایگەیاند: O
لە: O
هەرێمی: B-LOC
کوردستان: I-LOC
و: O
بەریتانیا: B-LOC
و: O
ئێران: B-LOC
و: O
تورکیا: B-LOC
پاشان: O
پارتی: B-ORG
دیموکراتی: I-ORG
کوردستان: I-ORG
