In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset: one row per token; 'Sentence #' is only filled on the first
# token of each sentence, so forward-fill it to label every row.
train_data = pd.read_csv("/content/NER dataset.csv", encoding='latin1')
train_data['Sentence #'] = train_data['Sentence #'].ffill()
train_data['Word'] = train_data['Word'].astype(str)
train_data['Tag'] = train_data['Tag'].astype(str)

# Group by 'Sentence #'. Built-in aggregations are used instead of
# groupby.apply + pd.Series, which is far slower on many small groups.
by_sentence = train_data.groupby('Sentence #')
sentences = pd.DataFrame({
    'Sentence': by_sentence['Word'].agg(' '.join),
    'Tag': by_sentence['Tag'].agg(' '.join),
}).reset_index()

# Keep the per-sentence token and tag lists as well: they are the source of
# truth for alignment (exactly one tag per token), independent of any
# whitespace or punctuation inside a token.
word_lists = by_sentence['Word'].agg(list)
tag_lists = by_sentence['Tag'].agg(list)

# Create DataFrame with sentences and NER tags. The explicit copy makes the
# column assignment below operate on an independent frame rather than a slice.
train_df = sentences[['Sentence', 'Tag']].copy()

# Fit LabelEncoder on every tag plus a dedicated padding tag, so that padded
# positions never share an id with a real entity class.
PAD_TAG = '<PAD>'
label_encoder = LabelEncoder()
label_encoder.fit([PAD_TAG] + train_data['Tag'].unique().tolist())
# LabelEncoder sorts its classes; '<' sorts before the upper-case letters that
# start the B-/I-/O tags, so this is expected to be 0 for such tag sets.
pad_tag_id = int(label_encoder.transform([PAD_TAG])[0])

# Apply the label encoding to each sentence's tags
train_df['Tag_encoded'] = tag_lists.apply(label_encoder.transform).to_numpy()

# Tokenize. filters='' disables the default punctuation stripping, and feeding
# token lists (not strings) makes the Tokenizer treat each list element as one
# token, so every word maps to exactly one id (the OOV id when unknown).
words_limit = 25000
tokenizer = Tokenizer(num_words=words_limit, oov_token='<OOV>', filters='')
token_lists = word_lists.tolist()
tokenizer.fit_on_texts(token_lists)
train_seq = tokenizer.texts_to_sequences(token_lists)
max_seq_len = max(len(x) for x in train_seq)

# Guard the invariant the whole pipeline depends on.
assert all(
    len(ids) == len(tags)
    for ids, tags in zip(train_seq, train_df['Tag_encoded'])
), "token and tag sequences are misaligned"

# Pad the sequences: inputs with 0 (reserved by the Tokenizer), tags with the
# dedicated padding class.
train_padding = pad_sequences(train_seq, maxlen=max_seq_len, padding='post')
Tag_padding = pad_sequences(
    train_df['Tag_encoded'].to_list(),
    maxlen=max_seq_len,
    padding='post',
    value=pad_tag_id,
)

# One-hot encode the padded tag sequences (float32 halves the memory of the
# default float64 identity matrix).
num_classes = len(label_encoder.classes_)
train = np.eye(num_classes, dtype='float32')[Tag_padding]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_padding, train, test_size=0.2, random_state=42)

# Define the model: embedding -> two stacked BiLSTMs -> per-token softmax.
# - `input_length` is omitted: it is deprecated on Embedding and the sequence
#   length is inferred from the data on the first call to fit().
# - mask_zero=True marks token id 0 (padding) as masked, so downstream layers
#   and the loss/metrics skip padded timesteps instead of being dominated by
#   them.
model = Sequential([
    Embedding(input_dim=words_limit, output_dim=128, mask_zero=True),
    Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    BatchNormalization(),
    Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    BatchNormalization(),
    Dense(units=num_classes, activation='softmax'),
])

# Compile the model (targets are one-hot, hence categorical cross-entropy)
optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Add early stopping to prevent overfitting: stop after 3 epochs without a
# val_loss improvement and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
training = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_val, y_val),
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1,
)

# Print the model summary to check the architecture (the model is built by
# fit(), so the summary is printed after training).
model.summary()




Epoch 1/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 574ms/step - accuracy: 0.9273 - loss: 0.3078 - val_accuracy: 0.9603 - val_loss: 0.1234
Epoch 2/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 571ms/step - accuracy: 0.9637 - loss: 0.1120 - val_accuracy: 0.9661 - val_loss: 0.1021
Epoch 3/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 565ms/step - accuracy: 0.9682 - loss: 0.0921 - val_accuracy: 0.9679 - val_loss: 0.0961
Epoch 4/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 566ms/step - accuracy: 0.9710 - loss: 0.0817 - val_accuracy: 0.9676 - val_loss: 0.0963
Epoch 5/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 567ms/step - accuracy: 0.9736 - loss: 0.0734 - val_accuracy: 0.9690 - val_loss: 0.0956
Epoch 6/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 569ms/step - accuracy: 0.9753 - loss: 0.0674 - val_accuracy: 0.9687 - val_loss: 0.0971
Epoc

In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Predict on the validation data
val_predictions = model.predict(X_val)
val_predictions_classes = np.argmax(val_predictions, axis=-1)  # Convert predictions to class labels
y_val_classes = np.argmax(y_val, axis=-1)  # Convert one-hot encoded true labels to class labels

# Keep only real tokens. Padding is identified from the INPUT ids: the Keras
# Tokenizer reserves index 0 and never assigns it to a word, so X_val == 0
# marks padded positions. Masking on the label id instead would also discard
# whichever real class happens to be encoded as 0.
mask = (X_val != 0).flatten()
y_val_classes_flat = y_val_classes.flatten()[mask]
val_predictions_classes_flat = val_predictions_classes.flatten()[mask]

# Calculate metrics (zero_division=0 scores classes that were never predicted
# as 0 instead of emitting warnings)
accuracy = accuracy_score(y_val_classes_flat, val_predictions_classes_flat)
precision = precision_score(y_val_classes_flat, val_predictions_classes_flat, average='weighted', zero_division=0)
recall = recall_score(y_val_classes_flat, val_predictions_classes_flat, average='weighted', zero_division=0)
f1 = f1_score(y_val_classes_flat, val_predictions_classes_flat, average='weighted', zero_division=0)

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Detailed classification report (precision, recall, f1 per class).
# `labels` is passed explicitly so that `target_names` always lines up with the
# class ids that actually occur, even when some class is absent from this split.
report_labels = np.union1d(y_val_classes_flat, val_predictions_classes_flat)
report_names = np.asarray(label_encoder.classes_)[report_labels]
print(classification_report(
    y_val_classes_flat,
    val_predictions_classes_flat,
    labels=report_labels,
    target_names=report_names,
    zero_division=0,
))


In [None]:
def ner_predict(input_text, model, tokenizer, label_encoder, max_seq_len):
    """Tag each whitespace-separated word of ``input_text`` with a predicted label.

    Args:
        input_text: Raw sentence; it is split on whitespace into words.
        model: Object whose ``predict`` takes a ``(1, max_seq_len)`` array of
            token ids and returns per-position class scores.
        tokenizer: Object exposing ``texts_to_sequences``, ``word_index`` and
            (optionally) ``oov_token``.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to tag names.
        max_seq_len: Length the id sequence is padded or truncated to.

    Returns:
        dict mapping word -> predicted tag. Because the result is a dict, a
        word that occurs more than once keeps the tag of its last occurrence.
        Words beyond ``max_seq_len`` are not tagged. Empty input returns ``{}``.
    """
    word_list = input_text.split()
    if not word_list:
        return {}

    # Tokenize word by word so there is exactly one id per word. Tokenizing the
    # whole string lets the tokenizer drop or split words (e.g. a lone
    # punctuation mark), which shifts every following tag onto the wrong word.
    oov_token = getattr(tokenizer, 'oov_token', None)
    fallback_id = tokenizer.word_index.get(oov_token, 0) if oov_token is not None else 0
    token_ids = [
        ids[0] if len(ids) else fallback_id  # first sub-token, or OOV if the word vanished
        for ids in tokenizer.texts_to_sequences(word_list)
    ]

    # Post-pad with zeros and truncate at the END, so position i of the model
    # input always corresponds to word i.
    kept = min(len(token_ids), max_seq_len)
    input_padding = np.zeros((1, max_seq_len), dtype='int32')
    input_padding[0, :kept] = token_ids[:kept]

    # Get the prediction from the model
    predictions = model.predict(input_padding)

    # Decode only the positions that hold real words
    predicted_classes = np.argmax(predictions, axis=-1)
    predicted_tags = label_encoder.inverse_transform(predicted_classes[0][:kept])

    # Map the predictions back to the words in the input (a literal '<OOV>'
    # word in the input is skipped, as before)
    prediction_dict = {word: tag for word, tag in zip(word_list, predicted_tags) if word != '<OOV>'}

    return prediction_dict

# Example usage: tag one sample sentence with the trained model and show the
# resulting word -> tag mapping.
input_text = "Barack Obama was born in Hawaii."
result = ner_predict(
    input_text,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    max_seq_len=max_seq_len,
)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 766ms/step
{'Barack': 'B-per', 'Obama': 'I-per', 'was': 'O', 'born': 'O', 'in': 'O', 'Hawaii.': 'B-geo'}
