In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/giga-tech/data.tsv


In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [14]:
# Load the dataset
file_path = '/kaggle/input/giga-tech/data.tsv'
data = pd.read_csv(file_path, delimiter='\t', header=None, names=['word', 'pos', 'ner'])

print(data.head())


                                                word    pos    ner
0  শনিবার (২৭ আগস্ট) রাতে পটুয়াখালী সদর থানার ভা...    NaN    NaN
1                                             শনিবার    NNP  B-D&T
2                                                (২৭  PUNCT  B-OTH
3                                             আগস্ট)    NNP  B-D&T
4                                               রাতে    NNC  B-D&T


In [15]:
# Handle missing values
data = data.dropna()

# Reset the index
data = data.reset_index(drop=True)

# Tokenize the text
data['word'] = data['word'].apply(lambda x: x.split())

print(data.head())

           word    pos    ner
0      [শনিবার]    NNP  B-D&T
1         [(২৭]  PUNCT  B-OTH
2      [আগস্ট)]    NNP  B-D&T
3        [রাতে]    NNC  B-D&T
4  [পটুয়াখালী]    NNP  B-GPE


In [16]:
# Label encode POS and NER tags
pos_encoder = LabelEncoder()
ner_encoder = LabelEncoder()

data['pos'] = pos_encoder.fit_transform(data['pos'])
data['ner'] = ner_encoder.fit_transform(data['ner'])

print(data.head())



           word  pos  ner
0      [শনিবার]    6    0
1         [(২৭]   11    7
2      [আগস্ট)]    6    0
3        [রাতে]    5    0
4  [পটুয়াখালী]    6    2


In [17]:
# Tokenizer for converting words to numerical indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['word'])
word_index = tokenizer.word_index

# Convert words to sequences
data['word'] = tokenizer.texts_to_sequences(data['word'])

print(data.head())



     word  pos  ner
0   [192]    6    0
1  [1735]   11    7
2  [1125]    6    0
3   [501]    5    0
4  [1736]    6    2


In [18]:
# Determine the maximum sequence length
MAX_SEQUENCE_LENGTH = max(data['word'].apply(lambda x: len(x)))

print(MAX_SEQUENCE_LENGTH)




31


In [19]:
# Pad word sequences
X = pad_sequences(data['word'], maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print(X)



[[  192     0     0 ...     0     0     0]
 [ 1735     0     0 ...     0     0     0]
 [ 1125     0     0 ...     0     0     0]
 ...
 [  485     0     0 ...     0     0     0]
 [16262     0     0 ...     0     0     0]
 [   33     0     0 ...     0     0     0]]


In [20]:
# Pad word sequences
X = pad_sequences(data['word'], maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Convert POS and NER sequences to numpy arrays
pos_array = np.array(data['pos']).reshape(-1, 1)
ner_array = np.array(data['ner']).reshape(-1, 1)

# Pad POS and NER sequences
y_pos = pad_sequences(pos_array, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
y_ner = pad_sequences(ner_array, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# One-hot encode the labels
y_pos = to_categorical(y_pos, num_classes=len(pos_encoder.classes_))
y_ner = to_categorical(y_ner, num_classes=len(ner_encoder.classes_))

# Split the data
X_train, X_temp, y_train_pos, y_temp_pos, y_train_ner, y_temp_ner = train_test_split(X, y_pos, y_ner, test_size=0.3, random_state=42)
X_val, X_test, y_val_pos, y_test_pos, y_val_ner, y_test_ner = train_test_split(X_temp, y_temp_pos, y_temp_ner, test_size=0.5, random_state=42)




# Define model parameters
input_dim = len(word_index) + 1  # Vocabulary size + 1 for padding
output_dim_pos = y_train_pos.shape[-1]  # Number of POS tag categories
output_dim_ner = y_train_ner.shape[-1]  # Number of NER tag categories
input_length = MAX_SEQUENCE_LENGTH

# Input layer
input_layer = Input(shape=(input_length,))

# Embedding layer
embedding_layer = Embedding(input_dim=input_dim, output_dim=128, input_length=input_length)(input_layer)

# BiLSTM layer
lstm_layer = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.2))(embedding_layer)

# POS tagging output
pos_output = TimeDistributed(Dense(output_dim_pos, activation='softmax'))(lstm_layer)

# NER tagging output
ner_output = TimeDistributed(Dense(output_dim_ner, activation='softmax'))(lstm_layer)

# Define the model
model = Model(inputs=input_layer, outputs=[pos_output, ner_output])

# Compile the model
model.compile(optimizer='adam', 
              loss=['categorical_crossentropy', 'categorical_crossentropy'], 
              metrics=[['accuracy'], ['accuracy']])

# Print model summary
model.summary()

# Train the model
history = model.fit(
    X_train, 
    [y_train_pos, y_train_ner], 
    validation_data=(X_val, [y_val_pos, y_val_ner]),
    epochs=10, 
    batch_size=32, 
    verbose=1
)

# Evaluate the model
scores = model.evaluate(X_test, [y_test_pos, y_test_ner], verbose=1)

# Print the scores to see what it contains
print("Scores:", scores)
print("Length of scores:", len(scores))

# Print results
print(f'Overall Loss: {scores[0]:.4f}')
print(f'POS Tagging Accuracy: {scores[1]:.4f}')
print(f'NER Tagging Accuracy: {scores[2]:.4f}')




Epoch 1/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 45ms/step - loss: 0.3230 - time_distributed_6_accuracy: 0.9708 - time_distributed_7_accuracy: 0.9792 - val_loss: 0.0777 - val_time_distributed_6_accuracy: 0.9874 - val_time_distributed_7_accuracy: 0.9900
Epoch 2/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 43ms/step - loss: 0.0571 - time_distributed_6_accuracy: 0.9925 - time_distributed_7_accuracy: 0.9915 - val_loss: 0.0531 - val_time_distributed_6_accuracy: 0.9923 - val_time_distributed_7_accuracy: 0.9929
Epoch 3/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 43ms/step - loss: 0.0264 - time_distributed_6_accuracy: 0.9978 - time_distributed_7_accuracy: 0.9954 - val_loss: 0.0512 - val_time_distributed_6_accuracy: 0.9928 - val_time_distributed_7_accuracy: 0.9938
Epoch 4/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 42ms/step - loss: 0.0175 - time_distributed_6_accuracy: 0.9983 - time_distri

In [21]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the model
scores = model.evaluate(X_test, [y_test_pos, y_test_ner], verbose=1)

# Print the scores to see what it contains
print("Scores:", scores)
print("Length of scores:", len(scores))

# Extract individual losses and metrics
loss_pos = scores[0]  # Loss for POS tagging
accuracy_pos = scores[1]  # Accuracy for POS tagging
accuracy_ner = scores[2]  # Accuracy for NER tagging

print(f'Overall Loss: {loss_pos:.4f}')
print(f'POS Tagging Accuracy: {accuracy_pos:.4f}')
print(f'NER Tagging Accuracy: {accuracy_ner:.4f}')

# Step 1: Get model predictions
y_pred = model.predict(X_test)
y_pred_pos, y_pred_ner = y_pred

# Step 2: Convert predictions to class labels
# For POS tagging
y_pred_pos_labels = np.argmax(y_pred_pos, axis=-1)
y_test_pos_labels = np.argmax(y_test_pos, axis=-1)

# For NER tagging
y_pred_ner_labels = np.argmax(y_pred_ner, axis=-1)
y_test_ner_labels = np.argmax(y_test_ner, axis=-1)

# Flatten arrays for metric calculation
y_pred_pos_labels_flattened = y_pred_pos_labels.flatten()
y_test_pos_labels_flattened = y_test_pos_labels.flatten()
y_pred_ner_labels_flattened = y_pred_ner_labels.flatten()
y_test_ner_labels_flattened = y_test_ner_labels.flatten()

# Step 3: Compute metrics for POS tagging
precision_pos = precision_score(y_test_pos_labels_flattened, y_pred_pos_labels_flattened, average='weighted')
recall_pos = recall_score(y_test_pos_labels_flattened, y_pred_pos_labels_flattened, average='weighted')
f1_pos = f1_score(y_test_pos_labels_flattened, y_pred_pos_labels_flattened, average='weighted')

# Compute metrics for NER tagging
precision_ner = precision_score(y_test_ner_labels_flattened, y_pred_ner_labels_flattened, average='weighted')
recall_ner = recall_score(y_test_ner_labels_flattened, y_pred_ner_labels_flattened, average='weighted')
f1_ner = f1_score(y_test_ner_labels_flattened, y_pred_ner_labels_flattened, average='weighted')

# Print metrics
print(f'POS Tagging - Precision: {precision_pos:.4f}, Recall: {recall_pos:.4f}, F1 Score: {f1_pos:.4f}')
print(f'NER Tagging - Precision: {precision_ner:.4f}, Recall: {recall_ner:.4f}, F1 Score: {f1_ner:.4f}')


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0585 - time_distributed_6_accuracy: 0.9920 - time_distributed_7_accuracy: 0.9943
Scores: [0.05945611000061035, 0.9920052886009216, 0.994273841381073]
Length of scores: 3
Overall Loss: 0.0595
POS Tagging Accuracy: 0.9920
NER Tagging Accuracy: 0.9943
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


POS Tagging - Precision: 0.9932, Recall: 0.9920, F1 Score: 0.9923
NER Tagging - Precision: 0.9939, Recall: 0.9943, F1 Score: 0.9936


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Save the model
model.save('pos_ner_model.h5')
print('Model saved as pos_ner_model.h5')


Model saved as pos_ner_model.h5


In [23]:
from keras.models import load_model
import numpy as np

# Load the saved model
model = load_model('pos_ner_model.h5')
print('Model loaded from pos_ner_model.h5')




Model loaded from pos_ner_model.h5


In [24]:
# -------------------------------------------
# Test the model with a new Bangla sentence
# -------------------------------------------

# Replace this with your new Bangla sentence
new_sentence = "এটি একটি নতুন বাক্য"  # Example sentence

# Step 1: Tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence.split()])

# Step 2: Pad the sequence
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Step 3: Predict POS and NER tags
predictions = model.predict(new_sentence_padded)

# Step 4: Decode predictions to original labels
predicted_pos_labels = np.argmax(predictions[0], axis=-1)
predicted_ner_labels = np.argmax(predictions[1], axis=-1)

# Step 5: Convert indices back to the original tags
predicted_pos_tags = pos_encoder.inverse_transform(predicted_pos_labels[0])
predicted_ner_tags = ner_encoder.inverse_transform(predicted_ner_labels[0])

# Print the results
print("\nPredicted POS and NER tags for the new sentence:")
for word, pos, ner in zip(new_sentence.split(), predicted_pos_tags, predicted_ner_tags):
    print(f'Word: {word}, POS: {pos}, NER: {ner}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 945ms/step

Predicted POS and NER tags for the new sentence:
Word: এটি, POS: PRO, NER: B-OTH
Word: একটি, POS: QF, NER: B-NUM
Word: নতুন, POS: ADJ, NER: B-OTH
Word: বাক্য, POS: ADJ, NER: B-D&T


In [25]:
!pip freeze > requirements.txt


In [26]:
!pip freeze | grep pandas
!pip freeze | grep scikit-learn
!pip freeze | grep numpy
!pip freeze | grep tensorflow


geopandas==0.14.4
pandas==2.2.2
pandas-datareader==0.10.0
pandas-profiling==3.6.6
pandas-summary==0.2.0
pandasql==0.7.3
sklearn-pandas==2.2.0
scikit-learn==1.2.2
scikit-learn-intelex==2024.5.0
numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1707225380409/work/dist/numpy-1.26.4-cp310-cp310-linux_x86_64.whl#sha256=51131fd8fc130cd168aecaf1bc0ea85f92e8ffebf211772ceb16ac2e7f10d7ca
tensorflow==2.15.0
tensorflow-cloud==0.1.16
tensorflow-datasets==4.9.4
tensorflow-decision-forests==1.8.1
tensorflow-estimator==2.15.0
tensorflow-hub==0.16.1
tensorflow-io==0.35.0
tensorflow-io-gcs-filesystem==0.35.0
tensorflow-metadata==0.14.0
tensorflow-probability==0.23.0
tensorflow-serving-api==2.14.1
tensorflow-text==2.15.0
tensorflow-transform==0.14.0
tfp-nightly @ git+https://github.com/tensorflow/probability.git@fbc5ebe9b1d343113fb917010096cfd88b32eecf
