In [19]:
import fasttext
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

words = []
labels = []

# Load your dataset
f = open("./news- verified- final level.txt", "r", encoding='utf-8')
for x in f:
    if len(x.split(" ")) != 2:
        continue
    if x.split(" ")[0] in words:
        continue
    words.append(x.split(" ")[0].strip())
    labels.append(x.split(" ")[1].strip())

data = {'word': words,
        'tag': labels}

# Convert to DataFrame
df = pd.DataFrame(data)

Xtest = df['word'].values
X = []
y = df['tag'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Load FastText model
ft_model = fasttext.load_model('./cc.si.300.bin')

# Get embeddings for each word in the dataset
for word in Xtest:
    embedding = ft_model.get_word_vector(word)
    X.append(embedding)

# Convert to numpy arrays
X = np.array(X)
y = np.array(y_encoded)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the neural network model
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Add dropout to avoid overfitting
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))  # Add dropout to avoid overfitting
model.add(Dense(len(np.unique(y)), activation='softmax'))  # Output layer, one unit per class

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

# Predict the tags on the test data
y_pred = np.argmax(model.predict(X_test), axis=1)

# Print classification report
report = classification_report(y_test, y_pred)
print(report)

# Prediction function using the trained neural network model
def predict_tags(sentence):
    words = sentence.split()
    tags = []
    
    for word in words:
        # Get the embedding for each word in the sentence
        embedding = ft_model.get_word_vector(word)
        # Predict the tag using the neural network
        tag_encoded = np.argmax(model.predict(np.array([embedding])), axis=1)[0]
        tag = label_encoder.inverse_transform([tag_encoded])[0]
        tags.append((word, tag))
    
    return tags

# Example usage
sentence = "කිම් ජොං අන් ජනපති ට්‍රම්ප්ට ඉක්මන් සුවය පතයි"
predictions = predict_tags(sentence)
print(predictions)


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.5661 - loss: 1.6974 - val_accuracy: 0.6900 - val_loss: 1.1399
Epoch 2/20
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.6816 - loss: 1.1737 - val_accuracy: 0.6935 - val_loss: 1.1119
Epoch 3/20
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.6914 - loss: 1.1705 - val_accuracy: 0.6980 - val_loss: 1.1131
Epoch 4/20
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.6939 - loss: 1.1767 - val_accuracy: 0.6989 - val_loss: 1.1302
Epoch 5/20
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.6945 - loss: 1.2013 - val_accuracy: 0.7004 - val_loss: 1.1542
Epoch 6/20
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.6880 - loss: 1.2366 - val_accuracy: 0.6981 - val_loss: 1.1576
Epoch 7/20
[1m1655/1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[('කිම්', 'NNP'), ('ජොං', 'NNP'), ('අන්', 'DET'), ('ජනපති', 'NNC'), ('ට්\u200dරම්ප්ට', 'NNC'), ('ඉක්මන්', 'NNC'), ('සුවය', 'NNC'), ('පතයි', 'VFM')]
