<a href="https://colab.research.google.com/github/benasphy/n-gram/blob/main/n-gram%20with%20Neural%20Nets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
import requests
from nltk.util import ngrams
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import math
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Download necessary datasets
nltk.download('punkt')
nltk.download('stopwords')

# Download the 'punkt_tab' resource
nltk.download('punkt_tab')

# Load corpus from URL
url = "https://drive.google.com/uc?export=download&id=1WnuCIeZglWU0uty8-uMOBwOWQM-zAHBJ"
response = requests.get(url)
corpus = response.text

# Tokenization
words = nltk.word_tokenize(corpus)

# Function to generate n-grams
def generate_ngrams(words, n):
    return list(ngrams(words, n))

# Generate n-grams
unigrams = generate_ngrams(words, 1)
bigrams = generate_ngrams(words, 2)
trigrams = generate_ngrams(words, 3)
fourgrams = generate_ngrams(words, 4)

# Compute probabilities
def compute_ngram_probabilities(ngrams_list):
    counts = Counter(ngrams_list)
    total_count = sum(counts.values())
    probabilities = {gram: count / total_count for gram, count in counts.items()}
    return probabilities

unigram_probs = compute_ngram_probabilities(unigrams)
bigram_probs = compute_ngram_probabilities(bigrams)
trigram_probs = compute_ngram_probabilities(trigrams)
fourgram_probs = compute_ngram_probabilities(fourgrams)

# Remove stopwords and recompute
amharic_stopwords = {"እና", "እስከ", "እዚህ", "ላይ", "ለ", "በ", "ከ", "የ", "ውስጥ"}  # Add more stopwords
filtered_words = [word for word in words if word not in amharic_stopwords]
filtered_unigrams = generate_ngrams(filtered_words, 1)
filtered_bigrams = generate_ngrams(filtered_words, 2)
filtered_trigrams = generate_ngrams(filtered_words, 3)
filtered_fourgrams = generate_ngrams(filtered_words, 4)

# Neural Network Model for Text Generation
# Tokenization for deep learning
corpus_sentences = corpus.split(". ")  # Split into sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_sentences)
total_words = len(tokenizer.word_index) + 1

# Prepare sequences for training sequences
input_sequences = []
for sentence in corpus_sentences:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Define LSTM Model
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_length-1),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=20, verbose=1)

# Generate text using the trained model
def generate_text(seed_text, next_words=10):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        next_word = tokenizer.index_word[predicted.argmax()] if predicted.argmax() in tokenizer.index_word else ""
        seed_text += " " + next_word
    return seed_text

print("Generated Text:", generate_text("ኢትዮጵያ"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Epoch 1/20




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 433ms/step - accuracy: 0.0096 - loss: 5.0723
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 397ms/step - accuracy: 0.0821 - loss: 4.9643
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 399ms/step - accuracy: 0.0691 - loss: 4.6828
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 402ms/step - accuracy: 0.0783 - loss: 4.6095
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 400ms/step - accuracy: 0.0672 - loss: 4.5738
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 447ms/step - accuracy: 0.0859 - loss: 4.4358
Epoch 7/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 441ms/step - accuracy: 0.0912 - loss: 4.4366
Epoch 8/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 405ms/step - accuracy: 0.0753 - loss: 4.4039
Epoch 9/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━

In [1]:
# Install necessary libraries
!pip install nltk wordcloud tensorflow keras requests

