In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
!pip install wikipedia
import numpy as np
import random
import wikipedia
import nltk
import re
from gensim.models import KeyedVectors  # for GloVe
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import string
from collections import Counter

# Set the language of Wikipedia to English
wikipedia.set_lang("en")

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Preprocessing Steps
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Define the topic
topic = "The solar system"

# Fetch Wikipedia page for the topic
try:
    page = wikipedia.page(topic)
    content = page.content
    # Preprocess content
    content_tokens = preprocess_text(content)
except wikipedia.exceptions.PageError:
    print("Wikipedia page for the given topic is not found.")
    exit(1)

# Splitting content into 5 documents
num_documents = 5
doc_length = len(content_tokens) // num_documents
# Slices the content_tokens list to extract tokens corresponding to the current document
all_docs_tokens = [content_tokens[i * doc_length: (i + 1) * doc_length] for i in range(num_documents)]

# GloVe embeddings -> pre-trained word vectors that capture semantic relationships between words

# Load GloVe embeddings for words embeddings
glove_file = '/content/drive/MyDrive/NLP(RNN)/glove.6B.100d.txt'
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
embedding_dim = glove_model.vector_size    # retrieves the dimensionality of the word embeddings (size of them)

# Define sequence lengths for training
char_sequence_length = 30
word_sequence_length = 30

# Create a set of unique characters from the content (to ensure that each character appears only once in the set)
chars = sorted(set([char.lower() for char in content if char in string.printable]))

# Filter out rare words to reduce the vocabulary size and they are noise in the data and can lead to overfitting
word_counts = nltk.FreqDist(word for doc in all_docs_tokens for word in doc)
filtered_words = [word for word, count in word_counts.items() if count > 5]

print(filtered_words)

# Prepare input data for word-based model
X_word = []
y_word = []
for doc_tokens in all_docs_tokens:
    for i in range(len(doc_tokens) - word_sequence_length):
        seq_in = doc_tokens[i:i + word_sequence_length]
        seq_out = doc_tokens[i + word_sequence_length]
        if seq_out in filtered_words:
            X_word.append([glove_model[word] if word in glove_model else np.zeros(embedding_dim) for word in seq_in])
            y_word.append(to_categorical(filtered_words.index(seq_out), num_classes=len(filtered_words)))

X_word = np.array(X_word)
y_word = np.array(y_word)

# Build the word-based model with 4 SimpleRNN layers
model_word = Sequential([
    SimpleRNN(512, input_shape=(word_sequence_length, embedding_dim), return_sequences=True),
    Dropout(0.3),
    SimpleRNN(512, return_sequences=True),
    Dropout(0.3),
    SimpleRNN(512, return_sequences=True),
    Dropout(0.3),
    SimpleRNN(512),
    Dropout(0.3),
    Dense(len(filtered_words), activation='softmax')
])
model_word.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Train the word-based model
model_word.fit(X_word, y_word, epochs=100, batch_size=256)

# Prepare input data for character-based model
X_char = []
y_char = []
for i in range(len(content) - char_sequence_length):
    seq_in = content[i:i + char_sequence_length]
    seq_out = content[i + char_sequence_length]
    if seq_out in chars:  # Ensure the output character is in the list of characters
        X_char.append([glove_model[char] if char in glove_model else np.zeros(embedding_dim) for char in seq_in])
        y_char.append(to_categorical(chars.index(seq_out), num_classes=len(chars)))

X_char = np.array(X_char)
y_char = np.array(y_char)

# Build the character-based model with 4 SimpleRNN layers
model_char = Sequential([
    SimpleRNN(512, input_shape=(char_sequence_length, embedding_dim), return_sequences=True),
    Dropout(0.3),  # prevent overfitting
    SimpleRNN(512, return_sequences=True),
    Dropout(0.3),
    SimpleRNN(512, return_sequences=True),
    Dropout(0.3),
    SimpleRNN(512),
    Dropout(0.3),
    Dense(len(chars), activation='softmax')
])
model_char.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Train the character-based model
model_char.fit(X_char, y_char, epochs=100, batch_size=256)

# Temperature parameter -> controls the randomness or diversity of the generated text
# Text generation function with temperature parameter for character
def generate_text_char(seed_text, num_chars, temperature=1.0):  # num_chars -> the number of characters to generate
    generated_text = seed_text
    for _ in range(num_chars):
        x_pred = np.zeros((1, char_sequence_length, embedding_dim))  # x_pred holds the input data for prediction(1 -> represent a single input sequence)
        # Take a fixed-size window of characters from seed_text
        for t, char in enumerate(seed_text[-char_sequence_length:]):
            if char in glove_model:
                x_pred[0, t] = glove_model[char]
        preds = model_char.predict(x_pred)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        generated_text += next_char
        seed_text = seed_text[1:] + next_char  # Move the window by one character
    return generated_text

# Text generation function with temperature parameter for word
def generate_text_word(seed_text, num_words, temperature=1.0):
    generated_text = seed_text
    for _ in range(num_words):
        x_pred = np.zeros((1, word_sequence_length, embedding_dim))
        seed_text_split = seed_text.split()[-word_sequence_length:]
        for t, word in enumerate(seed_text_split):
            if word in glove_model and word in filtered_words:
                x_pred[0, t] = glove_model[word]
        preds = model_word.predict(x_pred)[0]
        next_index = sample(preds, temperature)
        next_word = filtered_words[next_index]
        generated_text += " " + next_word
        seed_text = ' '.join(seed_text.split()[1:]) + " " + next_word
    return generated_text

# Function to select the next character or word based on the predicted probabilities generated by the model
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Generate text using character-based model for at least 10 iterations
initial_char_seed = content[:char_sequence_length]
generated_text_char = generate_text_char(initial_char_seed, 10, temperature=0.5)

# Generate text using word-based model for at least 10 iterations
initial_word_seed = ' '.join(content_tokens[:word_sequence_length])
generated_text_word = generate_text_word(initial_word_seed, 10, temperature=0.5)

print("Generated text using character-based model:")
print(generated_text_char)

print("\nGenerated text using word-based model:")
print(generated_text_word)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['solar', 'system', 'gravitationally', 'sun', 'object', 'orbit', 'formed', 'billion', 'year', 'region', 'cloud', 'disc', 'star', 'fusion', 'hydrogen', 'helium', 'core', 'outer', 'largest', 'planet', 'four', 'terrestrial', 'mercury', 'venus', 'earth', 'mar', 'two', 'gas', 'giant', 'jupiter', 'saturn', 'ice', 'uranus', 'neptune', 'solid', 'surface', 'mainly', 'composed', 'mass', 'strong', 'astronomer', 'dwarf', 'pluto', 'makemake', 'small', 'body', 'asteroid', 'comet', 'centaur', 'meteoroid', 'dust', 'belt', 'kuiper', 'orbiting', 'natural', 'satellite', 'called', 'moon', 'particle', 'wind', 'heliosphere', 'around', 'interstellar', 'space', 'theorized', 'oort', 'source', 'radius', 'lightyears', 'closest', 'centauri', 'au', 'away', 'milky', 'way', 'gravitational', 'within', 'large', 'likely', 'one', 'mostly', 'heavier', 'element', 'center', 'diameter', 'roughly', 'km', 'mi', 'form', 'larger', 'may', 'ejected', 'due', 'point', 'metal', 'silicate', 'could', 'inner', 'close', 'would', 'beyond