# **Attention based Models and  Transfer Learning**

1. How to implement a simple text classification model using LSTM in Keras

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# 1. Load Dataset
# Example dataset: You can replace this with any text classification dataset.
data = {
    "text": [
        "I love this product, it's amazing!",
        "This is the worst item I have ever bought.",
        "Absolutely fantastic experience, highly recommend!",
        "Not worth the money, very disappointed.",
        "Best purchase I've made this year!",
    ],
    "label": ["positive", "negative", "positive", "negative", "positive"],
}
df = pd.DataFrame(data)

# 2. Preprocess Data
# Tokenize text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=50)

# Encode labels
encoder = LabelBinarizer()
y = encoder.fit_transform(df['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=50),  # Embedding layer
    LSTM(128, return_sequences=False),  # LSTM layer
    Dropout(0.5),                       # Dropout for regularization
    Dense(64, activation='relu'),       # Dense layer
    Dense(1, activation='sigmoid')      # Output layer for binary classification
])

# 4. Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 5. Train Model
history = model.fit(X_train, y_train, epochs=5, batch_size=16, validation_data=(X_test, y_test))

# 6. Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# 7. Predict on New Data
new_texts = ["I absolutely love this!", "It was a terrible experience."]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded = pad_sequences(new_sequences, maxlen=50)
predictions = model.predict(new_padded)
print("Predictions:", ["positive" if p > 0.5 else "negative" for p in predictions])




Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.5000 - loss: 0.6950 - val_accuracy: 0.0000e+00 - val_loss: 0.7042
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step - accuracy: 0.7500 - loss: 0.6863 - val_accuracy: 0.0000e+00 - val_loss: 0.7127
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.7500 - loss: 0.6787 - val_accuracy: 0.0000e+00 - val_loss: 0.7251
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.7500 - loss: 0.6728 - val_accuracy: 0.0000e+00 - val_loss: 0.7388
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 0.7500 - loss: 0.6585 - val_accuracy: 0.0000e+00 - val_loss: 0.7572
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 0.7572
Test Accuracy: 0.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

2. How to generate sequences of text using a Recurrent Neural Network (RNN)

In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM
from tensorflow.keras.utils import to_categorical


In [3]:
# Example text data
text = "hello world, this is a simple text generation example using RNN."

# Create a mapping of characters to integers
chars = sorted(list(set(text)))
char_to_index = {c: i for i, c in enumerate(chars)}
index_to_char = {i: c for i, c in enumerate(chars)}

# Convert text into sequences of integers
sequence_length = 10
sequences = []
next_chars = []

for i in range(len(text) - sequence_length):
    sequences.append(text[i:i + sequence_length])
    next_chars.append(text[i + sequence_length])

# Encode sequences and next characters
X = np.zeros((len(sequences), sequence_length, len(chars)), dtype=np.bool_)
y = np.zeros((len(sequences), len(chars)), dtype=np.bool_)

for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        X[i, t, char_to_index[char]] = 1
    y[i, char_to_index[next_chars[i]]] = 1


3. How to perform sentiment analysis using a simple CNN model

In [4]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [5]:
# Example dataset
texts = [
    "I love this product",
    "This is the worst experience",
    "Absolutely fantastic!",
    "Not worth the money",
    "I am very happy with this purchase"
]
labels = [1, 0, 1, 0, 1]  # 1: Positive, 0: Negative

# Tokenize the texts
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to ensure uniform length
max_sequence_length = 10
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to numpy array
y = np.array(labels)


In [6]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [9]:
model.fit(X, y, epochs=5, batch_size=2)


Epoch 1/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.6279
Epoch 2/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 1.0000 - loss: 0.6098 
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.5956 
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.5685 
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.5494 


<keras.src.callbacks.history.History at 0x7c742acf7940>

In [10]:
# New samples
test_texts = [
    "I hate this product",
    "Best experience ever!"
]
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Predict sentiment
predictions = model.predict(X_test)
print(["Positive" if pred > 0.5 else "Negative" for pred in predictions])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
['Positive', 'Positive']


4.  How to perform Named Entity Recognition (NER) using spaCy

In [13]:
!pip install spacy




In [15]:
import spacy

# Load pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Input text
text = "Barack Obama was the 44th President of the United States and he lives in Washington."

# Process the text
doc = nlp(text)

# Print named entities
print("Named Entities, Phrases, and Concepts:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


Named Entities, Phrases, and Concepts:
Barack Obama (PERSON)
44th (ORDINAL)
the United States (GPE)
Washington (GPE)


5.  How to implement a simple Seq2Seq model for machine translation using LSTM in Keras

In [18]:
pip install tensorflow




In [25]:
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences


# Example data
input_texts = ["hello", "how are you", "good morning"]
target_texts = ["salut", "comment ça va", "bonjour"]

# Add start and end tokens for target texts
target_texts = ["\t" + text + "\n" for text in target_texts]

# Tokenize input and target texts
input_tokenizer = Tokenizer(char_level=True)
target_tokenizer = Tokenizer(char_level=True)

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

max_input_len = max(len(seq) for seq in input_sequences)
max_target_len = max(len(seq) for seq in target_sequences)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Pad sequences
encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_len, padding="post")
decoder_input_data = pad_sequences(target_sequences, maxlen=max_target_len, padding="post")
decoder_target_data = np.zeros_like(decoder_input_data)

# Assuming the maximum sequence length for both encoder and decoder is max_sequence_length
max_sequence_length = 15

# Create the target data for decoder
# Create the target data for decoder
for i, seq in enumerate(target_sequences):
    # Ensure the sequence is padded to max_sequence_length
    padded_seq = seq + [0] * (max_sequence_length - len(seq))  # Pad with zeros to max length

    # Shift the sequence by one and assign it to the target data
    # This creates a "shifted" target sequence for the decoder
    decoder_target_data[i, 1:] = padded_seq[:-1]  # Shifting for the decoder target



In [26]:
# Encoder
encoder_inputs = Input(shape=(None,), name="encoder_inputs")
encoder_embedding = Embedding(input_vocab_size, 128, name="encoder_embedding")(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True, name="encoder_lstm")
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,), name="decoder_inputs")
decoder_embedding = Embedding(target_vocab_size, 128, name="decoder_embedding")(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

decoder_dense = Dense(target_vocab_size, activation="softmax", name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Combine into a model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [27]:
# Add an additional dimension to decoder target data for sparse_categorical_crossentropy
decoder_target_data = np.expand_dims(decoder_target_data, -1)

# Train
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0333 - loss: 2.9540 - val_accuracy: 0.4000 - val_loss: 2.9038
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426ms/step - accuracy: 0.3000 - loss: 2.9026 - val_accuracy: 0.4000 - val_loss: 2.8589
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.3000 - loss: 2.8552 - val_accuracy: 0.4000 - val_loss: 2.8013
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - accuracy: 0.3000 - loss: 2.7957 - val_accuracy: 0.4000 - val_loss: 2.7190
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step - accuracy: 0.3000 - loss: 2.7098 - val_accuracy: 0.4000 - val_loss: 2.5935
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.3000 - loss: 2.5762 - val_accuracy: 0.4000 - val_loss: 2.4060
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c7381839f60>

In [28]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)


In [29]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate an empty target sequence
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index["\t"]  # Start token

    # Sampling loop
    decoded_sentence = ""
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the predicted token index
        predicted_token_index = np.argmax(output_tokens[0, -1, :])
        predicted_token = target_tokenizer.index_word.get(predicted_token_index, "")

        if predicted_token == "\n" or len(decoded_sentence) > max_target_len:
            break

        decoded_sentence += predicted_token

        # Update the target sequence and states
        target_seq[0, 0] = predicted_token_index
        states_value = [h, c]

    return decoded_sentence


# Test on a new sentence
test_sentence = "hello"
test_seq = input_tokenizer.texts_to_sequences([test_sentence])
test_seq = pad_sequences(test_seq, maxlen=max_input_len, padding="post")
print("Input:", test_sentence)
print("Translation:", decode_sequence(test_seq))


Input: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 731ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 703ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Translation: ssllut


6.  How to generate text using a pre-trained transformer model (GPT-2)

In [30]:
pip install transformers torch




In [31]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'  # You can also use 'gpt2-medium', 'gpt2-large', or 'gpt2-xl' for larger models
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Ensure the tokenizer is set to pad the sequences properly
tokenizer.pad_token = tokenizer.eos_token

# Prompt to generate text
prompt = "Once upon a time, in a faraway land,"

# Encode the input prompt text into tokens
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text using the model
output = model.generate(input_ids,
                        max_length=100,    # Length of the generated text
                        num_return_sequences=1,  # Number of sequences to generate
                        no_repeat_ngram_size=2,  # Avoid repeating n-grams
                        top_k=50,   # Limits the sampling pool to the top k tokens
                        top_p=0.95,  # Nucleus sampling (limits to cumulative probability of top tokens)
                        temperature=0.7,  # Controls randomness (higher = more random)
                        do_sample=True,   # Sampling strategy, False would return the most likely sequence
                        pad_token_id=tokenizer.eos_token_id)

# Decode the generated tokens back to text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Once upon a time, in a faraway land, a young boy called Lourdes is sent to help him find his brother. After he learns that his father is dead, Lory is a stepchild to him. He is then sent back to his old life, where he finds his parents' house. When he goes back there, he meets the old man, who is not his mother. They soon find out that Lorica is still alive, and she, along with her brothers


7.  How to apply data augmentation for text in NLP

In [32]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

def synonym_augmentation(sentence):
    words = sentence.split()
    new_sentence = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            # Choose the first synonym and replace the word
            new_word = synonyms[0].lemmas()[0].name()
            new_sentence.append(new_word)
        else:
            new_sentence.append(word)
    return ' '.join(new_sentence)

original_text = "The quick brown fox jumps over the lazy dog."
augmented_text = synonym_augmentation(original_text)
print(augmented_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...


The quick brown fox jump over the lazy dog.


In [33]:
import random

def random_insertion(sentence):
    words = sentence.split()
    insert_words = ["very", "extremely", "really", "quite"]
    random_word = random.choice(insert_words)
    random_pos = random.randint(0, len(words))
    words.insert(random_pos, random_word)
    return ' '.join(words)

original_text = "The quick brown fox jumps over the lazy dog."
augmented_text = random_insertion(original_text)
print(augmented_text)


The quick brown fox jumps over the lazy dog. extremely


In [34]:
def random_deletion(sentence, p=0.2):
    words = sentence.split()
    if len(words) == 1:  # Don't delete if there's only one word
        return sentence
    new_words = [word for word in words if random.uniform(0, 1) > p]
    # Ensure at least one word is left
    if len(new_words) == 0:
        return random.choice(words)
    return ' '.join(new_words)

original_text = "The quick brown fox jumps over the lazy dog."
augmented_text = random_deletion(original_text)
print(augmented_text)


The quick brown fox over lazy dog.
