In [1]:
import tensorflow as tf
import numpy as np
import random
import os
import pathlib
import unicodedata
import matplotlib.pyplot as plt


# Summarization
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Download dataset
url = "https://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip"
zip_path = tf.keras.utils.get_file(
    "fra-eng.zip",
    origin=url,
    extract=True
)

# Dynamically locate fra.txt
base_dir = pathlib.Path(zip_path).parent
fra_txt = None

for root, dirs, files in os.walk(base_dir):
    if "fra.txt" in files:
        fra_txt = pathlib.Path(root) / "fra.txt"
        break

if fra_txt is None:
    raise FileNotFoundError("fra.txt not found!")

print("Dataset found at:", fra_txt)


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip
[1m3423204/3423204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Dataset found at: /root/.keras/datasets/fra-eng_extracted/fra.txt


In [3]:
def normalize(line):
    line = unicodedata.normalize("NFKC", line.strip().lower())
    eng, fre = line.split("\t")
    fre = "[start] " + fre + " [end]"
    return eng, fre


In [4]:
with open(fra_txt, encoding="utf-8") as f:
    text_pairs = [normalize(line) for line in f]

random.shuffle(text_pairs)
print("Total sentence pairs:", len(text_pairs))


Total sentence pairs: 167130


In [5]:
n_val = int(0.15 * len(text_pairs))
train_pairs = text_pairs[:-n_val]
val_pairs = text_pairs[-n_val:]


In [6]:
vocab_en = 10000
vocab_fr = 20000
seq_length = 25

eng_vect = tf.keras.layers.TextVectorization(
    max_tokens=vocab_en,
    split="whitespace",
    output_mode="int",
    output_sequence_length=seq_length
)

fre_vect = tf.keras.layers.TextVectorization(
    max_tokens=vocab_fr,
    split="whitespace",
    output_mode="int",
    output_sequence_length=seq_length + 1
)

eng_vect.adapt([p[0] for p in train_pairs])
fre_vect.adapt([p[1] for p in train_pairs])


In [7]:
def format_dataset(eng, fre):
    eng = eng_vect(eng)
    fre = fre_vect(fre)
    return (
        {"encode_inp": eng, "decode_inp": fre[:, :-1]},
        fre[:, 1:]
    )

def make_dataset(pairs, batch_size=64):
    eng, fre = zip(*pairs)
    ds = tf.data.Dataset.from_tensor_slices((list(eng), list(fre)))
    ds = ds.shuffle(2048).batch(batch_size)
    ds = ds.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return ds.prefetch(tf.data.AUTOTUNE)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)


In [8]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, seq_len, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(
            vocab_size, embed_dim, mask_zero=True
        )

        pos = np.arange(seq_len)[:, None]
        i = np.arange(embed_dim)[None, :]
        angle = pos / np.power(10000, (2 * (i // 2)) / embed_dim)
        angle[:, 0::2] = np.sin(angle[:, 0::2])
        angle[:, 1::2] = np.cos(angle[:, 1::2])

        self.pos_emb = tf.constant(angle, dtype=tf.float32)

    def call(self, x):
        return self.token_emb(x) + self.pos_emb


In [9]:
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, heads, ff_dim):
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(heads, embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.norm1(x + self.att(x, x))
        return self.norm2(x + self.ffn(x))


In [10]:
class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, heads, ff_dim):
        super().__init__()
        self.self_att = tf.keras.layers.MultiHeadAttention(heads, embed_dim)
        self.cross_att = tf.keras.layers.MultiHeadAttention(heads, embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()
        self.norm3 = tf.keras.layers.LayerNormalization()

    def call(self, x, enc):
        x = self.norm1(x + self.self_att(x, x, use_causal_mask=True))
        x = self.norm2(x + self.cross_att(x, enc))
        return self.norm3(x + self.ffn(x))


In [11]:
def build_transformer():
    enc_in = tf.keras.Input(shape=(seq_length,), name="encode_inp")
    dec_in = tf.keras.Input(shape=(seq_length,), name="decode_inp")

    enc = PositionalEmbedding(seq_length, vocab_en, 128)(enc_in)
    dec = PositionalEmbedding(seq_length, vocab_fr, 128)(dec_in)

    for _ in range(4):
        enc = EncoderBlock(128, 4, 512)(enc)
        dec = DecoderBlock(128, 4, 512)(dec, enc)

    out = tf.keras.layers.Dense(vocab_fr)(dec)
    return tf.keras.Model([enc_in, dec_in], out)

model = build_transformer()


In [12]:
def masked_loss(y_true, y_pred):
    mask = tf.cast(y_true != 0, tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True
    )
    return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=masked_loss,
    metrics=["accuracy"]
)

model.summary()


In [13]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5   # increase to 20 for final run
)


Epoch 1/5
[1m2220/2220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 68ms/step - accuracy: 0.0538 - loss: 6.5974 - val_accuracy: 0.1083 - val_loss: 4.0249
Epoch 2/5
[1m2220/2220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 61ms/step - accuracy: 0.1212 - loss: 3.7392 - val_accuracy: 0.1524 - val_loss: 2.9148
Epoch 3/5
[1m2220/2220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 61ms/step - accuracy: 0.1598 - loss: 2.7795 - val_accuracy: 0.1787 - val_loss: 2.3456
Epoch 4/5
[1m2220/2220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 60ms/step - accuracy: 0.1842 - loss: 2.2111 - val_accuracy: 0.1917 - val_loss: 1.9971
Epoch 5/5
[1m2220/2220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 61ms/step - accuracy: 0.1990 - loss: 1.8338 - val_accuracy: 0.2002 - val_loss: 1.7601


In [14]:
def translate(sentence):
    enc = eng_vect([sentence])
    result = ["[start]"]
    vocab = fre_vect.get_vocabulary()

    for _ in range(seq_length):
        dec = fre_vect([" ".join(result)])[:, :-1]
        preds = model([enc, dec])
        token = tf.argmax(preds[0, len(result)-1]).numpy()
        word = vocab[token]
        result.append(word)
        if word == "[end]":
            break

    return " ".join(result)


In [15]:
def summarize_text(text, num_sentences=2):
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf = vectorizer.fit_transform(sentences)
    scores = tfidf.sum(axis=1).A1

    ranked = sorted(
        ((score, sent) for score, sent in zip(scores, sentences)),
        reverse=True
    )

    selected = [s for _, s in ranked[:num_sentences]]
    return " ".join([s for s in sentences if s in selected])


In [19]:
import nltk

def ensure_nltk():
    for pkg in ["punkt", "punkt_tab"]:
        try:
            nltk.data.find(f"tokenizers/{pkg}")
        except LookupError:
            nltk.download(pkg)

ensure_nltk()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [20]:
def summarize_and_translate(text):
    summary = summarize_text(text)
    translation = translate(summary)
    return summary, translation


In [21]:

e, f = random.choice(text_pairs)
print("English :", e)
print("Predicted:", translate(e))
print("Actual FR:", f)


English : tom shook my hand.
Predicted: [start] tom ma montré ma main end end end end end end end end end end end end end end end end end end end end
Actual FR: [start] tom me serra la main. [end]


In [22]:
text = """
Transformers are deep learning models that have revolutionized natural language processing.
They are widely used in translation and summarization tasks.
"""

summary, french = summarize_and_translate(text)
print("\nSUMMARY:\n", summary)
print("\nFRENCH TRANSLATION:\n", french)



SUMMARY:
 
Transformers are deep learning models that have revolutionized natural language processing.
They are widely used in translation and summarization tasks.


FRENCH TRANSLATION:
 [start] les [UNK] [UNK] [UNK] que la langue [UNK] [UNK] des [UNK] [UNK] sont des [UNK] et ils sont des [UNK] end end end end de


In [23]:
# Interactive English → French Translation
while True:
    user_input = input("\nEnter an English sentence (or type 'exit'): ")

    if user_input.lower() == "exit":
        print("Exiting translator.")
        break

    translation = translate(user_input)

    print("\nFrench Translation:")
    print(translation)



Enter an English sentence (or type 'exit'): hello 

French Translation:
[start] cest juste end end end end end end end end end end end end end end end end end end end end end end end

Enter an English sentence (or type 'exit'): hloo

French Translation:
[start] [UNK] end end end end end end end end end end end end end end end end end end end end end end end end

Enter an English sentence (or type 'exit'): hi this is bhavya

French Translation:
[start] ferme cest [UNK] end end end end end end end end end end end end end end end end end end end end end end

Enter an English sentence (or type 'exit'): exit
Exiting translator.


In [24]:
# Interactive Summarize + Translate
while True:
    user_input = input("\nEnter an English paragraph (or 'exit'): ")

    if user_input.lower() == "exit":
        print("Exiting summarizer.")
        break

    summary, french = summarize_and_translate(user_input)

    print("\nSummary:")
    print(summary)

    print("\nFrench Translation:")
    print(french)



Enter an English paragraph (or 'exit'): hi my name is sai im studying bttech

Summary:
hi my name is sai im studying bttech

French Translation:
[start] malheureusement mon nom est [UNK] [UNK] les [UNK] end end end end end end end end end end end end end end end end end

Enter an English paragraph (or 'exit'): exit
Exiting summarizer.
