<a href="https://colab.research.google.com/github/akankshakusf/Project-DeepLearning-English-to-French-Translation/blob/master/Neural_Machine_Translation_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import ML packages
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.metrics import confusion_matrix,roc_curve
import pathlib
import io
import re
import string
import time

#import DL package
import cv2
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer,Dense, Flatten, InputLayer, BatchNormalization, Bidirectional, Dropout, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, Conv1D, LSTM, GRU
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, TopKCategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from tensorboard.plugins import projector

# Data Preparation

## Data Download

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2025-04-13 11:33:02--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-04-13 11:33:05 (4.19 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


## Kaggle Dataset

In [6]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

Dataset URL: https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
License(s): ODbL-1.0
en-fr-translation-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
!unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

Archive:  /content/en-fr-translation-dataset.zip
  inflating: /content/dataset/en-fr.csv  


In [8]:
# Read the Kaggle CSV as a dataset of two required string columns per record.
# NOTE(review): `header` is left at its default (False), so if en-fr.csv starts
# with a header row that row is read as data — confirm against the file.
dataset = tf.data.experimental.CsvDataset(
  filenames="/content/dataset/en-fr.csv",
  record_defaults=[tf.string, tf.string],
)

## Data Processing

In [4]:
# One dataset element per line of fra.txt (each element is a scalar string tensor).
text_dataset = tf.data.TextLineDataset(filenames="/content/dataset/fra.txt")

In [5]:
# Peek at the first three raw lines to see how each record is laid out.
for raw_line in text_dataset.take(3):
  print(raw_line)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [6]:
# Exploration (left disabled): skip the first 190000 records and print the number
# of space-separated pieces in each remaining line to see what max length we find.
# for i in text_dataset.skip(190000):
#   print(len(tf.strings.split(i," ")))

- Since the maximum sentence length I saw is 107, I am going to go ahead with a sequence length of 64, as we also have French letters.

In [7]:
# Hyperparameters shared by the text-vectorization layers below.
VOCAB_SIZE = 20_000     # passed as max_tokens to each TextVectorization layer
SEQUENCE_LENGTH = 64    # passed as output_sequence_length to each TextVectorization layer
EMBEDDING_DIM = 300     # not used in the cells shown here; presumably the embedding width for the model — TODO confirm

- Create a `TextVectorization` layer for each language to turn raw text into integer vectors.
- Reference: https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

In [8]:
# English text vectorizer: lowercases and strips punctuation, keeps a vocabulary
# of at most VOCAB_SIZE tokens, and emits integer ids with SEQUENCE_LENGTH
# positions per input.
english_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode='int',
)

In [9]:
# French text vectorizer, configured the same way as the English one: lowercases
# and strips punctuation, keeps a vocabulary of at most VOCAB_SIZE tokens, and
# emits integer ids with SEQUENCE_LENGTH positions per input.
french_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode='int',
)

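- Look at this sample line from the dataset: the English text, the French text and the attribution are separated by tabs (`\t`), so we split on the tabs to get rid of them.
`'Go.\tVa !\tCC-BY 2.0 (France) ...'`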

In [10]:
def selector(input_text):
  """Split one tab-separated line and return its first two fields.

  Args:
    input_text: a string tensor holding one line of the dataset.

  Returns:
    A tuple (english, french): the first and second tab-separated fields.
    Each is taken with a length-1 slice, so it stays a 1-D string tensor
    rather than a scalar. Any further fields are dropped.
  """
  fields = tf.strings.split(input_text, sep='\t')
  english = fields[:1]
  french = fields[1:2]
  return english, french

In [11]:
# Apply selector to every line so each element becomes an (english, french) pair.
split_dataset = text_dataset.map(map_func=selector)

In [12]:
# Print the first two (english, french) pairs to check the split.
for sentence_pair in split_dataset.take(2):
  print(sentence_pair)

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va !'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche.'], dtype=object)>)


- Notice how nicely the English and French text have now been separated.

In [None]:
# Build the English vocabulary: keep only the English element of each
# (english, french) pair, then adapt the English vectorizer on it.
english_training_dataset = split_dataset.map(lambda english, french: english)  # (english, french) -> english
english_vectorize_layer.adapt(english_training_dataset)  # adapt the vectorizer layer to the English text

In [None]:
# Build the French vocabulary: keep only the French element of each
# (english, french) pair, then adapt the French vectorizer on it.
# The lambda must return its second argument and adapt() must be called on
# french_vectorize_layer; selecting the first argument or adapting
# english_vectorize_layer here would leave the French layer without a vocabulary.
french_training_dataset = split_dataset.map(lambda english, french: french)  # (english, french) -> french
french_vectorize_layer.adapt(french_training_dataset)  # adapt the vectorizer layer to the French text

# Modeling

# Evaluation

# Testing