In [None]:
!pip install datasets transformers[sentencepiece] sacrebleu -q flask_cors flask pyngrok

In [None]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

In [None]:
# Hub identifier of the pretrained checkpoint; the tokenizer cells load from the same name.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# Load the TensorFlow seq2seq model for that checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Load the "cfilt/iitb-english-hindi" dataset; later cells use its "train" and "validation" splits.
raw_datasets = load_dataset("cfilt/iitb-english-hindi")

In [None]:
# Bare expression: shows the loaded dataset object as this cell's output.
raw_datasets

In [None]:
# Down-sample the training split to a fixed-size random subset (adjust the size as needed):
# shuffle deterministically with seed=42, then keep the first 10,000 rows.
# `select` accepts any iterable of indices, so `range` is passed directly instead of
# first building a 10,000-element Python list.
raw_datasets["train"] = raw_datasets["train"].shuffle(seed=42).select(range(10000))

In [None]:
# Load the tokenizer published with the same checkpoint as the model
# (the progress bars below show its source/target SentencePiece files being fetched).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



In [None]:
# Token-length caps used (together with truncation) when tokenizing source and target text
max_input_length = max_target_length = 300

In [None]:
# Language codes used as keys into each example's "translation" dict: (source, target)
source_lang, target_lang = "en", "hi"

In [None]:
# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        examples: A batch of dataset rows (this function is mapped with
            ``batched=True``) whose ``"translation"`` entry is a list of dicts
            indexed by language code.

    Returns:
        The tokenizer output for the source sentences, with the token ids of
        the target sentences stored under ``"labels"``.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-language side of the tokenizer; it is the
    # supported replacement for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Tokenize every split in batches; preprocess_function adds the model inputs and "labels"
# to each example (the three progress bars below correspond to the three splits).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [None]:
# Training hyperparameters.
# batch_size / num_train_epochs control the data loop (epochs is an upper bound; the
# early-stopping callback may end training sooner); learning_rate / weight_decay
# configure the optimizer.
batch_size, num_train_epochs = 16, 10
learning_rate, weight_decay = 2e-5, 0.01

In [None]:
# Create the seq2seq data collator used as collate_fn for the training/validation datasets;
# return_tensors="tf" makes it emit TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Build the batched training and validation datasets from the tokenized splits.
# Both share the batch size and collator; only the split and the shuffling flag differ
# (training is shuffled, validation keeps its order).
_shared_dataset_args = {"batch_size": batch_size, "collate_fn": data_collator}

train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"], shuffle=True, **_shared_dataset_args
)
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"], shuffle=False, **_shared_dataset_args
)


In [None]:
# Set up the optimizer (Adam with decoupled weight decay) from the hyperparameters above
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No loss is passed to compile(); NOTE(review): presumably the model falls back to its
# own built-in loss computed from the "labels" column — confirm against the library docs.
model.compile(optimizer=optimizer)


In [None]:
# Early stopping: watch the validation loss, give up after 3 epochs without
# improvement, and restore the weights from the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True, patience=3, monitor="val_loss"
)

In [None]:
# Fine-tune for at most num_train_epochs epochs, evaluating on the validation set after
# each epoch so the early-stopping callback can monitor val_loss.
model.fit(
    train_dataset,
    epochs=num_train_epochs,
    callbacks=[early_stopping],
    validation_data=validation_dataset,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 63/625 [==>...........................] - ETA: 1:22:56 - loss: 2.2848

In [None]:
# Save the fine-tuned model together with its tokenizer, so that "tf_model/" is a
# self-contained artifact that can be reloaded or shared without going back to the
# original Hub checkpoint for the tokenizer files.
model.save_pretrained("tf_model/")
tokenizer.save_pretrained("tf_model/")

In [None]:
# Load the artifacts used for serving: the tokenizer comes from the original Hub
# checkpoint, while the model weights come from the local "tf_model/" directory
# written by the save cell.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

In [None]:
# Test the model on a sample input sentence (optional sanity check; this cell no longer needs to be run)
# input_text = "we are going tomorow"
# tokenized = tokenizer([input_text], return_tensors='tf')
# out = model.generate(**tokenized, max_length=128, num_beams=5)
# print(out)

# with tokenizer.as_target_tokenizer():
#     print(tokenizer.decode(out[0], skip_special_tokens=True))

In [None]:
from flask import Flask, request, jsonify
from pyngrok import ngrok

from getpass import getpass

# Local port the Flask server listens on and that the ngrok tunnel forwards to.
port_no = 3000
app = Flask(__name__)

# Never hard-code the ngrok auth token in the notebook. Read it from the NGROK_AUTHTOKEN
# environment variable, falling back to an interactive (non-echoing) prompt.
# SECURITY: the token that used to be embedded on this line has been exposed and should
# be revoked and regenerated in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: "))

# 'CORS_HEADERS' is a Flask-CORS configuration key. The permissive CORS headers this app
# actually sends are added by the after_request hook.
app.config['CORS_HEADERS'] = 'Content-Type'

# CORS handler: runs after every request
@app.after_request
def after_request(response):
    """Append permissive CORS headers to the outgoing response and return it."""
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Headers', 'Content-Type,Authorization'),
        ('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE'),
    )
    for header_name, header_value in cors_headers:
        response.headers.add(header_name, header_value)
    return response

# Open an ngrok tunnel to the local port and keep its public URL, which is printed
# just before the server starts.
public_url = ngrok.connect(port_no).public_url

@app.route('/', methods=['POST'])
def post_request():
    """Translate the text sent in the ``topic`` field of a JSON POST body.

    Returns:
        A JSON response with ``success`` and ``message``; on success it also
        carries the translated text under ``topic``. Failures are reported in
        the payload (``success: False``) with the default HTTP status, exactly
        as before, so clients that only inspect the payload keep working.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body instead of raising.
        data = request.get_json(silent=True)
        topic_value = data.get('topic') if isinstance(data, dict) else None

        # No placeholder default: a missing, empty or non-string topic must reach this
        # error branch rather than have a placeholder sentence translated.
        if not isinstance(topic_value, str) or not topic_value.strip():
            return jsonify({
                'success': False,
                'message': 'Topic not provided in the request.',
            })

        # `tokenizer` and `model` are the notebook-level objects loaded in earlier cells.
        tokenized = tokenizer([topic_value], return_tensors='tf')
        out = model.generate(**tokenized, max_length=128, num_beams=5)

        with tokenizer.as_target_tokenizer():
            output_text = tokenizer.decode(out[0], skip_special_tokens=True)

        return jsonify({
            'success': True,
            'message': 'post request works properly',
            'topic': output_text,
        })
    except Exception as e:
        # Deliberately broad so the endpoint always answers with JSON, but record the
        # traceback server-side instead of silently swallowing it.
        app.logger.exception("Translation request failed")
        return jsonify({
            'success': False,
            'message': str(e),
        })

# Print the public ngrok URL, then start the Flask development server on the tunnelled port.
# The captured output below ends with "Press CTRL+C to quit", i.e. the cell keeps running
# while the server is up.
print(f"To access the Global link, please click {public_url}")
app.run(port=port_no)


To access the Global link, please click https://b3cd-35-204-175-10.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:3000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
