In [1]:
! pip install transformers
! pip install sentencepiece



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
from collections import Counter
import pandas as pd

In [4]:
with open("/content/drive/My Drive/rsicd/dataset_rsicd.json") as f:
    ds = json.load(f)

In [5]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [6]:
# Storing tuples of image id and sentences in a list
all_sentences = []
for image in ds['images']:
    for sentence in image['sentences']:
        all_sentences.append((image['imgid'],sentence['raw']))

In [7]:
# Creating a counter to get counts of how many times each sentence was repeated in the same image
sentence_counts = dict(Counter(all_sentences))

In [8]:
# Converting the dictionary to a tuple -  (image_id, sentence, count)

image_id_sentences_and_counts = []
for sentence, count in sentence_counts.items():
    image_id_sentences_and_counts.append((sentence[0], sentence[1], count))

In [9]:
sentences_df = pd.DataFrame(image_id_sentences_and_counts, columns=['image_id', 'sentence', 'count'])

In [10]:
translation_languages_priority = ['fr','es','it', 'pt']

In [11]:
# Creating a dictionary with with languages to use for backtranslation
# eg. If the count = 2, one of them has to be backtranslated, and `fr` will be used for that.
languages_to_back_translate_with = {2 : translation_languages_priority[0],
                                    3 : translation_languages_priority[0:2],
                                    4 : translation_languages_priority[0:3],
                                    5 : translation_languages_priority}

In [12]:
# we dont need backtranslation if count = 1
sentences_df.drop(sentences_df[sentences_df['count'] == 1].index, inplace = True)

In [13]:
# Creating a column that has the languages to be used for running the backtranslation
sentences_df['languages_to_back_translate_with'] = sentences_df['count'].map(languages_to_back_translate_with)

In [14]:
# Creating a new row for each language
sentences_df = sentences_df.explode('languages_to_back_translate_with')

In [15]:
sentences_df = sentences_df.sort_values(by=['languages_to_back_translate_with'], ignore_index=True)

In [16]:
sentences_df.head()

Unnamed: 0,image_id,sentence,count,languages_to_back_translate_with
0,7525,a piece of open air pool in a bareland near .,3,es
1,2687,many buildings and green trees are in two side...,5,es
2,7228,some trees were planted around the railway sta...,3,es
3,2688,many buildings and green trees are in two side...,5,es
4,7227,there are two long black trains stopping in th...,3,es


## Translation Model.

In [17]:
from transformers import MarianMTModel, MarianTokenizer

In [18]:
target_model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name)

In [19]:
en_model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name)


In [20]:
def translate(texts, model, tokenizer, language="fr"):
    # Prepare the text data into appropriate format for the model
    template = lambda text: f"{text}" if language == "en" else f">>{language}<< {text}"
    src_texts = [template(text) for text in texts]

    # Tokenize the texts
    encoded = tokenizer.prepare_seq2seq_batch(src_texts,return_tensors="pt")

    model.to(device)
    
    # Generate translation using model
    translated = model.generate(**encoded.to(device))

    # Convert the generated tokens indices back into text
    translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
    
    return translated_texts

In [21]:
def back_translate(texts, source_lang="en", target_lang="fr"):
    # Translate from source to target language
    fr_texts = translate(texts, target_model, target_tokenizer, 
                         language=target_lang)

    # Translate from target language back to source language
    back_translated_texts = translate(fr_texts, en_model, en_tokenizer, 
                                      language=source_lang)
    
    return back_translated_texts

In [22]:
# Group the dataframe by language to translate with, so that they can be batched together.
grouped_by_language = sentences_df.groupby(['languages_to_back_translate_with'])

In [None]:
list_of_dataframes_for_each_lang = []
for lang in translation_languages_priority:
  df = grouped_by_language.get_group(lang)
  sentences = list(df['sentence'])

  #Batch the sentences in groups of 64
  list_of_sentence_batches = [sentences[i:i + 64] for i in range(0, len(sentences), 64)]
  
  # list to store all translated sentences.
  translated_sentences = []

  for sentence_batch in list_of_sentence_batches:    
    batch_of_translated_sentences = back_translate(sentence_batch, target_lang = lang)
    translated_sentences += batch_of_translated_sentences
  df["back_translated_sentence"] = translated_sentences
  list_of_dataframes_for_each_lang.append(df)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [24]:
sentences_with_translation_df = pd.concat(list_of_dataframes_for_each_lang, axis=0)

In [25]:
sentences_with_translation_df.head()

Unnamed: 0,image_id,sentence,count,languages_to_back_translate_with,back_translated_sentence
740,928,the sea water is so transparent that it looks ...,2,fr,seawater is so transparent that it looks like ...
741,584,a lot of cars parked on the side of the land .,3,fr,many cars parked on the side of the earth.
742,938,a piece of green ocean is near a yellow beach .,5,fr,a piece of green ocean is near a yellow beach.
743,570,some plants are near a piece of khaki bareland .,2,fr,some plants are near a piece of kaki nueland.
744,599,the bare land has a small patch of water .,3,fr,The bare earth has a small piece of water.


In [62]:
# Creating a copy of the original json, for augmentation.
text_augmented_ds = ds

In [74]:
def find_sentence_in_img(text, sentences):
  for sent_index, sentence in enumerate(sentences):
    print(f"{text} --AND-- {sentence['raw']}")
    print(text == sentence['raw'])
    if text == sentence['raw']:
      print(sent_index)
      return sent_index


In [75]:
for i,row in enumerate(sentences_with_translation_df.itertuples()):
  for image_index, image in enumerate(text_augmented_ds['images']):
    if image['imgid'] ==  row.image_id:
      sent_index = find_sentence_in_img(row.sentence, image['sentences'])
      print(f"image index = {image_index}")
      print(f"sent index = {sent_index}")
      text_augmented_ds['images'][image_index]['sentences'][sent_index]['raw'] = row.back_translated_sentence


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
False
a white beach is near an ocean with many white waves . --AND-- a white beach is near an ocean with many white waves .
True
3
image index = 970
sent index = 3
it is a piece of gray bareland . --AND-- It's a gray naked piece of land.
False
it is a piece of gray bareland . --AND-- It's a piece of gray naked land.
False
it is a piece of gray bareland . --AND-- It's a gray naked piece of land.
False
it is a piece of gray bareland . --AND-- it is a piece of gray bareland .
True
3
image index = 498
sent index = 3
an airport is near some buildings and green trees . --AND-- an airport is located close to some buildings and green trees.
False
an airport is near some buildings and green trees . --AND-- an airport is close to some green buildings and trees.
False
an airport is near some buildings and green trees . --AND-- an airport is close to some green buildings and trees.
False
an airport is near some buildings and green tr

In [83]:
text_augmented_ds['images'][0]

{'filename': 'airport_1.jpg',
 'imgid': 0,
 'sentences': [{'imgid': 0,
   'raw': 'Many aircraft are parked next to a long building in an airport.',
   'sentid': 0,
   'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport']},
  {'imgid': 0,
   'raw': 'Many planes are parked next to a long building at an airport.',
   'sentid': 1,
   'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport']},
  {'imgid': 0,
   'raw': 'Many planes are parked next to a long building in an airport.',
   'sentid': 2,
   'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport']},
  {'imgid': 0,
   'raw': 'many planes are parked next to a long building at an airport.',
   'sentid': 3,
   'tokens': ['many',
    'planes',
    'are',


In [60]:
sentences_with_translation_df.head()

Unnamed: 0,image_id,sentence,count,languages_to_back_translate_with,back_translated_sentence
740,928,the sea water is so transparent that it looks ...,2,fr,seawater is so transparent that it looks like ...
741,584,a lot of cars parked on the side of the land .,3,fr,many cars parked on the side of the earth.
742,938,a piece of green ocean is near a yellow beach .,5,fr,a piece of green ocean is near a yellow beach.
743,570,some plants are near a piece of khaki bareland .,2,fr,some plants are near a piece of kaki nueland.
744,599,the bare land has a small patch of water .,3,fr,The bare earth has a small piece of water.


In [84]:
sentences_with_translation_df[sentences_with_translation_df.sentence ==  'many planes are parked next to a long building in an airport .']

Unnamed: 0,image_id,sentence,count,languages_to_back_translate_with,back_translated_sentence
1396,0,many planes are parked next to a long building...,5,fr,Many aircraft are parked next to a long buildi...
677,0,many planes are parked next to a long building...,5,es,Many planes are parked next to a long building...
2013,0,many planes are parked next to a long building...,5,it,Many planes are parked next to a long building...
2586,0,many planes are parked next to a long building...,5,pt,many planes are parked next to a long building...
