## The code was run with the following GPU configuration

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue Jun  1 14:17:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Updated libraries used in the code 

In [1]:
# !pip install --upgrade sentencepiece
# !pip install --upgrade datasets
# !pip install --upgrade transformers
# !pip install --upgrade rouge-score

In [2]:
import os

import numpy as np
import pandas as pd
from datasets import *
from datasets import load_metric
from rouge_score import rouge_scorer
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


### Loading datasets
Load `Test.csv` and `Train.csv` (downloaded from Zindi) once the kernel has been initialized.
<br>
Also load "french_ewe_train_xlm_roberta.csv" and "french_ewe_valid_xlm_roberta.csv", which were obtained by running the code from step 3 of the documentation. The first is the additional data used to augment the given competition train data; the second is used as the validation set.

In [3]:
# --- Competition test set: keep only the rows whose target language is Ewe ---
test_data  = pd.read_csv('Test.csv')
# .copy() gives test_df its own data, so the column assignments further down
# no longer raise pandas' SettingWithCopyWarning (they targeted a filtered slice).
test_df    = test_data[test_data['Target_Language']=='Ewe'].copy()

# --- Competition train set plus the extra pairs extracted from JW300 ---------
train_data  = pd.read_csv('Train.csv')
train_data  = train_data[train_data['Target_Language']=='Ewe']
train_data  = train_data.rename({'Target':'Ewe'},axis=1)

extra_train = pd.read_csv('/content/french_ewe_train_xlm_roberta.csv')

# train_df is the final train data to be used for training.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x; the row order and the reset index are unchanged.
train_df    = pd.concat([train_data, extra_train]).reset_index(drop=True)

# One of the datasets extracted from JW300; it acts as the validation dataset.
valid_df    = pd.read_csv('/content/french_ewe_valid_xlm_roberta.csv')

# The test rows get an empty 'Ewe' column so that all three frames expose both
# the 'French' and the 'Ewe' columns used below.
test_df['Ewe'] = ''

# Force both text columns to str in every frame.
for cols in ['Ewe','French']:
  train_df[cols] = train_df[cols].astype(str)
  valid_df[cols] = valid_df[cols].astype(str)
  test_df[cols]  = test_df[cols].astype(str)
print(train_df.shape,test_df.shape,valid_df.shape)

(25090, 4) (2964, 4) (2767, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
# Preview the first three rows of the combined training frame.
train_df.head(3)

Unnamed: 0,ID,French,Target_Language,Ewe
0,ID_AAHVDMdq,"Sénégal, Côte d'Ivoire, Guinée, Ghana, on déco...",Ewe,"Sénégal, Côte d'Ivoire, Guinée, Ghana, siwo ƒe..."
1,ID_AARXSjjg,Janot se prit à grelotter dès que le soleil se...,Ewe,Yano dze ƒoƒo esi me ɣe gbe ɖo to eye ya dze ƒ...
2,ID_AAmSrrNh,"Et cela en une journée, sinon rien à manger.",Ewe,"Nawɔe le ŋkekea me. Nemenyυo oa, atdi adɔ"


In [5]:
# Preview the first three rows of the Ewe test frame (its 'Ewe' column is empty).
test_df.head(3)

Unnamed: 0,ID,French,Target_Language,Ewe
0,ID_AAAAhgRX,Très fière d’elle,Ewe,
4,ID_AChdWHyF,Grosse bagnolle,Ewe,
11,ID_AHBSoUNL,Les seins comme ça… » Basta,Ewe,


## Convert the dataframes to a model-readable format

In [6]:
def _to_translation(row):
    """Build the {'fr': ..., 'ee': ...} pair dict stored in the `translation` column for one row."""
    return {'fr': row['French'], 'ee': row['Ewe']}


train_df['translation'] = train_df.apply(_to_translation, axis=1)

valid_df['translation'] = valid_df.apply(_to_translation, axis=1)

# test_df may still be a filtered slice of the raw test frame; assign() builds
# a new frame instead of writing into that slice, which avoids pandas'
# SettingWithCopyWarning while producing the same columns.
test_df                    = test_df.assign(translation=test_df.apply(_to_translation, axis=1))

##### ids from test data to be used for indexing the submission
id2use                     = test_df['ID']


train                      = pd.DataFrame(train_df['translation'])

# NOTE: despite its name, `test` holds the validation pairs (built from valid_df);
# the real test pairs live in `test_dataframe`.
test                       = pd.DataFrame(valid_df['translation'])

test_dataframe             = pd.DataFrame(test_df['translation'],columns=['translation'])

#### raw dataset format: one split per frame, each with a single `translation` column
raw_datasets               = DatasetDict()
raw_datasets['train']      = Dataset.from_dict(train)
raw_datasets['validation'] = Dataset.from_dict(test)
raw_datasets['test']       = Dataset.from_dict(test_dataframe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Display the three splits (train / validation / test) and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 25090
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2767
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2964
    })
})

## Loading the pretrained seq2seq model for the architecture.
<br>
We load the French to Ewe pretrained model. 

In [8]:
# Name of the pretrained French-to-Ewe checkpoint; the model cell loads the
# same checkpoint.
model_checkpoint = "Helsinki-NLP/opus-mt-fr-ee"

# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# ROUGE metric object used by compute_metrics during evaluation.
metric = load_metric("rouge")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2170.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1132.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=844820.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=831831.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1535077.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




In [9]:
# Maximum number of tokens kept for source and target sentences (longer ones are truncated).
max_input_length = max_target_length = 96
# Keys of the source / target sentence inside each `translation` dict.
source_lang, target_lang = "fr", "ee"
# Text prepended to every source sentence (empty here).
prefix = ""
def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: batch whose "translation" entry is a sequence of dicts keyed
            by `source_lang` and `target_lang`.

    Returns:
        The tokenizer output for the (prefixed) source sentences, with the
        token ids of the target sentences stored under the "labels" key.
    """
    pairs = examples["translation"]
    source_texts = [prefix + pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)

    encoded["labels"] = target_encoding["input_ids"]
    return encoded


def compute_metrics(pred):
    """Compute ROUGE-1 precision / recall / F-measure for one evaluation pass.

    Args:
        pred: object exposing `predictions` (generated token ids, or a tuple
            whose first element is those ids) and `label_ids` (reference token
            ids in which ignored positions are marked with -100).

    Returns:
        dict with the keys "rouge1_precision", "rouge1_recall" and
        "rouge1_fmeasure", each rounded to 4 decimals.
    """
    pred_ids = pred.predictions
    # Some trainer versions hand back a tuple; the token ids are its first item.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a real token id, so it is swapped for the pad id before
    # decoding. np.where returns new arrays, which leaves the caller's
    # `pred.label_ids` / `pred.predictions` untouched (the previous in-place
    # assignment overwrote the labels stored on `pred`).
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    # all unnecessary (special) tokens are removed while decoding
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = metric.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1"])
    rouge1 = rouge_output["rouge1"].mid

    return {
        "rouge1_precision": round(rouge1.precision, 4),
        "rouge1_recall": round(rouge1.recall, 4),
        "rouge1_fmeasure": round(rouge1.fmeasure, 4),
    }


# Apply preprocess_function to every split, in batches, using 4 processes.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True,num_proc=4)

    

HBox(children=(FloatProgress(value=0.0, description=' #0', max=7.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #1', max=7.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=7.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #2', max=7.0, style=ProgressStyle(description_width='ini…





    

HBox(children=(FloatProgress(value=0.0, description=' #1', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #2', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #0', max=1.0, style=ProgressStyle(description_width='ini…





    

HBox(children=(FloatProgress(value=0.0, description=' #2', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #0', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #1', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=1.0, style=ProgressStyle(description_width='ini…







In [10]:
# Load the pretrained seq2seq weights for the checkpoint named in model_checkpoint.
model              = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=302272293.0, style=ProgressStyle(descri…




In [11]:
# Collator handed to the trainer to assemble batches; built from the tokenizer and the model.
data_collator     = DataCollatorForSeq2Seq(tokenizer, model=model)

In [12]:
# Cap the DataLoader worker count at the number of CPUs this process may use.
# The previous hard-coded value of 16 was requested regardless of the machine;
# the recorded run output ends in "cpuset_checked))", which looks like the tail
# of PyTorch's warning about asking for more workers than the system suggests.
if hasattr(os, "sched_getaffinity"):
    available_cpus = len(os.sched_getaffinity(0))
else:
    available_cpus = os.cpu_count() or 1
num_dataloader_workers = min(16, available_cpus)

# NOTE(review): metric_for_best_model / greater_is_better are set but
# load_best_model_at_end is not, so the weights used afterwards appear to be
# those of the last epoch rather than of the best-scoring one - confirm that
# this is intended before changing it (it would alter the submitted results).
args = Seq2SeqTrainingArguments("test-translation",
                                  evaluation_strategy           = "epoch",   # evaluate once per epoch
                                  learning_rate                 = 4.3401933791213136e-05,
                                  seed                          = 13,
                                  per_device_train_batch_size   = 30,
                                  per_device_eval_batch_size    = 32,
                                  save_total_limit              = 3,
                                  num_train_epochs              = 5,
                                  metric_for_best_model         = 'rouge1_fmeasure',  # key returned by compute_metrics
                                  dataloader_num_workers        = num_dataloader_workers,
                                  predict_with_generate         = True,
                                  fp16                          = False,
                                  greater_is_better             = True,
                                  group_by_length               = True,

                                  )

# Train on the tokenized train split and evaluate on the validation split
# with the ROUGE-1 scores produced by compute_metrics.
trainer = Seq2SeqTrainer(
                            model,
                            args,
                            train_dataset=tokenized_datasets["train"],
                            eval_dataset=tokenized_datasets["validation"],
                            data_collator=data_collator,
                            tokenizer=tokenizer,
                            compute_metrics=compute_metrics
                        )
trainer.train()

  cpuset_checked))


Epoch,Training Loss,Validation Loss,Rouge1 Precision,Rouge1 Recall,Rouge1 Fmeasure
1,4.1026,1.831587,0.5211,0.4868,0.4923
2,3.2578,1.781895,0.5196,0.4914,0.494
3,2.9428,1.774898,0.5187,0.4909,0.4937
4,2.7699,1.778202,0.5195,0.4918,0.4947
5,2.6596,1.779469,0.5191,0.4919,0.4943


  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))


TrainOutput(global_step=4185, training_loss=3.1180448944565784, metrics={'train_runtime': 2539.5164, 'train_samples_per_second': 1.648, 'total_flos': 140347109376000.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 1710899200, 'init_mem_gpu_alloc_delta': 302624256, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 29675520, 'train_mem_gpu_alloc_delta': 905149952, 'train_mem_cpu_peaked_delta': 778240, 'train_mem_gpu_peaked_delta': 5245728768})

## Obtaining the outputs

In [13]:
# Generate translations for the competition test split and decode them to text.
# Metrics are switched off for this pass - presumably because the test rows only
# carry empty placeholder targets (test_df['Ewe'] was set to '' when loading).
trainer.compute_metrics = None
test_dataloader         = trainer.get_test_dataloader(tokenized_datasets['test'])
# NOTE(review): prediction_loop is called directly on a hand-built dataloader
# rather than going through trainer.predict(); confirm it is still available in
# the installed transformers version.
b                       = trainer.prediction_loop(test_dataloader,description='Prediction')
labels                  = b.label_ids
preds                   = b.predictions
# Turn the predicted token ids into strings, dropping special tokens.
decoded_preds           = tokenizer.batch_decode(preds, skip_special_tokens=True)



In [14]:
# Submission frame: the test IDs (keeping their original index) next to the
# decoded predictions, which are matched to the IDs by position.
sub = pd.DataFrame(id2use, columns=['ID']).assign(Target=decoded_preds)
sub.head()

Unnamed: 0,ID,Target
0,ID_AAAAhgRX,ekpɔ dzidzɔ ɖe eŋu ale gbegbe
4,ID_AChdWHyF,yaʋalanuŋlɔla
11,ID_AHBSoUNL,nowo le alea. Basta
14,ID_AHycIkQv,Nocrese hã xlẽa nu
16,ID_AIWTdKBT,"kpɔ nukokui kple dzidzɔ, ke hã zi geɖe la, ewɔ..."


In [15]:
# Write the submission (ID, Target) to disk without the index column.
sub.to_csv("french_ewe_seq2seq_trials_3.csv",index=False)

In [16]:
# Sanity check: the submission should have as many rows as the Ewe test frame.
sub.shape,test_df.shape

((2964, 2), (2964, 5))

# Save the file obtained above separately, as explained in the documentation. It will be used for blending.