## The code will run with following GPU configurations 

In [16]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue Jun  1 14:17:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    32W / 250W |  10001MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Updated libraries used in the code 

In [1]:
# !pip install --upgrade sentencepiece
# !pip install --upgrade datasets
# !pip install --upgrade transformers
# !pip install --upgrade rouge-score

In [2]:
import pandas as pd
import numpy as np
from datasets import *
from rouge_score import rouge_scorer

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_metric


### Loading datasets
Load the Test.csv and Train.csv after downloading from Zindi after initializing the kernel.
<br>
Load the "french_fongbe_train_xlm_roberta.csv" and "french_fongbe_train_labse.csv" which was attained after running the code from step 3 of the documentation. These are going to be the additional data used to augment the given competition train data

In [3]:
test_data  = pd.read_csv('/content/Test.csv')
test_df    = test_data[test_data['Target_Language']=='Fon']

####### Loading existing train data and the extracted data from JW300
train_data  = pd.read_csv('/content/Train.csv')
train_data  = train_data[train_data['Target_Language']=='Fon']
train_data  = train_data.rename({'Target':'Fongbe'},axis=1)

extra_train = pd.read_csv('/content/french_fongbe_train_xlm_roberta.csv')

train_df    = train_data.append(extra_train).reset_index(drop=True)

#### train_df is the final train data to be used for training 

#### One of the datasets extracted form jw300 that will act as validation dataset
valid_df    = pd.read_csv('/content/french_fongbe_train_labse.csv')


test_df['Fongbe'] = ''

for cols in ['Fongbe','French']:
  train_df[cols] = train_df[cols].astype(str)
  valid_df[cols] = valid_df[cols].astype(str)
  test_df[cols]  = test_df[cols].astype(str)
print(train_df.shape,test_df.shape,valid_df.shape)

(55114, 4) (2929, 4) (2264, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
train_df.head(3)

Unnamed: 0,ID,French,Target_Language,Fongbe
0,ID_AADNDxdl,Mon père,Fon,Tɔ ce
1,ID_AAFQhmDr,Mettez-vous en rang.,Fon,Mi tò miɖéé
2,ID_AAJfVHEH,Son doigt lui fait mal.,Fon,Alɔvi tɔn ɖo vivɛ wɛ


In [5]:
test_df.head(3)

Unnamed: 0,ID,French,Target_Language,Fongbe
1,ID_AAGuzGzi,Tous ces grands artistes viendront au Benin po...,Fon,
2,ID_AAuiTPkQ,Ce programme va travailler à améliorer les con...,Fon,
3,ID_ACYgGXTq,Quels sont les questions récurrentes de ceux ...,Fon,


## Convert the dataframes to Model readable format.

In [6]:
train_df['translation'] = train_df.apply(lambda x : {'fr':x['French'],'guw':x['Fongbe']}, axis=1)

test_df['translation']  = test_df.apply(lambda  x : {'fr':x['French'],'guw':x['Fongbe']}, axis=1)

valid_df['translation'] = valid_df.apply(lambda x : {'fr':x['French'],'guw':x['Fongbe']}, axis=1)

##### ids from test data to be used for indexing 
id2use                     = test_df['ID']


train                      = pd.DataFrame(train_df['translation'])

test                       = pd.DataFrame(valid_df['translation'])

test_dataframe             = pd.DataFrame(test_df['translation'],columns=['translation'])

#### raw dataset format
raw_datasets               = DatasetDict()
raw_datasets['train']      = Dataset.from_dict(train)
raw_datasets['validation'] = Dataset.from_dict(test)
raw_datasets['test']       = Dataset.from_dict(test_dataframe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 55114
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2264
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2929
    })
})

## Loading the pretrained seq2seq model for the architecture.
<br>
We load the French to Gungbe pretrained model. This language was similar to Fongbe and for this language only a pretrained library was available.

In [8]:
metric           = load_metric("rouge")
model_checkpoint = "Helsinki-NLP/opus-mt-fr-guw"
tokenizer        = AutoTokenizer.from_pretrained(model_checkpoint)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2170.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1132.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=842533.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=708835.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1273441.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




In [9]:
max_input_length = 256
max_target_length = 256
source_lang = "fr"
target_lang = "guw"
prefix      = ""
def preprocess_function(examples):
    inputs        = [prefix + ex[source_lang] for ex in examples["translation"]]

    targets       = [ex[target_lang] for ex in examples["translation"]]

    model_inputs  = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = metric.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1"])
    rouge1 = rouge_output["rouge1"].mid
    

    return {
        "rouge1_precision": round(rouge1.precision, 4),
        "rouge1_recall": round(rouge1.recall, 4),
        "rouge1_fmeasure": round(rouge1.fmeasure, 4),
    }


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True,num_proc=4)

    

HBox(children=(FloatProgress(value=0.0, description=' #0', max=14.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description=' #1', max=14.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description=' #2', max=14.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=14.0, style=ProgressStyle(description_width='in…





    

HBox(children=(FloatProgress(value=0.0, description=' #0', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #1', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #2', max=1.0, style=ProgressStyle(description_width='ini…





    

HBox(children=(FloatProgress(value=0.0, description=' #1', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #0', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=1.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #2', max=1.0, style=ProgressStyle(description_width='ini…







## Model training 

In [10]:
model              = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=288513633.0, style=ProgressStyle(descri…




In [11]:
data_collator     = DataCollatorForSeq2Seq(tokenizer, model=model)

In [12]:
args = Seq2SeqTrainingArguments("test-translation", 
                                  evaluation_strategy           = "epoch", 
                                  learning_rate                 = 4.3401933791213136e-05,
                                  seed                          = 13,
                                  per_device_train_batch_size   = 32, 
                                  per_device_eval_batch_size    = 32, 
                                  weight_decay                  = 0.0043268807715410255,
                                  save_total_limit              = 3, 
                                  num_train_epochs              = 10,
                                  metric_for_best_model         = 'rouge1_fmeasure', 
                                  dataloader_num_workers        = 16, 
                                  predict_with_generate         = True, 
                                  fp16                          = False, 
                                  greater_is_better             = True,
                                  group_by_length               = True,
                                  # load_best_model_at_end        = True
                                  
                                  )

trainer = Seq2SeqTrainer(
                            model,
                            args,
                            train_dataset=tokenized_datasets["train"],
                            eval_dataset=tokenized_datasets["validation"],
                            data_collator=data_collator,
                            tokenizer=tokenizer,
                            compute_metrics=compute_metrics
                        )
trainer.train()

  cpuset_checked))


Epoch,Training Loss,Validation Loss,Rouge1 Precision,Rouge1 Recall,Rouge1 Fmeasure
1,2.4602,2.75745,0.2789,0.2132,0.2299
2,1.9383,2.519613,0.3036,0.2352,0.2513
3,1.5871,2.385218,0.3603,0.2574,0.2885
4,1.3884,2.307533,0.3578,0.2804,0.3035
5,1.2175,2.254773,0.3825,0.2783,0.3107
6,1.1032,2.224969,0.3817,0.2939,0.3204
7,1.0156,2.208242,0.3908,0.2957,0.3256
8,0.9475,2.195382,0.3931,0.3129,0.3368
9,0.891,2.186695,0.3991,0.3116,0.3387
10,0.8609,2.187003,0.4046,0.3123,0.3414


  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))


TrainOutput(global_step=17230, training_loss=1.4114181536266572, metrics={'train_runtime': 9194.1129, 'train_samples_per_second': 1.874, 'total_flos': 140627900160000.0, 'epoch': 10.0, 'init_mem_cpu_alloc_delta': 1739902976, 'init_mem_gpu_alloc_delta': 288460288, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -33517568, 'train_mem_gpu_alloc_delta': 863959552, 'train_mem_cpu_peaked_delta': 174067712, 'train_mem_gpu_peaked_delta': 9995610112})

## Attaining Outputs

In [13]:
trainer.compute_metrics = None
test_dataloader         = trainer.get_test_dataloader(tokenized_datasets['test'])
b                       = trainer.prediction_loop(test_dataloader,description='Prediction')
labels                  = b.label_ids
preds                   = b.predictions
decoded_preds           = tokenizer.batch_decode(preds, skip_special_tokens=True)



In [14]:
sub          = pd.DataFrame(id2use,columns=['ID'])
sub['Target'] = decoded_preds
sub.head()

Unnamed: 0,ID,Target
1,ID_AAGuzGzi,Msi axo en l bi na wa Beni bo na o xwe e xwe ...
2,ID_AAuiTPkQ,En na wli m gbt l bo na jla gbm gbt l tn bo n...
3,ID_ACYgGXTq,"Nuxó t l w n è m e ò linlin m é l, m en l o n..."
5,ID_AFBqjFUm,"Azn tnweg, jn jn un j sin ji, b xwe o xwe sin..."
6,ID_AFFhTuyI,"Sín hwennu , m e n do hn , bo n do hln , bo n ..."


In [15]:
sub.to_csv("french_gungbe_trials.csv",index=False)

# Save the file attained above separately as explained in the documentation. It will be used for blending.