In [1]:
import pandas as pd 
import numpy as np 
import re

## Load the model outputs produced so far (steps 4, 5 and 6 of the documentation)

In [2]:
# Read the three prediction files in one pass; the unpacking order matches the
# order of the paths below.
fong1, fong2, ewe = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/french_gungbe_trials.csv',
        '/content/french_fongbe_trial_seq2seq_salim.csv',
        '/content/french_ewe_seq2seq_trials_3.csv',
    )
)

## Two combined outputs created for blending
1. `outdf1` combines the French-to-Gungbe output (used for Fongbe) with the French-to-Ewe output (used for Ewe).
2. `outdf2` combines the French-to-Ewe output (used for Fongbe) with the French-to-Ewe output (used for Ewe).

In [3]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the rows the same way and, like append, keeps each input
# frame's original index labels.
outdf1 = pd.concat([fong1, ewe])
outdf2 = pd.concat([fong2, ewe])
outdf1.shape, outdf2.shape

((5893, 2), (5893, 2))

## Length adjustment
Some translations are clearly far longer than a reasonable limit, which will lower the ROUGE F1 score.

In [4]:
# Character count of every translation, then show the rows longest-first so
# the over-long outliers appear at the top.
outdf1['length'] = outdf1['Target'].str.len()
outdf1.sort_values('length', ascending=False)

Unnamed: 0,ID,Target,length
402,ID_GjyJhPix,"É sixu kpn , é na hn , b un na hn , b un na tu...",877
2692,ID_vpPUcjYl,Enyi mxo vi mitn l tn l ye sixu e ye kpo o ji...,673
2308,ID_pUDQJiAX,Azn e w é ka sixu m è b wema evo l b m evo l b...,519
2844,ID_yiOnPRKU,N e è ò hwjij é ò hwjij 5 : 5 é ò ac e é ó na ...,511
1243,ID_VrPtMgdy,Xn okpo nu mi hwenu e ee e é emi emi n m ...,507
...,...,...,...
862,ID_PIlOyOMc,ɛ̃,2
2762,ID_wXIaEMNT,ʋu,2
956,ID_QsuRTtCf,ìn,2
1978,ID_jZrozQPI,Ɣe,2


In [5]:
# Character count of every translation, then show the rows longest-first so
# the over-long outliers appear at the top.
outdf2['length'] = outdf2['Target'].str.len()
outdf2.sort_values('length', ascending=False)

Unnamed: 0,ID,Target,length
1840,ID_grHYjFYV,"Agban, nutikúnkpákpá ɖé dó yɛhwemɛ ɔ, wɛ nyí h...",895
2308,ID_pUDQJiAX,Nukɔnnukɔntɔn nukɔntɔn xá mɛ ɖevo lɛ ɖò wemaxi...,855
2844,ID_yiOnPRKU,"Tɛnkpɔn ɖokpo 5 : 5 ) Fí ɔ, fitɛn e é nɔ ɖò gb...",796
2727,ID_wXaZAYow,"Azɔ ɔ, nukunmɛ linlin lɛ nɔ gɔ́n nukúnnú ɖò mɛ...",779
2,ID_ACYgGXTq,Nù tɛ́ lɛ́ wɛ́ lɛ́ wɛ́ lɛ́ lɛ́ wɛ́ nɔ́ kàn nuk...,731
...,...,...,...
1044,ID_SbVfqwqq,Mai,3
2762,ID_wXIaEMNT,ʋu,2
1978,ID_jZrozQPI,Ɣe,2
862,ID_PIlOyOMc,ɛ̃,2


We adjust the length here. Any sentence whose character length is 130 or more is split into words and duplicate words are removed.
<br>
A similar word-level treatment is defined in `func`: for sentences longer than 130 characters, any word of 10 or more characters is discarded from the sentence.

In [6]:
def func(text, length):
    """Strip long words out of an over-long sentence.

    Parameters
    ----------
    text : str
        The sentence to process.
    length : number
        The sentence length used to decide whether filtering applies.

    Returns
    -------
    str
        ``text`` unchanged unless ``length`` is greater than 130; otherwise the
        whitespace-separated words of ``text`` that are shorter than 10
        characters, joined by single spaces.
    """
    # Written as "not >" (rather than "<=") so a NaN length still takes the
    # pass-through branch.
    if not length > 130:
        return text
    return ' '.join(word for word in text.split() if len(word) < 10)

In [7]:
def _dedupe_if_long(text, length, limit=130):
    """Collapse repeated words in translations that reach the length limit.

    Returns ``text`` unchanged when ``length < limit``. Otherwise returns the
    distinct whitespace-separated words of ``text`` joined by single spaces,
    in order of first appearance. ``dict.fromkeys`` is used instead of
    ``set`` because set iteration order for strings varies between
    interpreter runs, which made the output non-reproducible and scrambled
    the sentence.
    """
    if length < limit:
        return text
    return ' '.join(dict.fromkeys(text.split()))


# Apply the same treatment to both frames. Values are assigned positionally
# (plain list), so the repeated index labels left by stacking the input frames
# cannot cause misalignment.
for _frame in (outdf1, outdf2):
    _frame['Target'] = [
        _dedupe_if_long(text, length)
        for text, length in zip(_frame['Target'], _frame['length'])
    ]
    # Length after de-duplication, kept next to the original 'length'.
    _frame['new_length'] = _frame['Target'].str.len()


In [8]:
# Sanity check: rows whose translation is blank. Comparing against ' ' alone
# misses the empty string that ' '.join([]) yields, so match any target that
# is empty or whitespace-only.
outdf1[outdf1['Target'].str.strip().eq('')]

Unnamed: 0,ID,Target,length,new_length


In [9]:
# Sanity check: rows whose translation is blank. Comparing against ' ' alone
# misses the empty string that ' '.join([]) yields, so match any target that
# is empty or whitespace-only.
outdf2[outdf2['Target'].str.strip().eq('')]

Unnamed: 0,ID,Target,length,new_length


Both dataframes are merged on `ID`, and the blended result is the set of unique words coming from both translations:
1. French to Ewe for Fongbe
2. French to Gungbe for Fongbe

The French to Ewe translations are identical in both dataframes, so blending adds no new words to the Ewe translations (repeated words within a sentence are still collapsed).

In [10]:
# rename() returns a new frame, so outdf1 itself is left untouched; the text
# column gets its own name so it does not collide with outdf2's 'Target'.
newdf = outdf1.rename(columns={'Target': 'ewe_gungbe'})
# Inner join on ID: only IDs present in both frames survive. The remaining
# shared columns ('length', 'new_length') receive _x / _y suffixes.
fdf = pd.merge(newdf, outdf2, on=['ID'], how='inner')

In [11]:
# Preview the first two merged rows to confirm both text columns
# ('ewe_gungbe' and 'Target') are present side by side.
fdf.head(2)

Unnamed: 0,ID,ewe_gungbe,length_x,new_length_x,Target,length_y,new_length_y
0,ID_AAGuzGzi,Msi axo en l bi na wa Beni bo na o xwe e xwe ...,86,86,Jikpamátɔ ɖaxo enɛ ɔ lɛ byɔ́ wá Beɛn bó ná xwl...,77,77
1,ID_AAuiTPkQ,En na wli m gbt l bo na jla gbm gbt l tn bo n...,95,95,Nùɖéwá élɔ́ ná zɔ́n bɔ̀ gbɛtɔ́ lɛ́ jɛ̀ gbɛtɔ́ ...,121,121


In [12]:
def blendedoutput(x,y):
  """Blend two translations into one string of their distinct words.

  Parameters
  ----------
  x, y : str
      The two candidate translations; each is split on whitespace.

  Returns
  -------
  str
      Every distinct word from ``x`` followed by any words that appear only
      in ``y``, each in order of first appearance, joined by single spaces.

  The union used to go through ``set``, whose iteration order for strings
  changes between interpreter runs; ``dict.fromkeys`` yields the same set of
  words but in a stable, readable order.
  """
  return ' '.join(dict.fromkeys(x.split() + y.split()))

In [13]:
# Blend the two candidate translations of each row, pairing them positionally.
fdf['Blending'] = [
    blendedoutput(gungbe_text, target_text)
    for gungbe_text, target_text in zip(fdf['ewe_gungbe'], fdf['Target'])
]

In [14]:
# Keep only the identifier and the blended text for the submission frame.
ndf = fdf.loc[:, ['ID', 'Blending']]

In [15]:
# DataFrame.rename returns a new frame rather than modifying in place; the
# result must be assigned, otherwise the CSV header stays 'Blending' instead
# of the 'Target' column name used by the input prediction files.
ndf = ndf.rename(columns={'Blending': 'Target'})
ndf.to_csv('blending_trial.csv', index=False)

This is the final output and can be submitted.