In [1]:
!pip install rapidfuzz -qq

In [2]:
import numpy as np
import pandas as pd
from rapidfuzz import fuzz

In [3]:
# Load the test matchings. keep_default_na=False stops pandas from turning
# empty text cells into NaN, so later `== ''` checks work on plain strings.
matchings = (
    pd.read_csv('../input/test-dataset/test-matchings.csv', keep_default_na=False)
    .rename(columns={'title_translit': 'page_title_translit'})
)

# Expose the row index as an explicit identifier column.
matchings['target_id'] = matchings.index

## Translation Combiner

### Caption Translation

In [4]:
%%time

# Start with an empty translation for every row, then fill it from the shards.
matchings['caption_translation'] = ''

for shard_no in range(1, 7):
    # Directory names are numbered 1..6 while the file names are numbered 0..5.
    shard_path = '../input/test-trans-caption-{0}-6/tran_title_0{1}.csv'.format(shard_no, shard_no - 1)
    shard = pd.read_csv(shard_path, keep_default_na=False)
    shard['caption_translation'] = shard['caption_translation'].str.strip()

    # Copy over only the rows this shard actually translated; rows are matched
    # by index label, so the shard index must line up with `matchings`.
    has_translation = shard['caption_translation'].ne('')
    translated_rows = shard.loc[has_translation].index
    matchings.loc[translated_rows, 'caption_translation'] = shard.loc[translated_rows, 'caption_translation']

CPU times: user 5.03 s, sys: 520 ms, total: 5.55 s
Wall time: 7.32 s


In [5]:
def CollectFinalColumn(field, source_column, frame=None):
    """Build the ``final_<field>`` and ``PREfinal_<field>`` columns in place.

    The frame must already contain ``<field>_lang``, ``<field>_translit``,
    ``<field>_translation`` and ``source_column``; empty strings mark
    missing values.

    ``final_<field>`` is filled with the first rule that applies:
      1. the translation, when it is non-empty;
      2. the source text, when the language is ``'en'``;
      3. the transliteration, when the language is not ``'en'`` and the
         transliteration is non-empty.
    A non-English row with neither translation nor transliteration keeps an
    empty ``final_<field>``.

    ``PREfinal_<field>`` is the translation, falling back to the source text
    whenever the translation is empty and the source text is not.

    Parameters
    ----------
    field : str
        Column-name prefix, e.g. ``'caption'`` or ``'page_title'``.
    source_column : str
        Name of the column holding the original (untranslated) text.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``matchings`` frame,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook-level frame.
        frame = matchings

    lang_col = field + '_lang'
    translit_col = field + '_translit'
    translation_col = field + '_translation'
    final_col = 'final_' + field
    prefinal_col = 'PREfinal_' + field

    # Rule 1: start from the translation.
    frame[final_col] = frame[translation_col]

    # Rule 2: English text without a translation is used as-is.
    english_gap = (frame[lang_col] == 'en') & (frame[final_col] == '')
    frame.loc[english_gap, final_col] = frame.loc[english_gap, source_column]

    # Rule 3: non-English text without a translation uses the transliteration.
    foreign_gap = (
        (frame[lang_col] != 'en')
        & (frame[final_col] == '')
        & (frame[translit_col] != '')
    )
    frame.loc[foreign_gap, final_col] = frame.loc[foreign_gap, translit_col]

    # PREfinal: translation, else the raw source text regardless of language.
    frame[prefinal_col] = frame[translation_col]
    untranslated = frame[(frame[prefinal_col] == '') & (frame[source_column] != '')].index
    frame.loc[untranslated, prefinal_col] = frame.loc[untranslated, source_column]

In [6]:
# Adds final_caption and PREfinal_caption, using caption_reference_description
# as the untranslated source text.
CollectFinalColumn('caption', 'caption_reference_description')

In [7]:
# Rows where none of the final_caption fallbacks produced any text.
matchings[(matchings['final_caption'] == '')]

Unnamed: 0,page_title,caption_reference_description,page_title_translit,caption_translit,page_title_lang,page_title_lang_p,page_title_en,caption_lang,caption_lang_p,caption_en,caption_contains_digit,undigit_caption,page_title_contains_digit,undigit_page_title,target,target_id,caption_translation,final_caption,PREfinal_caption
3021,Αέτωμα της Λέαινας,,Aetoma tis Leainas,,el,0.997957,False,en,0.124504,True,False,,False,Αέτωμα της Λέαινας,Αέτωμα της Λέαινας [SEP],3021,,,
9114,داوید لوییس,2015,,,fa,0.965861,False,fr,0.361018,False,True,,False,داوید لوییس,داوید لوییس [SEP] 2015,9114,,,2015
21059,North Carolina's 12th congressional district,2013–2017,,,en,0.817512,True,de,0.580778,False,True,–,True,North Carolina's th congressional district,North Carolina's 12th congressional district [...,21059,,,2013–2017
27712,এশিয়ান ইউনিভার্সিটি ফর উইমেন,২০১২,,,bn,0.664426,False,bn,0.982708,False,True,,False,এশিয়ান ইউনিভার্সিটি ফর উইমেন,এশিয়ান ইউনিভার্সিটি ফর উইমেন [SEP] ২০১২,27712,,,২০১২
29281,North Carolina's 7th congressional district,2013–2017,,,en,0.846864,True,de,0.580778,False,True,–,True,North Carolina's th congressional district,North Carolina's 7th congressional district [S...,29281,,,2013–2017
30692,Daerah Halland,,,,ms,0.923291,False,en,0.124504,True,False,,False,Daerah Halland,Daerah Halland [SEP],30692,,,
32487,Mahajana Law College,[1],,,en,0.547605,True,sh,0.395106,False,True,[],False,Mahajana Law College,Mahajana Law College [SEP] [1],32487,,,[1]
33731,Sofiane Harkat,2006,,,fr,0.357885,False,fr,0.241057,False,True,,False,Sofiane Harkat,Sofiane Harkat [SEP] 2006,33731,,,2006
40098,Schifffahrtsmuseum Stavanger,2010,,,de,0.760745,False,fr,0.237198,False,True,,False,Schifffahrtsmuseum Stavanger,Schifffahrtsmuseum Stavanger [SEP] 2010,40098,,,2010
45105,Mercedes Menafra,2005,,,en,0.152145,True,fr,0.332307,False,True,,False,Mercedes Menafra,Mercedes Menafra [SEP] 2005,45105,,,2005


### Page Title Translation

In [8]:
# Start with an empty translation for every row, then fill it from the shards.
matchings['page_title_translation'] = ''

for shard_no in range(1, 7):
    # Directory names are numbered 1..6 while the file names are numbered 0..5.
    shard_path = '../input/test-trans-page-title-{0}-6/tran_title_0{1}.csv'.format(shard_no, shard_no - 1)
    shard = pd.read_csv(shard_path, keep_default_na=False)
    shard['page_title_translation'] = shard['page_title_translation'].str.strip()

    # Copy over only the rows this shard actually translated; rows are matched
    # by index label, so the shard index must line up with `matchings`.
    has_translation = shard['page_title_translation'].ne('')
    translated_rows = shard.loc[has_translation].index
    matchings.loc[translated_rows, 'page_title_translation'] = shard.loc[translated_rows, 'page_title_translation']

In [9]:
# Adds final_page_title and PREfinal_page_title; here the field prefix and the
# untranslated source column are both 'page_title'.
CollectFinalColumn('page_title', 'page_title')

In [10]:
# Rows where none of the final_page_title fallbacks produced any text.
matchings[matchings['final_page_title'] == '']

Unnamed: 0,page_title,caption_reference_description,page_title_translit,caption_translit,page_title_lang,page_title_lang_p,page_title_en,caption_lang,caption_lang_p,caption_en,...,page_title_contains_digit,undigit_page_title,target,target_id,caption_translation,final_caption,PREfinal_caption,page_title_translation,final_page_title,PREfinal_page_title
25624,Alain Menu,Alain Menu pilotando su Renault Laguna de 1997...,,,eo,0.240302,False,es,0.737402,False,...,False,Alain Menu,Alain Menu [SEP] Alain Menu pilotando su Renau...,25624,Alain Menu driving his Renault Laguna de at th...,Alain Menu driving his Renault Laguna de at th...,Alain Menu driving his Renault Laguna de at th...,,,Alain Menu


In [12]:
# Show every column now present, including the translation/final/PREfinal ones added above.
matchings.columns

Index(['page_title', 'caption_reference_description', 'page_title_translit',
       'caption_translit', 'page_title_lang', 'page_title_lang_p',
       'page_title_en', 'caption_lang', 'caption_lang_p', 'caption_en',
       'caption_contains_digit', 'undigit_caption',
       'page_title_contains_digit', 'undigit_page_title', 'target',
       'target_id', 'caption_translation', 'final_caption', 'PREfinal_caption',
       'page_title_translation', 'final_page_title', 'PREfinal_page_title'],
      dtype='object')

In [13]:
# Text columns whose whitespace runs are collapsed.
fields = [
    'caption_reference_description', 'page_title', 'page_title_translit', 'caption_translit',
    'final_caption', 'PREfinal_caption', 'PREfinal_page_title', 'final_page_title',
    'caption_translation', 'page_title_translation',
]

# Replace every run of two or more whitespace characters with one space.
# A single whitespace character (including a lone tab or newline) is left as-is.
matchings[fields] = matchings[fields].apply(
    lambda column: column.str.replace(r'\s{2,}', ' ', regex=True)
)

In [14]:
# Write the combined frame to the working directory; index=False omits the row
# index from the CSV (target_id already stores it as a column).
matchings.to_csv('final_matchings.csv', index=False)

In [15]:
# Display the combined frame for a final visual check.
matchings

Unnamed: 0,page_title,caption_reference_description,page_title_translit,caption_translit,page_title_lang,page_title_lang_p,page_title_en,caption_lang,caption_lang_p,caption_en,...,page_title_contains_digit,undigit_page_title,target,target_id,caption_translation,final_caption,PREfinal_caption,page_title_translation,final_page_title,PREfinal_page_title
0,Albert Pike,Albert Pike,,,en,0.124504,True,en,0.124504,True,...,False,Albert Pike,Albert Pike [SEP] Albert Pike,0,,Albert Pike,Albert Pike,,Albert Pike,Albert Pike
1,Anna Blount,"Blount and her young daughter Ruth, in 1911",,,en,0.739186,True,en,0.971129,True,...,False,Anna Blount,Anna Blount [SEP] Blount and her young daughte...,1,,"Blount and her young daughter Ruth, in 1911","Blount and her young daughter Ruth, in 1911",,Anna Blount,Anna Blount
2,Río Marañón,"Die Río Marañón in die Huánuco-streek, Peru",Rio Maranon,"Die Rio Maranon in die Huanuco-streek, Peru",es,0.684134,False,de,0.376005,False,...,False,Río Marañón,Río Marañón [SEP] Die Río Marañón in die Huánu...,2,"Die Río Marañón in die Huánuco-streek, Peru","Die Río Marañón in die Huánuco-streek, Peru","Die Río Marañón in die Huánuco-streek, Peru",Marañón River,Marañón River,Marañón River
3,Leonel Brizola,Brizola during his inauguration ceremony as go...,,,en,0.361589,True,en,0.997266,True,...,False,Leonel Brizola,Leonel Brizola [SEP] Brizola during his inaugu...,3,,Brizola during his inauguration ceremony as go...,Brizola during his inauguration ceremony as go...,,Leonel Brizola,Leonel Brizola
4,Buttisholz,Buttisholz,,,en,0.346513,True,en,0.346513,True,...,False,Buttisholz,Buttisholz [SEP] Buttisholz,4,,Buttisholz,Buttisholz,,Buttisholz,Buttisholz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92361,Essen (Haren),Essen,,,en,0.290392,True,en,0.124504,True,...,False,Essen (Haren),Essen (Haren) [SEP] Essen,92361,,Essen,Essen,,Essen (Haren),Essen (Haren)
92362,Mindszenty József,Boldog IV. Károly és felesége Zita királyné,Mindszenty Jozsef,Boldog IV. Karoly es felesege Zita kiralyne,de,0.358293,False,hu,0.987040,False,...,False,Mindszenty József,Mindszenty József [SEP] Boldog IV. Károly és f...,92362,Happy IV. Charles and his wife Queen Zita,Happy IV. Charles and his wife Queen Zita,Happy IV. Charles and his wife Queen Zita,József Mindszenty,József Mindszenty,József Mindszenty
92363,Пещеры Крыма,Вход в пещеру Суук-Коба (Холодная),Peschery Kryma,Vhod v pescheru Suuk-Koba (Holodnaja),ru,0.931901,False,ru,0.973208,False,...,False,Пещеры Крыма,Пещеры Крыма [SEP] Вход в пещеру Суук-Коба (Хо...,92363,Entrance to the Suuk-Koba cave (Cold),Entrance to the Suuk-Koba cave (Cold),Entrance to the Suuk-Koba cave (Cold),Crimea caves,Crimea caves,Crimea caves
92364,MOWAG Piranha,Usuaris del Piranha 5 en blau.,,,pt,0.832627,False,es,0.643328,False,...,False,MOWAG Piranha,MOWAG Piranha [SEP] Usuaris del Piranha 5 en b...,92364,Piranha users in blue.,Piranha users in blue.,Piranha users in blue.,MOWAG Piranha,MOWAG Piranha,MOWAG Piranha
