In [1]:
!pip install rapidfuzz -qq

In [2]:
import numpy as np
import pandas as pd
from rapidfuzz import fuzz

In [3]:
# Read the matchings table (empty cells stay '' instead of NaN) and give the
# transliterated-title column the 'page_title_' prefix the later cells look up.
matchings = (
    pd.read_csv('../input/traindataset-part0-4-count-5/matchings_part0_between4,5.csv', keep_default_na=False)
    .rename(columns={'title_translit': 'page_title_translit'})
)

# Keep the row index as an ordinary column; the final CSV is written without its index.
matchings['target_id'] = matchings.index

## Translation Combiner

### Caption Translation

In [4]:
%%time

# Start with an empty translation column; every shard then fills in its non-empty rows.
matchings['caption_translation'] = ''

# NOTE(review): rows are matched purely by index label, so this assumes each shard's
# default row index lines up with the index of `matchings` - TODO confirm.
for shard_no in range(24):
    shard = pd.read_csv('../input/trans-cap-traindataset0-5-part{0}-24/tran_cap_part0_between4,5.{0}-of-24.csv'.format(shard_no), keep_default_na=False)
    stripped = shard['caption_translation'].str.strip()
    translated = stripped[stripped != '']
    matchings.loc[translated.index, 'caption_translation'] = translated

CPU times: user 1min 29s, sys: 6.78 s, total: 1min 36s
Wall time: 2min 5s


In [5]:
def CollectFinalColumn(field, source_column, frame=None):
    """Add the ``final_<field>`` and ``PREfinal_<field>`` columns to a frame in place.

    ``final_<field>`` is filled, in priority order, from:
      1. ``<field>_translation`` when it is non-empty;
      2. ``source_column`` when ``<field>_lang`` is ``'en'``;
      3. ``<field>_translit`` when ``<field>_lang`` is not ``'en'`` and the
         transliteration is non-empty.
    Rows matching none of these keep the empty string.

    ``PREfinal_<field>`` is ``<field>_translation``, falling back to
    ``source_column`` (whatever the language) when the translation is empty.

    Args:
        field: Column-name stem, e.g. ``'caption'``; the ``_lang``, ``_translit``
            and ``_translation`` columns for that stem must already exist.
        source_column: Name of the column holding the original text.
        frame: DataFrame to modify. Defaults to the notebook-level ``matchings``
            frame, so existing two-argument calls behave exactly as before.

    Returns:
        None. The frame is modified in place (returning it would make a notebook
        cell that ends with this call render the whole frame).
    """
    if frame is None:
        frame = matchings

    final_field = 'final_' + field
    prefinal_field = 'PREfinal_' + field
    translation = frame[field + '_translation']
    translit = frame[field + '_translit']
    is_english = frame[field + '_lang'] == 'en'

    # 1) Translation wins wherever one exists.
    frame[final_field] = translation

    # 2) English rows without a translation use the original text.
    use_source = is_english & (frame[final_field] == '')
    frame.loc[use_source, final_field] = frame.loc[use_source, source_column]

    # 3) Non-English rows without a translation use the transliteration, if any.
    use_translit = ~is_english & (frame[final_field] == '') & (translit != '')
    frame.loc[use_translit, final_field] = translit[use_translit]

    # PREfinal: translation, else the original text regardless of language.
    frame[prefinal_field] = translation
    use_raw = (frame[prefinal_field] == '') & (frame[source_column] != '')
    frame.loc[use_raw, prefinal_field] = frame.loc[use_raw, source_column]

In [6]:
# Build final_caption / PREfinal_caption, with caption_reference_description as the original text.
CollectFinalColumn(field='caption', source_column='caption_reference_description')

In [7]:
# Show the rows for which final_caption is still the empty string.
caption_unresolved = matchings['final_caption'] == ''
matchings.loc[caption_unresolved]

Unnamed: 0,image_url,page_title,caption_reference_description,count,spaced_filename,spaced_filename_translit,page_title_translit,caption_translit,page_title_lang,page_title_lang_p,...,caption_en,caption_contains_digit,undigit_caption,page_title_contains_digit,undigit_page_title,target,target_id,caption_translation,final_caption,PREfinal_caption
2658,https://upload.wikimedia.org/wikipedia/commons...,כורדיסטן הסורית,2015,4,Rojava june 2015,,,,he,0.999836,...,False,True,,False,כורדיסטן הסורית,כורדיסטן הסורית [SEP] 2015,2658,,,2015
16261,https://upload.wikimedia.org/wikipedia/commons...,Фрыда Пінта,2010,4,Freida Pinto Goa 2010 cropped,,Fryda Pіnta,,uk,0.696166,...,False,True,,False,Фрыда Пінта,Фрыда Пінта [SEP] 2010,16261,,,2010
18814,https://upload.wikimedia.org/wikipedia/commons...,Marc Madiot,2015,4,Barlin Quatre jours de Dunkerque étape 3 8 mai...,Barlin Quatre jours de Dunkerque etape 3 8 mai...,,,en,0.159361,...,False,True,,False,Marc Madiot,Marc Madiot [SEP] 2015,18814,,,2015
33585,https://upload.wikimedia.org/wikipedia/commons...,ილაი მენინგი,2012.,4,2012 Packers vs Giants Eli Manning 3,,ilai meningi,,ka,0.984755,...,False,True,.,False,ილაი მენინგი,ილაი მენინგი [SEP] 2012.,33585,,,2012.
34861,https://upload.wikimedia.org/wikipedia/commons...,نانا یوسلیانی,۱۹۸۰,4,Ioseliani Nana 1980 Malta,,,,fa,0.898179,...,False,True,,False,نانا یوسلیانی,نانا یوسلیانی [SEP] ۱۹۸۰,34861,,,۱۹۸۰
54668,https://upload.wikimedia.org/wikipedia/commons...,فردیناند لاسال,(‎۱۸۲۵–۱۸۶۴),5,Ferdinand Lassalle,,,,fa,0.777987,...,False,True,(‎–),False,فردیناند لاسال,فردیناند لاسال [SEP] (‎۱۸۲۵–۱۸۶۴),54668,,,(‎۱۸۲۵–۱۸۶۴)
61699,https://upload.wikimedia.org/wikipedia/commons...,क्रोएशियन भाषा,१२००,4,Vinodol,,,,hi,0.855799,...,False,True,,False,क्रोएशियन भाषा,क्रोएशियन भाषा [SEP] १२००,61699,,,१२००
67209,https://upload.wikimedia.org/wikipedia/commons...,قائمة الزيوت النباتية,....,4,Ceratonia siliqua green pods,,,,ar,0.997045,...,False,False,....,False,قائمة الزيوت النباتية,قائمة الزيوت النباتية [SEP] ....,67209,,,....
89237,https://upload.wikimedia.org/wikipedia/commons...,فیودور داستایفسکی,۱۸۷۶,5,Fyodor Mikhailovich Dostoyevsky 1876,,,,fa,0.933651,...,False,True,,False,فیودور داستایفسکی,فیودور داستایفسکی [SEP] ۱۸۷۶,89237,,,۱۸۷۶
114968,https://upload.wikimedia.org/wikipedia/commons...,安德烈·根纳季耶维奇·基里连科,2008,4,Dmitry Medvedev 29 July 2008 3,,,,zh,0.964157,...,False,True,,False,安德烈·根纳季耶维奇·基里连科,安德烈·根纳季耶维奇·基里连科 [SEP] 2008,114968,,,2008


### Page Title Translation

In [8]:
# Start with an empty translation column; every shard then fills in its non-empty rows.
matchings['page_title_translation'] = ''

# NOTE(review): rows are matched purely by index label, so this assumes each shard's
# default row index lines up with the index of `matchings` - TODO confirm.
for shard_no in range(21):
    shard = pd.read_csv('../input/trans-title-traindataset0-5-part{0}-21/tran_title_part0_between4,5.{0}-of-21.csv'.format(shard_no), keep_default_na=False)
    stripped = shard['page_title_translation'].str.strip()
    translated = stripped[stripped != '']
    matchings.loc[translated.index, 'page_title_translation'] = translated

In [9]:
# Build final_page_title / PREfinal_page_title; here the original text is the page_title column itself.
CollectFinalColumn(field='page_title', source_column='page_title')

In [10]:
# Show the rows for which final_page_title is still the empty string.
title_unresolved = matchings['final_page_title'] == ''
matchings.loc[title_unresolved]

Unnamed: 0,image_url,page_title,caption_reference_description,count,spaced_filename,spaced_filename_translit,page_title_translit,caption_translit,page_title_lang,page_title_lang_p,...,page_title_contains_digit,undigit_page_title,target,target_id,caption_translation,final_caption,PREfinal_caption,page_title_translation,final_page_title,PREfinal_page_title
3899,https://upload.wikimedia.org/wikipedia/commons...,2010,Mary McKillop,4,Mary mackillop,,,,fr,0.237198,...,True,,2010 [SEP] Mary McKillop,3899,,Mary McKillop,Mary McKillop,,,2010
4607,https://upload.wikimedia.org/wikipedia/commons...,۲۰۱۵,سلیمان دمیرل,5,Suleyman Demirel 1998,,,,fa,0.999154,...,True,,۲۰۱۵ [SEP] سلیمان دمیرل,4607,Suleiman Demirel,Suleiman Demirel,Suleiman Demirel,,,۲۰۱۵
7101,https://upload.wikimedia.org/wikipedia/commons...,2014,Joe Cocker († 22. prosinec),4,Joe Cocker 03,,,,fr,0.423773,...,True,,2014 [SEP] Joe Cocker († 22. prosinec),7101,Joe Cocker († December),Joe Cocker († December),Joe Cocker († December),,,2014
7274,https://upload.wikimedia.org/wikipedia/commons...,2007,Ozzy Osbourne gir ut sitt tiende studioalbum B...,5,Ozzy Osbourne 2,,,,fr,0.321396,...,True,,2007 [SEP] Ozzy Osbourne gir ut sitt tiende st...,7274,Ozzy Osbourne releases his tenth studio album ...,Ozzy Osbourne releases his tenth studio album ...,Ozzy Osbourne releases his tenth studio album ...,,,2007
7885,https://upload.wikimedia.org/wikipedia/commons...,2014,Shirley Temple († 10. február),4,Temple Black 1990,,,Shirley Temple († 10. februar),fr,0.423773,...,True,,2014 [SEP] Shirley Temple († 10. február),7885,Shirley Temple († . február),Shirley Temple († . február),Shirley Temple († . február),,,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296125,http://upload.wikimedia.org/wikipedia/commons/...,0346,Mappa dei prefissi telefonici nella provincia ...,5,Prefissi provincia Bergamo,,,,de,0.130944,...,True,,0346 [SEP] Mappa dei prefissi telefonici nella...,296125,Map of telephone codes in the province of Bergamo,Map of telephone codes in the province of Bergamo,Map of telephone codes in the province of Bergamo,,,0346
296621,https://upload.wikimedia.org/wikipedia/commons...,1555,Joana de Castela,4,Meister der Magdalenenlegende 002,,,,ru,0.117649,...,True,,1555 [SEP] Joana de Castela,296621,Joana de Castilla,Joana de Castilla,Joana de Castilla,,,1555
298286,https://upload.wikimedia.org/wikipedia/commons...,2006,Den amerikanske verdensmesteren i tungvektsbok...,4,Floyd Patterson 1962 b,,,Den amerikanske verdensmesteren i tungvektsbok...,fr,0.241057,...,True,,2006 [SEP] Den amerikanske verdensmesteren i t...,298286,The American world champion in heavyweight box...,The American world champion in heavyweight box...,The American world champion in heavyweight box...,,,2006
299145,https://upload.wikimedia.org/wikipedia/commons...,2005,Ecclesia Dominae Nostrae Dresdensis iterum aed...,5,100130 150006 Dresden Frauenkirche winter blue...,,,,fr,0.332307,...,True,,2005 [SEP] Ecclesia Dominae Nostrae Dresdensis...,299145,,Ecclesia Dominae Nostrae Dresdensis iterum aed...,Ecclesia Dominae Nostrae Dresdensis iterum aed...,,,2005


In [12]:
# Display the column labels of `matchings` at this point in the notebook.
matchings.columns

Index(['image_url', 'page_title', 'caption_reference_description', 'count',
       'spaced_filename', 'spaced_filename_translit', 'page_title_translit',
       'caption_translit', 'page_title_lang', 'page_title_lang_p',
       'page_title_en', 'caption_lang', 'caption_lang_p', 'caption_en',
       'caption_contains_digit', 'undigit_caption',
       'page_title_contains_digit', 'undigit_page_title', 'target',
       'target_id', 'caption_translation', 'final_caption', 'PREfinal_caption',
       'page_title_translation', 'final_page_title', 'PREfinal_page_title'],
      dtype='object')

In [13]:
# Text columns in which every run of two or more whitespace characters becomes one space.
fields = [
    'caption_reference_description', 'page_title', 'page_title_translit', 'caption_translit',
    'final_caption', 'PREfinal_caption', 'PREfinal_page_title', 'final_page_title',
    'caption_translation', 'page_title_translation',
]

# Apply the same regex replacement to each listed column and write the results back.
matchings[fields] = matchings[fields].apply(
    lambda column: column.str.replace(r'\s{2,}', ' ', regex=True)
)

In [14]:
# Write the combined table to disk; the row index is not written (it is already kept in target_id).
matchings.to_csv(path_or_buf='final_matchings.csv', index=False)