In [1]:
!pip install transliterate

Collecting transliterate
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 469 kB/s 
Installing collected packages: transliterate
Successfully installed transliterate-1.10.2


In [2]:
import numpy as np
import pandas as pd

import urllib.parse

from transliterate import translit
import fasttext

In [3]:
# Load the fastText model file once; extract_lang() queries it through ft_model.predict().
ft_model = fasttext.load_model('../input/nlp-models/lid.176.ftz')



In [4]:
# Shard index substituted into the training TSV file name and into the output CSV names.
PART = 0

# Inclusive bounds on how many rows may share one image_url for that image to be kept.
MIN = 4
MAX = 5

# Columns read from the TSV; rows missing any of them are filtered out.
columns = [
    'page_title',
    'image_url',
    'caption_reference_description',
]

In [5]:
%%time

# Read only the columns listed in `columns` from the tab-separated shard selected by PART.
train0 = pd.read_csv('../input/wikipedia-image-caption/train-0000{}-of-00005.tsv'.format(PART), sep='\t', usecols=columns)

CPU times: user 2min 12s, sys: 11.4 s, total: 2min 23s
Wall time: 4min 30s


In [6]:
%%time

# Keep the rows in which every column listed in `columns` is non-null.
# The copy detaches the result from train0.
ok_images = train0.loc[train0[columns].notnull().all(axis=1)].copy()

CPU times: user 3.32 s, sys: 365 ms, total: 3.68 s
Wall time: 3.67 s


In [7]:
# Display the filtered frame (the notebook renders a truncated preview).
ok_images

Unnamed: 0,image_url,page_title,caption_reference_description
2,https://upload.wikimedia.org/wikipedia/commons...,"Deer Park, Wisconsin",Downtown Deer Park
5,https://upload.wikimedia.org/wikipedia/commons...,Jürgen Ovens,"Jürgen Ovens's Justitia, 1663-1665, Museumsber..."
7,http://upload.wikimedia.org/wikipedia/commons/...,Václav Vladivoj Tomek,Václav Vladivoj Tomek
11,https://upload.wikimedia.org/wikipedia/commons...,MV Agusta,1956 MV Agusta 250 Raid
14,https://upload.wikimedia.org/wikipedia/commons...,Andréi Amalrik,"Andréi Amalrik, 1976"
...,...,...,...
7412260,https://upload.wikimedia.org/wikipedia/commons...,Chris Pratt,Pratt tại buổi ra mắt phim Vệ binh dải Ngân hà...
7412266,https://upload.wikimedia.org/wikipedia/commons...,Mount Hood,Mount Hood en Mirrormar
7412267,https://upload.wikimedia.org/wikipedia/commons...,Parc animalier des monts de Guéret,Panneau touristique indiquant le parc à l'entr...
7412271,http://upload.wikimedia.org/wikipedia/commons/...,Odin,A plate from a Swedish Vendel era helmet featu...


In [8]:
def repair_filename(url):
    """
    Repairs the filename from the specified image url.

    Takes the last path segment of ``url``, drops its final extension,
    turns underscores into spaces and then percent-decodes the result.
    A segment that contains no '.' is kept whole instead of collapsing
    to an empty string.

    :param url: image URL (string).
    :return: human-readable file name without its extension.
    """
    basename = url.split('/')[-1]
    stem, dot, _ = basename.rpartition('.')
    if not dot:
        # No extension at all: rpartition puts the whole name in the last slot,
        # so fall back to the untouched segment.
        stem = basename
    # Underscores are replaced before decoding, so an encoded '%5F' stays an underscore.
    return urllib.parse.unquote(stem.replace('_', ' '))


def equip_df(df):
    """
    Adds filename-derived feature columns to ``df`` in place.

    Expects an ``image_url`` column of strings. Columns written:

    - ``filename``: result of repair_filename() on the URL.
    - ``pured_filename``: filename with every character that is neither a
      word character nor whitespace replaced by a space, and runs of two or
      more whitespace characters collapsed to one space.
    - ``spaced_filename``: pured_filename with a space inserted at
      lower->upper case and letter<->digit boundaries (ASCII Latin and
      Russian Cyrillic letters, including 'ё'/'Ё').
    - ``undigit_filename``: spaced_filename with the digits 0-9 removed,
      whitespace runs collapsed and surrounding whitespace stripped.
    - ``undigit_filename_empty``: True where undigit_filename is empty.

    :param df: DataFrame to extend; it is modified in place.
    :return: None.
    """
    df['filename'] = df['image_url'].apply(repair_filename)

    df["pured_filename"] = (
        df['filename']
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\s{2,}', ' ', regex=True)
    )

    # 'ё' and 'Ё' lie outside the contiguous а-я / А-Я code point ranges,
    # so they have to be listed explicitly to be treated as Cyrillic letters.
    boundary_patterns = (
        r'([a-z])([A-Z])',
        r'([а-яё])([А-ЯЁ])',
        r'([a-z])(\d)',
        r'([а-яё])(\d)',
        r'(\d)([a-zA-Z])',
        r'(\d)([а-яёА-ЯЁ])',
    )
    spaced = df["pured_filename"]
    for pattern in boundary_patterns:
        spaced = spaced.str.replace(pattern, r'\1 \2', regex=True)
    df["spaced_filename"] = spaced

    undigit = df['spaced_filename'].str.replace(r'[0-9]', '', regex=True)
    undigit = undigit.str.replace(r'\s{2,}', ' ', regex=True).str.strip()
    df["undigit_filename"] = undigit

    # Stored as a plain array, as before; the isspace() term is kept as a safety net.
    df['undigit_filename_empty'] = np.logical_or(
        df['undigit_filename'].str.isspace().values,
        (df['undigit_filename'] == '').values,
    )
    
    
def extract_lang(s: str):
    """
    Predicts the language of ``s`` with the fastText model held in ``ft_model``.

    fastText's predict() processes one line at a time and rejects text that
    contains a newline, so line breaks are replaced with spaces here rather
    than relying on every caller to sanitise its input.

    :param s: text to classify.
    :return: tuple ``(label, probability)`` for the top prediction, where
        ``label`` is the part of the predicted label after the last '__'.
    """
    labels, probabilities = ft_model.predict(s.replace('\n', ' '))
    return labels[0].split('__')[-1], probabilities[0]


def transliterate(s: str):
    """
    Best-effort transliteration of ``s`` via ``translit(s, reversed=True)``.

    :param s: text to transliterate.
    :return: the transliterated text, or '' when translit raises an error.
    """
    try:
        return translit(s, reversed=True)
    except Exception:
        # Still best-effort, but KeyboardInterrupt/SystemExit are no longer swallowed,
        # so a long .map() over this function can actually be interrupted.
        return ''

In [9]:
%%time

# Number of rows per image URL among the complete rows.
image_count = ok_images.groupby('image_url').size()

# Keep only images that occur between MIN and MAX times (both inclusive).
filtered = image_count[(MIN <= image_count) & (image_count <= MAX)]
images2train = pd.DataFrame({ 'image_url': filtered.index, 'count': filtered.values })

# Filename-derived text features (adds columns in place).
equip_df(images2train)

# Language predicted from the spaced filename: (label, probability) per row.
r = images2train['spaced_filename'].map(extract_lang)
images2train['filename_lang'], images2train['filename_lang_p'] = zip(*r)
images2train['filename_en'] = images2train['filename_lang'] == 'en'

# Path component right after 'wikipedia/' in the URL (e.g. 'commons', 'ru', 'ja').
images2train['section'] = images2train['image_url'].str.extract(r'wikipedia/([a-zA-Z]{1,})/')
images2train['commons'] = images2train['section'] == 'commons'
# True when that path component equals the language predicted for the filename.
images2train['lang_ok'] = images2train['section'] == images2train['filename_lang']

# transliterate() returns '' when it fails, so a non-empty result marks a transliterated name.
images2train['spaced_filename_translit'] = images2train['spaced_filename'].map(transliterate)
images2train['spaced_filename_translited'] = images2train['spaced_filename_translit'] != ''

# `n` is passed by keyword: pandas >= 2.0 no longer accepts it positionally in str.rsplit.
images2train['ext'] = images2train['image_url'].str.rsplit('.', n=1).str[-1].str.lower()

images2train['filename_contains_digit'] = images2train['filename'].str.contains(r'\d', regex=True)

CPU times: user 28.3 s, sys: 254 ms, total: 28.5 s
Wall time: 28.5 s


In [10]:
# Display the per-image feature table (the notebook renders a truncated preview).
images2train

Unnamed: 0,image_url,count,filename,pured_filename,spaced_filename,undigit_filename,undigit_filename_empty,filename_lang,filename_lang_p,filename_en,section,commons,lang_ok,spaced_filename_translit,spaced_filename_translited,ext,filename_contains_digit
0,http://upload.wikimedia.org/wikipedia/commons/...,4,Apoxyomenos Pio-Clementino Inv1185 n2,Apoxyomenos Pio Clementino Inv1185 n2,Apoxyomenos Pio Clementino Inv 1185 n 2,Apoxyomenos Pio Clementino Inv n,False,en,0.228692,True,commons,True,False,,False,jpg,True
1,http://upload.wikimedia.org/wikipedia/commons/...,4,Babybox - venkovní strana,Babybox venkovní strana,Babybox venkovní strana,Babybox venkovní strana,False,cs,0.985916,False,commons,True,False,Babybox venkovni strana,True,jpg,False
2,http://upload.wikimedia.org/wikipedia/commons/...,4,Betania royal portrait,Betania royal portrait,Betania royal portrait,Betania royal portrait,False,en,0.877587,True,commons,True,False,,False,jpg,False
3,http://upload.wikimedia.org/wikipedia/commons/...,5,BlochGlassHarmonica,BlochGlassHarmonica,Bloch Glass Harmonica,Bloch Glass Harmonica,False,en,0.228180,True,commons,True,False,,False,jpg,False
4,http://upload.wikimedia.org/wikipedia/commons/...,4,BolivianChilePowder2,BolivianChilePowder2,Bolivian Chile Powder 2,Bolivian Chile Powder,False,en,0.549355,True,commons,True,False,,False,jpg,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69475,https://upload.wikimedia.org/wikipedia/ja/c/c3...,4,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,The five professors of Tokyo Bible Semiary,False,en,0.291418,True,ja,False,False,,False,jpg,False
69476,https://upload.wikimedia.org/wikipedia/ru/7/7b...,4,Кунцево1940,Кунцево1940,Кунцево 1940,Кунцево,False,ru,0.547369,False,ru,False,True,Kuntsevo 1940,True,jpg,True
69477,https://upload.wikimedia.org/wikipedia/ru/9/95...,4,КомандирыПервойКоннойармии,КомандирыПервойКоннойармии,Командиры Первой Коннойармии,Командиры Первой Коннойармии,False,ru,0.990093,False,ru,False,True,Komandiry Pervoj Konnojarmii,True,jpg,False
69478,https://upload.wikimedia.org/wikipedia/ru/c/cb...,4,Politburo1934,Politburo1934,Politburo 1934,Politburo,False,fr,0.323129,False,ru,False,False,,False,jpg,True


In [11]:
# Columns shown at the end of this cell and written to the matchings CSV.
matching_columns = [
    'image_url', 'page_title', 'caption_reference_description', 'count',
    'spaced_filename', 'spaced_filename_translit', 'title_translit', 'caption_translit',
    'page_title_lang', 'page_title_lang_p', 'page_title_en',
    'caption_lang', 'caption_lang_p', 'caption_en',
    'caption_contains_digit', 'undigit_caption',
    'page_title_contains_digit', 'undigit_page_title',
    'target',
]

# Join the per-image features onto every (title, caption) row that shares the image URL.
train_matchings = pd.merge(ok_images, images2train, on='image_url')
train_matchings['target'] = (
    train_matchings['page_title']
    + ' [SEP] '
    + train_matchings['caption_reference_description']
)

# Transliterated variants of title and caption ('' where transliterate() gave up).
train_matchings['title_translit'] = train_matchings['page_title'].map(transliterate)
train_matchings['caption_translit'] = train_matchings['caption_reference_description'].map(transliterate)

# Predicted language of the page title.
r = train_matchings['page_title'].map(extract_lang)
train_matchings['page_title_lang'], train_matchings['page_title_lang_p'] = zip(*r)
train_matchings['page_title_en'] = train_matchings['page_title_lang'] == 'en'

# Predicted language of the caption; line breaks are turned into spaces before prediction.
r = (
    train_matchings['caption_reference_description']
    .str.replace('\n', ' ', regex=False)
    .map(extract_lang)
)
train_matchings['caption_lang'], train_matchings['caption_lang_p'] = zip(*r)
train_matchings['caption_en'] = train_matchings['caption_lang'] == 'en'

# Digit flag and digit-free variant of the caption.
train_matchings['caption_contains_digit'] = train_matchings['caption_reference_description'].str.contains(r'\d', regex=True)
train_matchings['undigit_caption'] = (
    train_matchings['caption_reference_description']
    .str.replace(r'\d', '', regex=True)
    .replace(r'\s{2,}', ' ', regex=True)
)

# Digit flag and digit-free variant of the page title.
train_matchings['page_title_contains_digit'] = train_matchings['page_title'].str.contains(r'\d', regex=True)
train_matchings['undigit_page_title'] = (
    train_matchings['page_title']
    .str.replace(r'\d', '', regex=True)
    .replace(r'\s{2,}', ' ', regex=True)
)

train_matchings[matching_columns]

Unnamed: 0,image_url,page_title,caption_reference_description,count,spaced_filename,spaced_filename_translit,title_translit,caption_translit,page_title_lang,page_title_lang_p,page_title_en,caption_lang,caption_lang_p,caption_en,caption_contains_digit,undigit_caption,page_title_contains_digit,undigit_page_title,target
0,https://upload.wikimedia.org/wikipedia/commons...,Jürgen Ovens,"Jürgen Ovens's Justitia, 1663-1665, Museumsber...",4,Jurgen ovens justitia,,Juergen Ovens,"Juergen Ovens's Justitia, 1663-1665, Museumsbe...",en,0.124504,True,en,0.894902,True,True,"Jürgen Ovens's Justitia, -, Museumsberg Flensb...",False,Jürgen Ovens,"Jürgen Ovens [SEP] Jürgen Ovens's Justitia, 16..."
1,https://upload.wikimedia.org/wikipedia/commons...,Jürgen Ovens,"Jürgen Ovens: Justitia, \n 1663-1665, Museumsb...",4,Jurgen ovens justitia,,Juergen Ovens,"Juergen Ovens: Justitia, \n 1663-1665, Museums...",en,0.124504,True,en,0.588354,True,True,"Jürgen Ovens: Justitia, -, Museumsberg, Flensborg",False,Jürgen Ovens,"Jürgen Ovens [SEP] Jürgen Ovens: Justitia, \n ..."
2,https://upload.wikimedia.org/wikipedia/commons...,"Овенс, Юрген","Юрген Овенс Юстиция, 1663—1665, Музей Фленсбур...",4,Jurgen ovens justitia,,"Ovens, Jurgen","Jurgen Ovens Justitsija, 1663—1665, Muzej Flen...",ru,0.647271,False,ru,0.949836,False,True,"Юрген Овенс Юстиция, —, Музей Фленсбург. Овенс...",False,"Овенс, Юрген","Овенс, Юрген [SEP] Юрген Овенс Юстиция, 1663—1..."
3,https://upload.wikimedia.org/wikipedia/commons...,Jürgen Ovens,"Justitia aus Schloss Gottorf, um 1665, Museums...",4,Jurgen ovens justitia,,Juergen Ovens,,en,0.124504,True,de,0.544288,False,True,"Justitia aus Schloss Gottorf, um , Museumsberg...",False,Jürgen Ovens,Jürgen Ovens [SEP] Justitia aus Schloss Gottor...
4,https://upload.wikimedia.org/wikipedia/commons...,Andréi Amalrik,"Andréi Amalrik, 1976",4,Andrei Amalrik 1976,,Andrei Amalrik,"Andrei Amalrik, 1976",es,0.742067,False,es,0.742067,False,True,"Andréi Amalrik,",False,Andréi Amalrik,"Andréi Amalrik [SEP] Andréi Amalrik, 1976"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301659,https://upload.wikimedia.org/wikipedia/commons...,National Gallery Prague,Veletržní palác (the Trade Fair Palace) houses...,4,Praha Veletržní palác jih,Praha Veletržni palac jih,,Veletržni palac (the Trade Fair Palace) houses...,en,0.686658,True,en,0.582721,True,False,Veletržní palác (the Trade Fair Palace) houses...,False,National Gallery Prague,National Gallery Prague [SEP] Veletržní palác ...
301660,https://upload.wikimedia.org/wikipedia/commons...,San Diego Convention Center,View of the center from the San Diego Bay,4,San Diego Convention Center,,,,en,0.576476,True,en,0.851519,True,False,View of the center from the San Diego Bay,False,San Diego Convention Center,San Diego Convention Center [SEP] View of the ...
301661,https://upload.wikimedia.org/wikipedia/commons...,San Diego Convention Center,View of the center from the San Diego Bay,4,San Diego Convention Center,,,,en,0.576476,True,en,0.851519,True,False,View of the center from the San Diego Bay,False,San Diego Convention Center,San Diego Convention Center [SEP] View of the ...
301662,https://upload.wikimedia.org/wikipedia/commons...,1996 Republican National Convention,The San Diego Convention Center was the site o...,4,San Diego Convention Center,,,,en,0.748757,True,en,0.938726,True,True,The San Diego Convention Center was the site o...,True,Republican National Convention,1996 Republican National Convention [SEP] The ...


In [12]:
# Write both tables as CSV without the index; the file names encode PART and the MIN,MAX bounds.
train_matchings[matching_columns].to_csv('matchings_part{}_between{},{}.csv'.format(PART, MIN, MAX), index=False)
images2train.to_csv('images_part{}_between{},{}.csv'.format(PART, MIN, MAX), index=False)