In [1]:
!pip install transliterate

Collecting transliterate
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 262 kB/s 
Installing collected packages: transliterate
Successfully installed transliterate-1.10.2


In [2]:
import numpy as np
import pandas as pd

import urllib.parse

from transliterate import translit
import fasttext

In [3]:
# Load the fastText model file used below by `extract_lang` for language prediction.
lid_model_path = '../input/nlp-models/lid.176.ftz'
ft_model = fasttext.load_model(lid_model_path)



In [4]:
# Tab-separated table of test images (columns shown in the output: id, image_url).
test_images_path = '../input/wikipedia-image-caption/test.tsv'
ok_images = pd.read_csv(test_images_path, sep='\t')

ok_images

Unnamed: 0,id,image_url
0,0,https://upload.wikimedia.org/wikipedia/commons...
1,1,https://upload.wikimedia.org/wikipedia/commons...
2,2,https://upload.wikimedia.org/wikipedia/commons...
3,3,https://upload.wikimedia.org/wikipedia/commons...
4,4,https://upload.wikimedia.org/wikipedia/commons...
...,...,...
92361,92361,https://upload.wikimedia.org/wikipedia/commons...
92362,92362,https://upload.wikimedia.org/wikipedia/commons...
92363,92363,https://upload.wikimedia.org/wikipedia/commons...
92364,92364,https://upload.wikimedia.org/wikipedia/commons...


In [5]:
# Each row of the caption list holds "<page title> [SEP] <reference description>".
matchings = pd.read_csv('../input/wikipedia-image-caption/test_caption_list.csv')

# Extract both halves directly instead of splitting on a pattern with capture groups.
# `.` does not match a newline by default, so the description group carries a scoped
# DOTALL flag: a multi-line description is kept whole instead of being cut at its
# first line break. The title group is left single-line, as before.
caption_parts = matchings['caption_title_and_reference_description'].str.extract(
    r'(.+)\[SEP\]((?s:.+))'
)

# Fail with a readable message (rather than an opaque unpacking error) when a row
# does not follow the "<title> [SEP] <description>" layout.
unparsed_rows = caption_parts.isna().any(axis=1)
if unparsed_rows.any():
    raise ValueError(
        'Rows without a "<title> [SEP] <description>" layout: '
        f'{list(matchings.index[unparsed_rows][:10])}'
    )

matchings['page_title'] = caption_parts[0].str.strip()
matchings['caption_reference_description'] = caption_parts[1].str.strip()

matchings

Unnamed: 0,caption_title_and_reference_description,page_title,caption_reference_description
0,Albert Pike [SEP] Albert Pike,Albert Pike,Albert Pike
1,Anna Blount [SEP] Blount and her young daughte...,Anna Blount,"Blount and her young daughter Ruth, in 1911"
2,Río Marañón [SEP] Die Río Marañón in die Huánu...,Río Marañón,"Die Río Marañón in die Huánuco-streek, Peru"
3,Leonel Brizola [SEP] Brizola during his inaugu...,Leonel Brizola,Brizola during his inauguration ceremony as go...
4,Buttisholz [SEP] Buttisholz,Buttisholz,Buttisholz
...,...,...,...
92361,Essen (Haren) [SEP] Essen,Essen (Haren),Essen
92362,Mindszenty József [SEP] Boldog IV. Károly és f...,Mindszenty József,Boldog IV. Károly és felesége Zita királyné
92363,Пещеры Крыма [SEP] Вход в пещеру Суук-Коба (Хо...,Пещеры Крыма,Вход в пещеру Суук-Коба (Холодная)
92364,MOWAG Piranha [SEP] Usuaris del Piranha 5 en b...,MOWAG Piranha,Usuaris del Piranha 5 en blau.


In [6]:
def repair_filename(url):
    """
    Repairs the filename from the specified image url.

    Takes the last path segment of ``url``, drops its final extension, turns
    underscores into spaces and percent-decodes the result.

    :param url: image URL (or path) as a string.
    :return: the human-readable file name without its extension.
    """
    basename = url.split('/')[-1]
    # rsplit removes only the final ".ext"; a name without any dot is kept whole
    # instead of collapsing to an empty string.
    stem = basename.rsplit('.', 1)[0]
    # Underscores are replaced before decoding, so a percent-encoded underscore
    # (%5F) survives as a literal underscore.
    return urllib.parse.unquote(stem.replace('_', ' '))


def equip_df(df):
    """
    Adds filename-derived text columns to ``df`` in place.

    Reads the ``image_url`` column and writes:

    * ``filename`` - output of ``repair_filename`` for each URL;
    * ``pured_filename`` - ``filename`` with every character that is neither a
      word character nor whitespace replaced by a space, whitespace runs collapsed;
    * ``spaced_filename`` - ``pured_filename`` with a space inserted at
      lower/upper-case and letter/digit boundaries (Latin and Cyrillic);
    * ``undigit_filename`` - ``spaced_filename`` without ASCII digits,
      whitespace collapsed and stripped;
    * ``undigit_filename_empty`` - True where ``undigit_filename`` is ``''``.

    :param df: DataFrame with an ``image_url`` column of strings; modified in place.
    :return: None.
    """
    # Each pattern captures two adjacent characters that get separated by a space.
    # They are applied one after another, in this order.
    # NOTE(review): the Cyrillic ranges do not cover "ё"/"Ё" - confirm whether intended.
    boundary_patterns = (
        r'([a-z])([A-Z])',
        r'([а-я])([А-Я])',
        r'([a-z])(\d)',
        r'([а-я])(\d)',
        r'(\d)([a-zA-Z])',
        r'(\d)([а-яА-Я])',
    )

    df['filename'] = df['image_url'].apply(repair_filename)

    # `.str.replace` is used for every step; a plain `.replace` after `.str.replace`
    # is `Series.replace`, which only behaves like a substring substitution
    # because of `regex=True`.
    df['pured_filename'] = (
        df['filename']
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\s{2,}', ' ', regex=True)
    )

    spaced = df['pured_filename']
    for pattern in boundary_patterns:
        spaced = spaced.str.replace(pattern, r'\1 \2', regex=True)
    df['spaced_filename'] = spaced

    df['undigit_filename'] = (
        df['spaced_filename']
        .str.replace(r'[0-9]', '', regex=True)
        .str.replace(r'\s{2,}', ' ', regex=True)
        .str.strip()
    )

    # After strip() a value is either '' or contains a non-space character,
    # so comparing with '' is the whole emptiness test.
    df['undigit_filename_empty'] = df['undigit_filename'].eq('')
    
    
def extract_lang(s: str):
    """
    Predicts the language of ``s`` with the module-level ``ft_model``.

    :param s: text to classify; line breaks are replaced by spaces first because
        fastText's ``predict`` handles one line at a time and raises on ``'\\n'``.
    :return: ``(code, probability)`` - the top predicted label with everything up
        to its last ``'__'`` removed (e.g. ``'en'``), and the score reported for it.
    """
    labels, probabilities = ft_model.predict(s.replace('\n', ' '))
    return labels[0].split('__')[-1], probabilities[0]


def transliterate(s: str):
    """
    Best-effort transliteration of ``s`` via ``translit(s, reversed=True)``.

    :param s: text to transliterate.
    :return: the transliterated text, or ``''`` when ``translit`` raises
        (in the recorded outputs, plain Latin strings end up as ``''``).
    """
    try:
        return translit(s, reversed=True)
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt / SystemExit propagating, so a long `.map(...)` over
        # many rows can still be interrupted.
        return ''

In [7]:
%%time

image_count = ok_images.groupby('image_url').size()

filtered = image_count
images2train = pd.DataFrame({ 'image_url': filtered.index, 'count': filtered.values })

equip_df(images2train)
r = images2train['spaced_filename'].map(extract_lang)
images2train['filename_lang'], images2train['filename_lang_p'] = zip(*r)
images2train['filename_en'] = images2train['filename_lang'] == 'en'

images2train['section'] = images2train['image_url'].str.extract(r'wikipedia/([a-zA-Z]{1,})/')
images2train['commons'] = images2train['section'] == 'commons'
images2train['lang_ok'] = images2train['section'] == images2train['filename_lang']

images2train['spaced_filename_translit'] = images2train['spaced_filename'].map(transliterate)
images2train['spaced_filename_translited'] = images2train['spaced_filename_translit'] != ''

images2train['ext'] = images2train['image_url'].str.rsplit('.', 1).str[-1].str.lower()

images2train['filename_contains_digit'] = images2train['filename'].str.contains(r'\d', regex=True)

CPU times: user 11.9 s, sys: 43 ms, total: 12 s
Wall time: 12 s


In [8]:
# Preview the per-image feature table (pandas truncates the displayed rows).
images2train

Unnamed: 0,image_url,count,filename,pured_filename,spaced_filename,undigit_filename,undigit_filename_empty,filename_lang,filename_lang_p,filename_en,section,commons,lang_ok,spaced_filename_translit,spaced_filename_translited,ext,filename_contains_digit
0,http://upload.wikimedia.org/wikipedia/commons/...,1,"Billingsgate Cartographer; Blome, RichardSurve...",Billingsgate Cartographer Blome RichardSurveyo...,Billingsgate Cartographer Blome Richard Survey...,Billingsgate Cartographer Blome Richard Survey...,False,en,0.413867,True,commons,True,False,,False,jpg,True
1,http://upload.wikimedia.org/wikipedia/commons/...,1,"Fairbanks, fur market (02 cut)",Fairbanks fur market 02 cut,Fairbanks fur market 02 cut,Fairbanks fur market cut,False,en,0.868023,True,commons,True,False,,False,jpg,True
2,http://upload.wikimedia.org/wikipedia/commons/...,1,Giro Media Blenio 1,Giro Media Blenio 1,Giro Media Blenio 1,Giro Media Blenio,False,nl,0.310636,False,commons,True,False,,False,jpg,True
3,http://upload.wikimedia.org/wikipedia/commons/...,2,Guetenbachtor Lainzer Tiergarten,Guetenbachtor Lainzer Tiergarten,Guetenbachtor Lainzer Tiergarten,Guetenbachtor Lainzer Tiergarten,False,nl,0.365448,False,commons,True,False,,False,jpg,False
4,http://upload.wikimedia.org/wikipedia/commons/...,1,Iron Maiden's Eddie,Iron Maiden s Eddie,Iron Maiden s Eddie,Iron Maiden s Eddie,False,en,0.354466,True,commons,True,False,,False,jpg,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44757,https://upload.wikimedia.org/wikipedia/vi/2/2f...,1,Princess kaiulani,Princess kaiulani,Princess kaiulani,Princess kaiulani,False,en,0.396447,True,vi,False,False,,False,jpg,False
44758,https://upload.wikimedia.org/wikipedia/vi/5/52...,1,Thanhongdem1,Thanhongdem1,Thanhongdem 1,Thanhongdem,False,en,0.400808,True,vi,False,False,,False,jpg,True
44759,https://upload.wikimedia.org/wikipedia/vi/8/84...,1,Chùa một cột 1896,Chùa một cột 1896,Chùa một cột 1896,Chùa một cột,False,vi,0.999809,False,vi,False,True,Chua một cột 1896,True,jpg,True
44760,https://upload.wikimedia.org/wikipedia/vi/8/89...,1,Giáo hoàng Gioan Phao lô II chụp cùng HY Giuse...,Giáo hoàng Gioan Phao lô II chụp cùng HY Giuse...,Giáo hoàng Gioan Phao lô II chụp cùng HY Giuse...,Giáo hoàng Gioan Phao lô II chụp cùng HY Giuse...,False,vi,0.975191,False,vi,False,True,Giao hoang Gioan Phao lo II chụp cung HY Giuse...,True,png,False


In [9]:
# Preview the caption table with its parsed page_title / caption_reference_description columns.
matchings

Unnamed: 0,caption_title_and_reference_description,page_title,caption_reference_description
0,Albert Pike [SEP] Albert Pike,Albert Pike,Albert Pike
1,Anna Blount [SEP] Blount and her young daughte...,Anna Blount,"Blount and her young daughter Ruth, in 1911"
2,Río Marañón [SEP] Die Río Marañón in die Huánu...,Río Marañón,"Die Río Marañón in die Huánuco-streek, Peru"
3,Leonel Brizola [SEP] Brizola during his inaugu...,Leonel Brizola,Brizola during his inauguration ceremony as go...
4,Buttisholz [SEP] Buttisholz,Buttisholz,Buttisholz
...,...,...,...
92361,Essen (Haren) [SEP] Essen,Essen (Haren),Essen
92362,Mindszenty József [SEP] Boldog IV. Károly és f...,Mindszenty József,Boldog IV. Károly és felesége Zita királyné
92363,Пещеры Крыма [SEP] Вход в пещеру Суук-Коба (Хо...,Пещеры Крыма,Вход в пещеру Суук-Коба (Холодная)
92364,MOWAG Piranha [SEP] Usuaris del Piranha 5 en b...,MOWAG Piranha,Usuaris del Piranha 5 en blau.


In [10]:
# Columns (and their order) selected for display here and for the CSV export.
matching_columns = [
    'page_title', 'caption_reference_description', 'title_translit', 'caption_translit',
    'page_title_lang', 'page_title_lang_p', 'page_title_en',
    'caption_lang', 'caption_lang_p', 'caption_en',
    'caption_contains_digit', 'undigit_caption',
    'page_title_contains_digit', 'undigit_page_title',
    'target',
]

# Build a new frame instead of aliasing `matchings` and dropping a column in place:
# `matchings` stays untouched and this cell can be re-run on its own.
# The original combined string is kept under the name `target`.
train_matchings = matchings.rename(
    columns={'caption_title_and_reference_description': 'target'}
)

train_matchings['title_translit'] = train_matchings['page_title'].map(transliterate)
train_matchings['caption_translit'] = train_matchings['caption_reference_description'].map(transliterate)

# Predicted language of the page title: (code, probability) per row.
title_lang = train_matchings['page_title'].map(extract_lang)
train_matchings['page_title_lang'], train_matchings['page_title_lang_p'] = zip(*title_lang)
train_matchings['page_title_en'] = train_matchings['page_title_lang'] == 'en'

# Line breaks in captions are replaced by spaces before language prediction.
caption_lang = (
    train_matchings['caption_reference_description']
    .str.replace('\n', ' ', regex=False)
    .map(extract_lang)
)
train_matchings['caption_lang'], train_matchings['caption_lang_p'] = zip(*caption_lang)
train_matchings['caption_en'] = train_matchings['caption_lang'] == 'en'

# Digit features, computed the same way for the caption and for the page title:
# a "contains a digit" flag and the text with digits removed and whitespace collapsed.
for source_column, label in (
    ('caption_reference_description', 'caption'),
    ('page_title', 'page_title'),
):
    text = train_matchings[source_column]
    train_matchings[f'{label}_contains_digit'] = text.str.contains(r'\d', regex=True)
    train_matchings[f'undigit_{label}'] = (
        text
        .str.replace(r'\d', '', regex=True)
        .str.replace(r'\s{2,}', ' ', regex=True)
    )

train_matchings[matching_columns]

Unnamed: 0,page_title,caption_reference_description,title_translit,caption_translit,page_title_lang,page_title_lang_p,page_title_en,caption_lang,caption_lang_p,caption_en,caption_contains_digit,undigit_caption,page_title_contains_digit,undigit_page_title,target
0,Albert Pike,Albert Pike,,,en,0.124504,True,en,0.124504,True,False,Albert Pike,False,Albert Pike,Albert Pike [SEP] Albert Pike
1,Anna Blount,"Blount and her young daughter Ruth, in 1911",,,en,0.739186,True,en,0.971129,True,True,"Blount and her young daughter Ruth, in",False,Anna Blount,Anna Blount [SEP] Blount and her young daughte...
2,Río Marañón,"Die Río Marañón in die Huánuco-streek, Peru",Rio Maranon,"Die Rio Maranon in die Huanuco-streek, Peru",es,0.684134,False,de,0.376005,False,False,"Die Río Marañón in die Huánuco-streek, Peru",False,Río Marañón,Río Marañón [SEP] Die Río Marañón in die Huánu...
3,Leonel Brizola,Brizola during his inauguration ceremony as go...,,,en,0.361589,True,en,0.997266,True,True,Brizola during his inauguration ceremony as go...,False,Leonel Brizola,Leonel Brizola [SEP] Brizola during his inaugu...
4,Buttisholz,Buttisholz,,,en,0.346513,True,en,0.346513,True,False,Buttisholz,False,Buttisholz,Buttisholz [SEP] Buttisholz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92361,Essen (Haren),Essen,,,en,0.290392,True,en,0.124504,True,False,Essen,False,Essen (Haren),Essen (Haren) [SEP] Essen
92362,Mindszenty József,Boldog IV. Károly és felesége Zita királyné,Mindszenty Jozsef,Boldog IV. Karoly es felesege Zita kiralyne,de,0.358293,False,hu,0.987040,False,False,Boldog IV. Károly és felesége Zita királyné,False,Mindszenty József,Mindszenty József [SEP] Boldog IV. Károly és f...
92363,Пещеры Крыма,Вход в пещеру Суук-Коба (Холодная),Peschery Kryma,Vhod v pescheru Suuk-Koba (Holodnaja),ru,0.931901,False,ru,0.973208,False,False,Вход в пещеру Суук-Коба (Холодная),False,Пещеры Крыма,Пещеры Крыма [SEP] Вход в пещеру Суук-Коба (Хо...
92364,MOWAG Piranha,Usuaris del Piranha 5 en blau.,,,pt,0.832627,False,es,0.643328,False,True,Usuaris del Piranha en blau.,False,MOWAG Piranha,MOWAG Piranha [SEP] Usuaris del Piranha 5 en b...


In [11]:
# Write both feature tables to CSV without the index column.
for frame, csv_name in (
    (train_matchings[matching_columns], 'test-matchings.csv'),
    (images2train, 'test-images.csv'),
):
    frame.to_csv(csv_name, index=False)