# Read text data

In [None]:
import pandas as pd

In [None]:
# Preprocessed training split: tab-separated, with the first column used as
# the row index.
train_df = pd.read_csv(
    '/content/drive/MyDrive/UR/NLE Seminar/code/data/preprocessed/train_df.csv',
    index_col=0,
    sep='\t',
)
train_df.head()

Unnamed: 0,target_word,phrase,target_preprocessed,context_preprocessed,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,non_eng
0,moorhen,moorhen swamphen,moorhen,swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,0
1,serinus,serinus genus,serinus,genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,0
2,pegmatite,pegmatite igneous,pegmatite,igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,0
3,bangalores,bangalores torpedo,bangalores,torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,0
4,bonxie,bonxie skua,bonxie,skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,0


In [None]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

(12869, 15)

How long are the phrases? ***Two words, except for one case.***

In [None]:
# Number of whitespace-separated tokens in every phrase, followed by the
# average over all rows.
train_df['phrase_length'] = [len(phrase.split()) for phrase in train_df['phrase']]
train_df['phrase_length'].mean()

2.000077706115471

In [None]:
# Shortest and longest phrase, in tokens.
phrase_lengths = train_df['phrase_length']
(phrase_lengths.min(), phrase_lengths.max())

(2, 3)

In [None]:
# Show the rows whose phrase consists of exactly three tokens.
train_df.loc[train_df['phrase_length'].eq(3)]

Unnamed: 0,target_word,phrase,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,img_truth,phrase_length
3829,areca-nut palm,areca-nut palm areca,image.3.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.6372.jpg,image.5869.jpg,image.5.jpg,image.5634.jpg,image.1501.jpg,image.10807.jpg,image.10807.jpg,3


# Wikipedia summaries

To enrich the text data, let's add a description to each phrase. As descriptions, we will use summary snippets from Wikipedia articles.

## Algorithm

* search for the target phrase with `wikipedia.search`
* set the first result as `wiki_title`
* get the summary of the article with this title using `wikipedia.summary`

* if there are no search results or the summary request fails, do the same for the context phrase instead of the target
* save the source of the wiki title (target / context)


In [None]:
def get_wiki_title(s):
  """Return the title of the first Wikipedia search hit for ``s``.

  Relies on the ``wikipedia`` package being available in the notebook
  namespace.

  Args:
    s: Search query. Missing values (``None`` / float NaN, which is what
      pandas passes for empty cells) and blank strings are treated as
      "nothing to search for".

  Returns:
    The first title returned by ``wikipedia.search``, or ``None`` when the
    query is missing/blank or the search yields no results.
  """
  # NaN is the only float that is not equal to itself; without this guard a
  # missing cell would be sent to the search API instead of being skipped.
  if s is None or (isinstance(s, float) and s != s):
    return None
  if isinstance(s, str) and not s.strip():
    return None

  query_results = wikipedia.search(s)
  if query_results:
    return query_results[0]
  return None

In [None]:
def get_wiki_summary(s):
  """Return the Wikipedia summary for the article titled ``s``.

  Relies on the ``wikipedia`` package being available in the notebook
  namespace.

  Args:
    s: Article title passed to ``wikipedia.summary``.

  Returns:
    The summary text, or ``None`` if the request raised any ordinary
    exception. The lookup is deliberately best-effort.
  """
  try:
    return wikipedia.summary(s)
  # `Exception` rather than a bare `except`, so that KeyboardInterrupt and
  # SystemExit still propagate and a long-running apply can be stopped.
  except Exception:
    return None

In [None]:
import numpy as np

In [None]:
def get_summary_col(row, target_col='target_preprocessed',
                    fallback_col='context_preprocessed',
                    fallback_label='context'):
  """Find a Wikipedia summary for one row, trying the target first.

  The value in ``target_col`` is looked up first; if no title is found or its
  summary cannot be fetched, the value in ``fallback_col`` is tried instead.

  Args:
    row: Mapping-like row (e.g. a pandas Series) indexable by column name.
    target_col: Column holding the primary search query.
    fallback_col: Column holding the query used when the target lookup fails.
    fallback_label: Source label reported when the fallback query succeeds.

  Returns:
    A 3-tuple ``(source, wiki_title, summary)`` where ``source`` is
    ``'target'`` or ``fallback_label``; ``(nan, nan, nan)`` if neither query
    produced a summary.
  """
  # Read both cells up front so a missing column fails before any request.
  candidates = (
      ('target', row[target_col]),
      (fallback_label, row[fallback_col]),
  )

  for source, query in candidates:
    wiki_title = get_wiki_title(query)
    if wiki_title is None:
      continue
    summary = get_wiki_summary(wiki_title)
    if summary is not None:
      return source, wiki_title, summary

  return np.nan, np.nan, np.nan

## Get summaries for training data

In [None]:
# `progress_apply` only exists once tqdm has patched pandas. Register it here
# so this cell also works on a fresh kernel (Restart & Run All).
from tqdm import tqdm
tqdm.pandas()

# One Wikipedia lookup per row; every cell of `wiki_summary` holds the 3-tuple
# returned by `get_summary_col`. The recorded run took roughly three hours.
train_df['wiki_summary'] = train_df.progress_apply(get_summary_col, axis=1)

100%|██████████| 12869/12869 [2:57:26<00:00,  1.21it/s]


Separate columns

In [None]:
# Unpack the (source, title, summary) tuples into one column per component.
for position, column in enumerate(('summary_source', 'wiki_title', 'summary_content')):
    train_df[column] = train_df['wiki_summary'].map(
        lambda triple, i=position: triple[i]
    )

In [None]:
# The packed tuple column is no longer needed once it has been split.
train_df = train_df.drop(columns='wiki_summary')

In [None]:
# Preview the frame with the three new summary columns.
train_df.head()

Unnamed: 0,target_word,phrase,target_preprocessed,context_preprocessed,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,non_eng,summary_source,wiki_title,summary_content
0,moorhen,moorhen swamphen,moorhen,swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,0,target,Moorhen,Moorhens—sometimes called marsh hens—are mediu...
1,serinus,serinus genus,serinus,genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,0,target,Serinus,Serinus is a genus of small birds in the finch...
2,pegmatite,pegmatite igneous,pegmatite,igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,0,target,Pegmatite,A pegmatite is an igneous rock showing a very ...
3,bangalores,bangalores torpedo,bangalores,torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,0,target,Bangalore torpedo,A Bangalore torpedo is an explosive charge pla...
4,bonxie,bonxie skua,bonxie,skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,0,target,Great skua,"The great skua (Stercorarius skua), sometimes ..."


1254 cases require manual preprocessing
* wiki article does not exist
* or phrase needs disambiguation

In [None]:
# Rows per summary source; dropna=False keeps NaN (no summary found) as its
# own bucket.
train_df['summary_source'].value_counts(dropna=False)

target     9935
context    1680
NaN        1254
Name: summary_source, dtype: int64

## Assign tasks for manual processing

In [None]:
# Reload the frame from a tab-separated checkpoint.
# NOTE(review): `train_df_8.csv` is not written by any cell visible in this
# notebook. Confirm where it comes from.
train_df = pd.read_csv(
    'train_df_8.csv',
    index_col=0,
    sep='\t',
)
train_df.head()

Unnamed: 0,target_word,phrase,target_preprocessed,context_preprocessed,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,non_eng,summary_source,wiki_title,summary_content
0,moorhen,moorhen swamphen,moorhen,swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,0,target,Moorhen,Moorhens—sometimes called marsh hens—are mediu...
1,serinus,serinus genus,serinus,genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,0,target,Serinus,Serinus is a genus of small birds in the finch...
2,pegmatite,pegmatite igneous,pegmatite,igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,0,target,Pegmatite,A pegmatite is an igneous rock showing a very ...
3,bangalores,bangalores torpedo,bangalores,torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,0,target,Bangalore torpedo,A Bangalore torpedo is an explosive charge pla...
4,bonxie,bonxie skua,bonxie,skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,0,target,Great skua,"The great skua (Stercorarius skua), sometimes ..."


In [None]:
# Split the rows that still lack a summary between two annotators: the first
# half (in row order) goes to the first assignee, the rest to the second.
# Rows that already have a source keep it.
first_assignee, second_assignee = 'Andreas', 'Kyuhee'

needs_manual = train_df['summary_source'].isna()
# Derive the split point from the data instead of hard-coding the row count.
first_share = int(needs_manual.sum()) // 2
# 1-based running rank of each missing row among all missing rows.
manual_rank = needs_manual.cumsum()

# Purely positional/vectorised, so it does not depend on the index being a
# default 0..n-1 range.
train_df['summary_source_tasks'] = train_df['summary_source'].where(
    ~needs_manual,
    np.where(manual_rank <= first_share, first_assignee, second_assignee),
)
train_df['summary_source_tasks'].value_counts()

target     9935
context    1680
Andreas     627
Kyuhee      627
Name: summary_source_tasks, dtype: int64

In [None]:
# Save the frame, including the task-assignment column, as a tab-separated file.
train_df.to_csv('train_df_9.csv', sep='\t')

## Read table with manually added summaries

In [None]:
# Load the manually completed table and rename its two target columns:
# `target_word` becomes `target_original`, `target_word.1` becomes `target_word`.
column_renames = {
    'target_word': 'target_original',
    'target_word.1': 'target_word',
}
train_df = (
    pd.read_csv('/content/Wikipedia summaries - DataFrame.csv')
    .rename(columns=column_renames)
)
train_df.head()

Unnamed: 0,target_original,phrase,target_word,context_word,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,non_eng,summary_source,summary_phrase,summary_content,summary_link,notes
0,moorhen,moorhen swamphen,moorhen,swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,0,target,Moorhen,Moorhens—sometimes called marsh hens—are mediu...,,
1,serinus,serinus genus,serinus,genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,0,target,Serinus,Serinus is a genus of small birds in the finch...,,
2,pegmatite,pegmatite igneous,pegmatite,igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,0,target,Pegmatite,A pegmatite is an igneous rock showing a very ...,,
3,bangalores,bangalores torpedo,bangalores,torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,0,target,Bangalore torpedo,A Bangalore torpedo is an explosive charge pla...,,
4,bonxie,bonxie skua,bonxie,skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,0,target,Great skua,"The great skua (Stercorarius skua), sometimes ...",,


In [None]:
# Dimensions of the manually completed table as (rows, columns).
train_df.shape

(12869, 20)

In [None]:
# Number of rows that still have no summary text.
train_df['summary_content'].isna().sum()

976

In [None]:
# Break the rows without summary text down by their recorded source/assignee.
train_df.loc[train_df['summary_content'].isna(), 'summary_source'].value_counts()

Kyuhee     485
Andreas    472
target      13
context      6
Name: summary_source, dtype: int64

# Get summaries for test data

## English

In [None]:
# English test split: tab-separated, with the first column used as the row index.
df = pd.read_csv(
    '/content/drive/MyDrive/UR/NLE Seminar/code/data/test.data/preprocessed/test_en.csv',
    index_col=0,
    sep='\t',
)
df.head()

Unnamed: 0,target_word,phrase,target_preprocessed,context_preprocessed,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,non_eng
0,goal,football goal,football,goal,image.4418.jpg,image.4416.jpg,image.4417.jpg,image.4413.jpg,image.4412.jpg,image.4415.jpg,image.4419.jpg,image.4414.jpg,image.2166.jpg,image.1150.jpg,0
1,mustard,mustard seed,mustard,seed,image.4429.png,image.4422.jpg,image.4423.jpg,image.4424.jpg,image.4421.jpg,image.4427.jpg,image.4426.jpg,image.4420.jpg,image.4425.jpg,image.4428.jpg,0
2,seat,eating seat,eating,seat,image.4435.jpg,image.4436.jpg,image.1166.jpg,image.4430.jpg,image.4433.jpg,image.4432.jpg,image.4438.jpg,image.4434.jpg,image.4431.jpg,image.4437.jpg,0
3,navigate,navigate the web,navigate,the web,image.4439.jpg,image.4440.jpg,image.4441.jpg,image.4442.jpg,image.4444.jpg,image.4445.jpg,image.1435.jpg,image.4446.png,image.1434.jpg,image.4443.jpg,0
4,butterball,butterball person,butterball,person,image.4454.jpg,image.4450.jpg,image.4455.jpg,image.4453.jpg,image.4448.jpg,image.1253.jpg,image.4451.jpg,image.4452.jpg,image.4447.jpg,image.4449.jpg,0


In [None]:
from tqdm import tqdm
# Registers the `progress_apply` family of methods on pandas objects.
# NOTE(review): this must have run before any cell that calls `progress_apply`.
tqdm.pandas()

In [None]:
# Look up a Wikipedia summary for every English test row; each cell of
# `wiki_summary` holds the 3-tuple returned by `get_summary_col`.
df['wiki_summary'] = df.progress_apply(get_summary_col, axis=1)

100%|██████████| 463/463 [01:51<00:00,  4.16it/s]


In [None]:
# Manual corrections: rows 40 and 367 are reduced to a single word and their
# context is cleared.
# NOTE(review): in notebook order this runs after the Wikipedia lookup, so the
# stored summaries for these rows were fetched with the previous values.
# Confirm whether the lookup should be re-run for them.
for row_label, word in ((40, 'river'), (367, 'bread')):
    for column in ('target_word', 'target_preprocessed', 'phrase'):
        df.at[row_label, column] = word
    df.at[row_label, 'context_preprocessed'] = ''

In [None]:
# Unpack the (source, title, summary) tuples into separate columns, then
# discard the packed column.
summary_triples = df['wiki_summary']
df['summary_source'] = summary_triples.map(lambda triple: triple[0])
df['wiki_title'] = summary_triples.map(lambda triple: triple[1])
df['summary_content'] = summary_triples.map(lambda triple: triple[2])
df = df.drop(columns='wiki_summary')

In [None]:
# Number of English test rows for which no summary was found.
df['summary_content'].isna().sum()

89

In [None]:
# Save the English test rows with their summaries.
# NOTE(review): written comma-separated and without the index, whereas the
# input was tab-separated and the Italian/Farsi outputs keep the index.
# Confirm that downstream readers expect this.
df.to_csv('test_en_with_summaries.csv', index=False)

## Italian

In [None]:
# Italian test split with English translations (`target_eng`, `phrase_eng`);
# the first column is used as the row index.
df = pd.read_csv(
    '/content/drive/MyDrive/UR/NLE Seminar/code/data/test.data/preprocessed/test_it_translated.csv',
    index_col=0,
)
df.head()

Unnamed: 0,target_word,phrase,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,target_eng,phrase_eng
0,gomma,gomma per smacchiare,image.3.jpg,image.7.jpg,image.8.jpg,image.1.jpg,image.9.jpg,image.5.jpg,image.6.jpg,image.4.jpg,image.0.jpg,image.2.jpg,eraser,rubber to remove
1,asino,asino gioco di carte,image.18.jpg,image.17.jpg,image.11.jpg,image.10.jpg,image.15.jpg,image.16.png,image.14.jpg,image.12.jpg,image.13.jpg,image.19.jpg,donkey,donkey card game
2,colonna,colonna missione,image.22.jpg,image.24.jpg,image.28.jpg,image.27.jpg,image.25.jpg,image.26.jpg,image.20.jpg,image.21.jpg,image.23.jpg,image.29.jpg,colonna,mission column
3,box,box per infanti,image.33.jpg,image.32.jpg,image.37.jpg,image.38.jpg,image.35.jpg,image.34.jpg,image.31.jpg,image.39.jpg,image.30.jpg,image.36.jpg,box,Box for infants
4,bolla,bolla di misurazione,image.48.jpg,image.44.jpg,image.41.jpg,image.46.jpg,image.47.jpg,image.42.jpg,image.40.jpg,image.45.jpg,image.43.PNG,image.49.jpg,bubble,measurement bubble


In [None]:
def get_summary_col(row):
  """Find a Wikipedia summary for a translated row.

  Tries the translated target word (``target_eng``) first and falls back to
  the translated full phrase (``phrase_eng``).

  Returns:
    ``(source, wiki_title, summary)`` with ``source`` being ``'target'`` or
    ``'phrase'``, or ``(nan, nan, nan)`` when neither lookup succeeds.
  """
  lookups = [('target', row['target_eng']), ('phrase', row['phrase_eng'])]

  for label, query in lookups:
    title = get_wiki_title(query)
    # Only request a summary when the search produced a title.
    text = get_wiki_summary(title) if title is not None else None
    if text is not None:
      return label, title, text

  return np.nan, np.nan, np.nan

In [None]:
# Look up a Wikipedia summary for every Italian test row; each cell of
# `wiki_summary` holds the 3-tuple returned by `get_summary_col`.
df['wiki_summary'] = df.progress_apply(get_summary_col, axis=1)

In [None]:
# Position of each component inside the (source, title, summary) tuple.
tuple_positions = {'summary_source': 0, 'wiki_title': 1, 'summary_content': 2}
for column, position in tuple_positions.items():
    df[column] = df['wiki_summary'].map(lambda triple, i=position: triple[i])
# The packed column is redundant after unpacking.
df = df.drop(columns='wiki_summary')

In [None]:
# Number of Italian test rows for which no summary was found.
df['summary_content'].isna().sum()

34

In [None]:
# Rows whose summary came from the full-phrase fallback.
df['summary_source'].eq('phrase').sum()

91

In [None]:
# Save the Italian test rows with their summaries (comma-separated, index kept).
df.to_csv('test_it_with_summaries.csv')

## Farsi

In [None]:
# Farsi test split with English translations (`target_eng`, `phrase_eng`);
# the first column is used as the row index.
df = pd.read_csv(
    '/content/drive/MyDrive/UR/NLE Seminar/code/data/test.data/preprocessed/test_fa_translated.csv',
    index_col=0,
)
df.head()

Unnamed: 0,target_word,phrase,img_0,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,target_eng,phrase_eng
0,برنج‎,فلز برنج,image.2732.jpg,image.2734.jpg,image.2727.jpg,image.2731.jpg,image.2735.jpg,image.2726.jpg,image.2729.jpg,image.2733.jpg,image.2730.jpg,image.2728.jpg,Rice,brass
1,ملخ,ملخ بادی,image.2743.jpg,image.2741.jpg,image.2740.jpg,image.921.jpg,image.2736.jpg,image.2744.jpg,image.2737.png,image.2742.jpg,image.2738.jpg,image.2739.jpg,Grasshopper,Wind locust
2,شام,سرزمین شام,image.2747.jpg,image.2749.jpg,image.2745.jpg,image.2753.jpg,image.2748.jpg,image.2754.jpg,image.2751.png,image.2746.jpg,image.2750.jpg,image.2752.jpg,the evening,The land of the evening
3,عدسی,عدسی نور,image.2761.jpg,image.2300.jpg,image.2758.jpg,image.2762.jpg,image.2760.jpg,image.2756.jpg,image.2759.jpg,image.2755.jpg,image.2757.jpg,image.95.jpg,Lenses,Light lens
4,توپ,توپ نظامی,image.2769.jpg,image.2766.jpg,image.2765.jpg,image.2763.jpg,image.2764.jpg,image.2770.jpg,image.2772.jpg,image.2767.jpg,image.2768.jpg,image.2771.png,Ball,Military ball


In [None]:
def get_summary_col(row):
  """Return ``(source, wiki_title, summary)`` for a translated row.

  The translated target (``target_eng``) is tried before the translated
  phrase (``phrase_eng``); ``source`` records which of the two succeeded
  (``'target'`` / ``'phrase'``). All three values are NaN when both fail.
  """
  def lookup(query):
    # Resolve a query to (title, summary), or None if either step fails.
    title = get_wiki_title(query)
    if title is None:
      return None
    text = get_wiki_summary(title)
    return None if text is None else (title, text)

  target_query, phrase_query = row['target_eng'], row['phrase_eng']

  found = lookup(target_query)
  if found is not None:
    return ('target',) + found

  found = lookup(phrase_query)
  if found is not None:
    return ('phrase',) + found

  return np.nan, np.nan, np.nan

In [None]:
# Look up a Wikipedia summary for every Farsi test row; each cell of
# `wiki_summary` holds the 3-tuple returned by `get_summary_col`.
df['wiki_summary'] = df.progress_apply(get_summary_col, axis=1)

100%|██████████| 200/200 [03:23<00:00,  1.02s/it]


In [None]:
# Spread the (source, title, summary) tuples over three columns and drop the
# packed column afterwards.
def _component(position):
    return lambda triple: triple[position]

packed = df['wiki_summary']
df['summary_source'] = packed.apply(_component(0))
df['wiki_title'] = packed.apply(_component(1))
df['summary_content'] = packed.apply(_component(2))
df = df.drop(columns=['wiki_summary'])

In [None]:
# Number of Farsi test rows for which no summary was found.
df['summary_content'].isna().sum()

40

In [None]:
# Rows whose summary came from the full-phrase fallback.
df['summary_source'].eq('phrase').sum()

71

In [None]:
# Save the Farsi test rows with their summaries (comma-separated, index kept).
df.to_csv('test_fa_with_summaries.csv')