# Corpora preprocessing

In [65]:
import pandas as pd
from pathlib import Path

corpus_path = '../dataset/'

corpus = pd.read_excel(Path(corpus_path , 'Annotated_Metaphor_Corpus_V2.xlsx'))

In [66]:
corpus

Unnamed: 0,Sentence,Metaphor,Type,Metaphor.1,Type.1,Metaphor.2,Type.2,Metaphor.3,Type.3,Metaphor.4,Type.4
0,"Dissento, però ne prendo atto. ParlaMint-IT_20...",yes,migrant,yes,migrant,yes,migrant,no,none,yes,migrant
1,"ading, I believe that many of us on these Be...",no,none,no,none,no,none,,,no,none
2,alla NATO e all'Europa. E allora ci vuole un...,yes,migrant,yes,migrant,yes,migrant,yes,migrant,no,none
3,"- credono di esserlo più dell'Italia, ma sono...",yes,migrant,yes,migrant,yes,migrant,yes,migrant,yes,migrant
4,"di giovedì e venerdì prossimi, non può conti...",yes,migrant,yes,migrant,yes,migrant,yes,migrant,yes,migrant
...,...,...,...,...,...,...,...,...,...,...,...
599,"tire, signor Presidente del Consiglio, anche ...",no,none,no,none,yes,migrant,no,none,no,none
600,tività della UE». Questo lo vedremo nel quadr...,no,none,no,none,,,no,none,no,none
601,una decisione finale - l'accordo con il Canad...,no,none,no,none,yes,migrant,no,none,no,none
602,whom it was £150. The charge was doubled by t...,no,none,no,none,no,none,no,none,no,none


In [67]:
import numpy as np
corpus = corpus.fillna(np.nan)

### Check for duplicates

In [68]:
len(list(corpus[corpus.Sentence.duplicated(keep=False)].groupby('Sentence')))

0

### Label regularization and aggregation

In [69]:
corpus['Type.1'].unique()

array(['migrant', 'none', 'covid', nan, 'Migrants'], dtype=object)

In [70]:
corpus['Type.1'] = corpus['Type.1'].replace('Migrants', 'migrant')

In [71]:
corpus['Type.1'].unique()

array(['migrant', 'none', 'covid', nan], dtype=object)

In [72]:
print(corpus['Metaphor'].value_counts())
corpus['Metaphor'] = corpus['Metaphor'].replace(' yes', 'yes')
corpus['Metaphor'] = corpus['Metaphor'].replace('  yes', 'yes')
corpus['Metaphor'] = corpus['Metaphor'].replace('yes ', 'yes')
print(corpus['Metaphor.1'].value_counts())
corpus['Metaphor.1'] = corpus['Metaphor.1'].replace('Yes', 'yes')
corpus['Metaphor.1'] = corpus['Metaphor.1'].replace('Yes ', 'yes')
corpus['Metaphor.1'] = corpus['Metaphor.1'].replace('yes ', 'yes')
print(corpus['Metaphor.2'].value_counts())
corpus['Metaphor.2'] = corpus['Metaphor.2'].replace('Yes ', 'yes')
corpus['Metaphor.2'] = corpus['Metaphor.2'].replace('yes ', 'yes')
corpus['Metaphor.2'] = corpus['Metaphor.2'].replace('Yes', 'yes')
print(corpus['Metaphor.3'].value_counts())
corpus['Metaphor.3'] = corpus['Metaphor.3'].replace('yes ', 'yes')
print(corpus['Metaphor.4'].value_counts())
corpus['Metaphor.4'] = corpus['Metaphor.4'].replace('none', 'no')

Metaphor
no       370
yes      185
  yes      1
yes        1
Name: count, dtype: int64
Metaphor.1
no      430
yes     143
Yes      17
yes       4
Yes       4
Name: count, dtype: int64
Metaphor.2
no      378
yes     204
Yes       9
yes       4
Yes       2
Name: count, dtype: int64
Metaphor.3
no      194
yes     130
yes      18
Name: count, dtype: int64
Metaphor.4
no      186
yes     116
none      1
Name: count, dtype: int64


### Annotator consensus

In [73]:
from collections import Counter
def majority_vote(row, columns):
    votes = row[columns].dropna()
    vote_counts = Counter(votes)
    max_votes = max(vote_counts.values())
    labels = [k for k,v in vote_counts.items() if v == max_votes]
    if len(labels) > 1:
        return 'Equal split: ' + ', '.join(labels)
    else:
        return labels[0]

corpus['label'] = corpus.apply(lambda x: majority_vote(x,  ['Type', 'Type.1', 'Type.2', 'Type.3', 'Type.4']), axis=1)

In [74]:
# Calculate the frequency of each class in 'label' column
class_counts = corpus['label'].value_counts()

print(class_counts)
# Calculate the percentage of each class
class_percentages = class_counts / len(corpus['label'].dropna()) * 100

print(class_percentages)

label
none                           408
migrant                        153
Equal split: migrant, none      25
Equal split: none, migrant      13
covid                            2
Equal split: covid, none         2
Equal split: covid, migrant      1
Name: count, dtype: int64
label
none                           67.549669
migrant                        25.331126
Equal split: migrant, none      4.139073
Equal split: none, migrant      2.152318
covid                           0.331126
Equal split: covid, none        0.331126
Equal split: covid, migrant     0.165563
Name: count, dtype: float64


We remove covid, as they are only 2 samples, 5 at best.

In [75]:
corpus['label_binary'] = corpus.apply(lambda x: majority_vote(x,['Metaphor', 'Metaphor.1', 'Metaphor.2', 'Metaphor.3', 'Metaphor.4']), axis=1)

In [76]:
# Calculate the frequency of each class in 'label' column
class_counts = corpus['label_binary'].value_counts()

print(class_counts)
# Calculate the percentage of each class
class_percentages = class_counts / len(corpus['label_binary'].dropna()) * 100

print(class_percentages)

label_binary
no                      407
yes                     156
Equal split: yes, no     27
Equal split: no, yes     14
Name: count, dtype: int64
label_binary
no                      67.384106
yes                     25.827815
Equal split: yes, no     4.470199
Equal split: no, yes     2.317881
Name: count, dtype: float64


In [77]:
corpus_filtered = corpus[corpus['label'].isin(['migrant', 'none'])] #removed covid
corpus_filtered_binary = corpus[corpus['label_binary'].isin(['yes', 'no'])]

In [78]:
corpus_filtered = corpus_filtered[['Sentence', 'label']]
corpus_filtered_binary = corpus_filtered_binary[['Sentence', 'label_binary']]

In [79]:
corpus_filtered

Unnamed: 0,Sentence,label
0,"Dissento, però ne prendo atto. ParlaMint-IT_20...",migrant
1,"ading, I believe that many of us on these Be...",none
2,alla NATO e all'Europa. E allora ci vuole un...,migrant
3,"- credono di esserlo più dell'Italia, ma sono...",migrant
4,"di giovedì e venerdì prossimi, non può conti...",migrant
...,...,...
599,"tire, signor Presidente del Consiglio, anche ...",none
600,tività della UE». Questo lo vedremo nel quadr...,none
601,una decisione finale - l'accordo con il Canad...,none
602,whom it was £150. The charge was doubled by t...,none


In [80]:
corpus_filtered_binary

Unnamed: 0,Sentence,label_binary
0,"Dissento, però ne prendo atto. ParlaMint-IT_20...",yes
1,"ading, I believe that many of us on these Be...",no
2,alla NATO e all'Europa. E allora ci vuole un...,yes
3,"- credono di esserlo più dell'Italia, ma sono...",yes
4,"di giovedì e venerdì prossimi, non può conti...",yes
...,...,...
599,"tire, signor Presidente del Consiglio, anche ...",no
600,tività della UE». Questo lo vedremo nel quadr...,no
601,una decisione finale - l'accordo con il Canad...,no
602,whom it was £150. The charge was doubled by t...,no


### ADDING THE OTHER DATASET FOR BINARY CLASSIFICATION

In [81]:
corpus_22 = pd.read_excel(Path(corpus_path , 'Annotated_Corpus_20_05_2023/Annotated_Corpus_02_10_2022.xlsx'))

In [82]:
corpus_22

Unnamed: 0,Sentence,Metaphor
0,Considerate che il nostro sistema sanitario na...,
1,Allo stesso tempo dobbiamo salvaguardare la ca...,Yes
2,Un ringraziamento sia alle associazioni di cat...,
3,Dobbiamo approfittare per svolgere questa emer...,Yes
4,Informativa mi consente di offrire al parlamen...,Yes
...,...,...
223,There is no parallel between the cases of the ...,yes
224,It seemed at the Battery yesterday as if a ver...,yes
225,Immigration officials at the principal Atlanti...,yes
226,I am quite sure that these conditions could be...,yes


### Regularization

In [83]:
print(corpus_22.Metaphor.value_counts())
corpus_22.Metaphor = corpus_22.Metaphor.replace('Yes', 'yes')
corpus_22.Metaphor = corpus_22.Metaphor.replace('none', 'no')
corpus_22 = corpus_22.rename(columns={'Metaphor': 'label'})

Metaphor
yes     165
none     47
Yes       7
Name: count, dtype: int64


In [84]:
corpus_22.label.value_counts()

label
yes    172
no      47
Name: count, dtype: int64

In [85]:
corpus_filtered_binary = corpus_filtered_binary.rename(columns={'label_binary': 'label'})
corpus_filtered_binary = pd.concat([corpus_filtered_binary, corpus_22], ignore_index=True).dropna()

In [86]:
corpus_filtered_binary.label.value_counts()

label
no     454
yes    328
Name: count, dtype: int64

### Deduplication

In [87]:
corpus_filtered_binary = corpus_filtered_binary.drop_duplicates(keep='first')

In [88]:
corpus_filtered_binary.label.value_counts()

label
no     453
yes    318
Name: count, dtype: int64

In [89]:
any(corpus_filtered_binary.Sentence.duplicated())

False

In [90]:
from sklearn.model_selection import train_test_split

# Define features and target
X_b = corpus_filtered_binary.drop('label', axis=1)
y_b = corpus_filtered_binary['label']

# Split the data into training and temp (validation+test) datasets
X_train_b, X_temp, y_train_b, y_temp = train_test_split(X_b, y_b, test_size=0.2, stratify=y_b)

# Split the temp data into validation and test datasets
X_val_b, X_test_b, y_val_b, y_test_b = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)

In [91]:
X_train_b['labels'] = y_train_b
X_val_b['labels'] = y_val_b
X_test_b['labels'] = y_test_b

In [92]:
X_train_b.labels.value_counts()

labels
no     362
yes    254
Name: count, dtype: int64

In [93]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler
ros = RandomOverSampler()

# Perform oversampling
X_resampled_b, y_resampled_b = ros.fit_resample(X_train_b.drop('labels', axis=1), X_train_b['labels'])

In [94]:
print(y_resampled_b.value_counts())
X_resampled_b['labels'] = y_resampled_b
X_resampled_b['split'] = 'train'
X_val_b['split'] = 'dev'
X_test_b['split'] = 'test'
final_df_binary = pd.concat([X_resampled_b, X_val_b, X_test_b])
final_df_binary.to_csv('binary_oversampling_filtered_ds_remove_discrepancies.csv', index=False)


labels
yes    362
no     362
Name: count, dtype: int64


In [95]:
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the random oversampler
ros = RandomUnderSampler()

# Perform oversampling
X_undersampled_b, y_undersampled_b = ros.fit_resample(X_train_b.drop('labels', axis=1), X_train_b['labels'])

In [96]:
print(y_undersampled_b.value_counts())
X_undersampled_b['labels'] = y_undersampled_b
X_undersampled_b['split'] = 'train'

final_df_binary = pd.concat([X_undersampled_b, X_val_b, X_test_b])
final_df_binary.to_csv('binary_undersampling_filtered_ds_remove_discrepancies.csv', index=False)

labels
no     254
yes    254
Name: count, dtype: int64


### Stratify

In [97]:
from sklearn.model_selection import train_test_split

# Define features and target
X = corpus_filtered.drop('label', axis=1)
y = corpus_filtered['label']

# Split the data into training and temp (validation+test) datasets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y)

# Split the temp data into validation and test datasets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)

In [98]:
X_train['labels'] = y_train
X_val['labels'] = y_val
X_test['labels'] = y_test

In [99]:
X_train

Unnamed: 0,Sentence,labels
223,will reduce the scope for error and better ens...,none
205,"e affrontata con il metro dell'emergenza, che ...",none
295,terms of encouraging employers to employ UK wo...,none
99,"uzionale. Signor Presidente, colleghi, riteni...",migrant
285,"out physical proof of their status, for which ...",none
...,...,...
152,crisis. Recent Home Office statistics show tha...,none
439,ctive offer for migrants. It provides near co...,none
34,d that they are about the immigration skills ...,migrant
202,"and AQA, currently taught to less than 10% of ...",none


### Over/Under sampling

In [104]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler
ros = RandomOverSampler()

# Perform oversampling
X_resampled, y_resampled = ros.fit_resample(X_train.drop('labels', axis=1), X_train['labels'])

In [105]:
from collections import Counter
Counter(y_resampled)

Counter({'none': 326, 'migrant': 326})

In [106]:
X_resampled['labels'] = y_resampled

In [110]:
X_resampled['split'] = 'train'
X_val['split'] = 'dev'
X_test['split'] = 'test'
final_df = pd.concat([X_resampled, X_val, X_test])
final_df.to_csv('oversampling_filtered_ds_remove_discrepancies.csv', index=False)

                                              Sentence   labels  split
448  plomatica all'interno dello scenario libico, e...  migrant  train
449  È, inoltre, in corso una discussione sulla lib...  migrant  train
450  rvice providers, decision-makers and leaders f...  migrant  train
451   esente disegno di legge nell'intento di rende...  migrant  train
452   ble Lord was making a comparison is that they...  migrant  train
..                                                 ...      ...    ...
647  ska il mare che inghiotte villaggi, la tundra ...  migrant  train
648   ricollocamenti. Sappiamo che si parla di qual...  migrant  train
649    completely these dangerous crossings facilit...  migrant  train
650  ndment would add to the burden on businesses, ...  migrant  train
651  é si tratta di un vero e proprio pot-pourri di...  migrant  train

[204 rows x 3 columns]


In [108]:
Counter(X_resampled.labels)

Counter({'none': 326, 'migrant': 326})

In [None]:
X_train['split'] = 'train'
X_val['split'] = 'dev'
X_test['split'] = 'test'

In [None]:
X_val.labels

489       none
799    migrant
344    migrant
116       none
772    migrant
        ...   
388       none
469       none
436       none
56     migrant
216       none
Name: labels, Length: 82, dtype: object

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the random oversampler
ros = RandomUnderSampler()

# Perform oversampling
X_resampled, y_resampled = ros.fit_resample(X_train.drop('labels', axis=1), X_train['labels'])

In [103]:
X_resampled['split'] = 'train'
final_df = pd.concat([X_resampled, X_val, X_test])
final_df.to_csv('undersampling_filtered_ds_remove_discrepancies.csv', index=False)

NameError: name 'final_df' is not defined

In [102]:
final_df.duplicated()

NameError: name 'final_df' is not defined

### Check if there are any sentence that surpases 512 tokens

In [30]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
['danger' for x in X_train.Sentence.apply(lambda x: tokenizer(x, truncation = False)) if len(x['input_ids']) >512]

[]

### Some results

In [21]:
import numpy as np
"New best hyperparameters: {'lr': 3e-05, 'epochs': 2, 'batchsize': 8}"
# Try without 8 like bert model
data = np.array([0.7330779054916986, 0.5515734265734266, 0.6305555555555556, 0.5725, 0.6360153256704981, 0.6524390243902438, 0.7732368896925859, 0.5777777777777778, 0.7799227799227799, 0.6157303370786517])

avg = np.mean(data)
std_dev = np.std(data)

print(f"Average: {avg}")
print(f"Standard Deviation: {std_dev}")


Average: 0.6522829022153218
Standard Deviation: 0.07847368195222722


In [35]:
#random undersampling
#{'lr': 5e-05, 'epochs': 4, 'batchsize': 8}
data = np.array([0.7470558115719407, 0.7544324772162385, 0.7470558115719407, 0.7898989898989899, 0.7410449065175607, 0.7494505494505495, 0.7442622950819673, 0.7636363636363637, 0.8021978021978022, 0.8021978021978022])
avg = np.mean(data)
std_dev = np.std(data)

print(f"Average: {avg}")
print(f"Standard Deviation: {std_dev}")

Average: 0.7641232809341154
Standard Deviation: 0.02320092772172989


In [36]:
#random oversampling
#{'lr': 5e-05, 'epochs': 3, 'batchsize': 16}
data = np.array([0.7442622950819673, 0.752990851513019, 0.7590940288263555, 0.7590940288263555, 0.7350543478260869, 0.7085597826086957, 0.7858613589567605, 0.7173425366695427, 0.8003072196620584, 0.7470558115719407])
avg = np.mean(data)
std_dev = np.std(data)

print(f"Average: {avg}")
print(f"Standard Deviation: {std_dev}")

Average: 0.7509622261542782
Standard Deviation: 0.02651547866575898


In [63]:

# Original data
data_test_xxx = {
    'eval_loss': 0.62,
    'eval_f1': 60.18,
    'eval_accuracy': 63.68,
    'eval_precision': 59.98,
    'eval_recall': 66.18,
    'epoch': 4
}

data_std = {
    'eval_loss': 0.07,
    'eval_f1': 15.8,
    'eval_accuracy': 14.67,
    'eval_precision': 17.93,
    'eval_recall': 11.05,
    'epoch': 0
}

# Convert to DataFrame
df_test_xxx = pd.DataFrame.from_dict(data_test_xxx, orient='index', columns=['Test_eval'])
df_std = pd.DataFrame.from_dict(data_std, orient='index', columns=['std'])

# Multiply std by 100 and round to 2 decimal places
df_std['std'] = df_std['std'].multiply(100).round(2)

# Join the two dataframes
df = df_test_xxx.join(df_std)

print(df)

                Test_eval     std
eval_loss            0.62     7.0
eval_f1             60.18  1580.0
eval_accuracy       63.68  1467.0
eval_precision      59.98  1793.0
eval_recall         66.18  1105.0
epoch                4.00     0.0


In [64]:
df

Unnamed: 0,Test_eval,std
eval_loss,0.62,7.0
eval_f1,60.18,1580.0
eval_accuracy,63.68,1467.0
eval_precision,59.98,1793.0
eval_recall,66.18,1105.0
epoch,4.0,0.0
