# Corpora preprocessing

In [1]:
import pandas as pd
from pathlib import Path

# Folder that holds the annotated spreadsheets, relative to this notebook.
corpus_path = '../dataset/'

# Main corpus: one sentence per row, followed by a Metaphor/Type column pair per annotator.
main_corpus_file = Path(corpus_path) / 'Annotated_Metaphor_Corpus_V2.xlsx'
corpus = pd.read_excel(main_corpus_file)

In [2]:
corpus

Unnamed: 0,Sentence,Metaphor,Type,Metaphor.1,Type.1,Metaphor.2,Type.2,Metaphor.3,Type.3,Metaphor.4,Type.4
0,"Dissento, però ne prendo atto. ParlaMint-IT_20...",yes,migrant,yes,migrant,yes,migrant,no,none,yes,migrant
1,"ading, I believe that many of us on these Be...",no,none,no,none,no,none,,,no,none
2,alla NATO e all'Europa. E allora ci vuole un...,yes,migrant,yes,migrant,yes,migrant,yes,migrant,no,none
3,"- credono di esserlo più dell'Italia, ma sono...",yes,migrant,yes,migrant,yes,migrant,yes,migrant,yes,migrant
4,"di giovedì e venerdì prossimi, non può conti...",yes,migrant,yes,migrant,yes,migrant,yes,migrant,yes,migrant
...,...,...,...,...,...,...,...,...,...,...,...
599,"tire, signor Presidente del Consiglio, anche ...",no,none,no,none,yes,migrant,no,none,no,none
600,tività della UE». Questo lo vedremo nel quadr...,no,none,no,none,,,no,none,no,none
601,una decisione finale - l'accordo con il Canad...,no,none,no,none,yes,migrant,no,none,no,none
602,whom it was £150. The charge was doubled by t...,no,none,no,none,no,none,no,none,no,none


In [3]:
import numpy as np
# Fill missing annotations with NaN.
# NOTE(review): filling missing values with NaN looks like a no-op unless the
# sheet yields other missing-value placeholders (e.g. None) - confirm whether
# this cell is still needed.
corpus = corpus.fillna(np.nan)

### Check for duplicates

In [4]:
# Rows whose sentence text occurs more than once (every copy is kept by keep=False).
repeated_rows = corpus[corpus['Sentence'].duplicated(keep=False)]

# Number of distinct sentences that are repeated; 0 means the corpus has no duplicates.
repeated_rows.groupby('Sentence').ngroups

0

### Label regularization and aggregation

In [5]:
corpus['Type.1'].unique()

array(['migrant', 'none', 'covid', nan, 'Migrants'], dtype=object)

In [6]:
# The irregular spelling was spotted in 'Type.1', but all five annotator
# columns feed the majority vote below, so apply the same fix to each of them
# rather than relying on the other columns happening to be clean.
type_columns = ['Type', 'Type.1', 'Type.2', 'Type.3', 'Type.4']
for type_column in type_columns:
    corpus[type_column] = corpus[type_column].replace('Migrants', 'migrant')

In [7]:
corpus['Type.1'].unique()

array(['migrant', 'none', 'covid', nan], dtype=object)

In [8]:
# Binary annotation columns, one per annotator.
metaphor_columns = ['Metaphor', 'Metaphor.1', 'Metaphor.2', 'Metaphor.3', 'Metaphor.4']

for metaphor_column in metaphor_columns:
    # Print the raw spellings first so the irregular variants stay visible in the output.
    print(corpus[metaphor_column].value_counts())

    # Trim surrounding whitespace and lower-case every string vote, so variants
    # such as ' yes', 'Yes ' or 'yes ' all collapse to 'yes' without having to
    # enumerate them per column. Non-string cells (missing votes) pass through.
    # 'none' is the topic-column spelling of a negative vote; map it to 'no'.
    corpus[metaphor_column] = (
        corpus[metaphor_column]
        .map(lambda vote: vote.strip().lower() if isinstance(vote, str) else vote)
        .replace('none', 'no')
    )

Metaphor
no       370
yes      185
  yes      1
yes        1
Name: count, dtype: int64
Metaphor.1
no      430
yes     143
Yes      17
yes       4
Yes       4
Name: count, dtype: int64
Metaphor.2
no      378
yes     204
Yes       9
yes       4
Yes       2
Name: count, dtype: int64
Metaphor.3
no      194
yes     130
yes      18
Name: count, dtype: int64
Metaphor.4
no      186
yes     116
none      1
Name: count, dtype: int64


### Annotator consensus

In [9]:
from collections import Counter

def majority_vote(row, columns):
    """Return the label chosen by most annotators for one corpus row.

    Parameters
    ----------
    row : pandas.Series
        One row of the annotation table.
    columns : list of str
        Names of the annotator columns whose votes are counted.

    Returns
    -------
    str or float
        The single most frequent label. When several labels share the top
        count, the string ``'Equal split: a, b'`` listing the tied labels in
        order of first appearance among the votes. NaN when none of the
        columns holds a vote for this row.
    """
    # Missing votes (annotator skipped the sentence) do not count.
    votes = row[columns].dropna()

    # Without any vote, max() below would raise ValueError on an empty
    # sequence; report the row as unlabelled instead.
    if votes.empty:
        return float('nan')

    vote_counts = Counter(votes)
    max_votes = max(vote_counts.values())

    # Every label that reached the top count; more than one means a tie.
    labels = [label for label, count in vote_counts.items() if count == max_votes]

    if len(labels) > 1:
        return 'Equal split: ' + ', '.join(labels)

    return labels[0]

# Topic annotation columns, one per annotator.
topic_vote_columns = ['Type', 'Type.1', 'Type.2', 'Type.3', 'Type.4']

# Row-wise consensus over the topic annotations; the column list is handed to
# majority_vote as its second positional argument.
corpus['label'] = corpus.apply(majority_vote, axis=1, args=(topic_vote_columns,))

In [10]:
# Number of rows per consensus value in the 'label' column.
class_counts = corpus['label'].value_counts()
print(class_counts)

# Share of each consensus value, in percent of the rows that have a label.
labelled_rows = corpus['label'].notna().sum()
class_percentages = class_counts / labelled_rows * 100
print(class_percentages)

label
none                           408
migrant                        153
Equal split: migrant, none      25
Equal split: none, migrant      13
covid                            2
Equal split: covid, none         2
Equal split: covid, migrant      1
Name: count, dtype: int64
label
none                           67.549669
migrant                        25.331126
Equal split: migrant, none      4.139073
Equal split: none, migrant      2.152318
covid                           0.331126
Equal split: covid, none        0.331126
Equal split: covid, migrant     0.165563
Name: count, dtype: float64


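We drop the `covid` class: only 2 sentences have it as their majority label, and at most 5 if the tied rows (`Equal split`) that involve `covid` are counted as well.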

In [11]:
# Binary (metaphor yes/no) annotation columns, one per annotator.
metaphor_vote_columns = ['Metaphor', 'Metaphor.1', 'Metaphor.2', 'Metaphor.3', 'Metaphor.4']

# Row-wise consensus over the yes/no annotations; the column list is handed to
# majority_vote as its second positional argument.
corpus['label_binary'] = corpus.apply(majority_vote, axis=1, args=(metaphor_vote_columns,))

In [12]:
# Number of rows per consensus value in the 'label_binary' column.
class_counts = corpus['label_binary'].value_counts()
print(class_counts)

# Share of each consensus value, in percent of the rows that have a binary label.
labelled_rows = corpus['label_binary'].notna().sum()
class_percentages = class_counts / labelled_rows * 100
print(class_percentages)

label_binary
no                      407
yes                     156
Equal split: yes, no     27
Equal split: no, yes     14
Name: count, dtype: int64
label_binary
no                      67.384106
yes                     25.827815
Equal split: yes, no     4.470199
Equal split: no, yes     2.317881
Name: count, dtype: float64


In [13]:
# Keep only rows with a clear consensus: ties ('Equal split: ...') are dropped
# from both views, and the covid class is dropped from the multi-class one.
has_topic_consensus = corpus['label'].isin(['migrant', 'none'])  # removed covid
has_binary_consensus = corpus['label_binary'].isin(['yes', 'no'])

corpus_filtered = corpus.loc[has_topic_consensus]
corpus_filtered_binary = corpus.loc[has_binary_consensus]

In [14]:
# Only the text and its consensus label are needed from here on; the
# per-annotator columns are left behind.
corpus_filtered = corpus_filtered.loc[:, ['Sentence', 'label']]
corpus_filtered_binary = corpus_filtered_binary.loc[:, ['Sentence', 'label_binary']]

In [15]:
corpus_filtered

Unnamed: 0,Sentence,label
0,"Dissento, però ne prendo atto. ParlaMint-IT_20...",migrant
1,"ading, I believe that many of us on these Be...",none
2,alla NATO e all'Europa. E allora ci vuole un...,migrant
3,"- credono di esserlo più dell'Italia, ma sono...",migrant
4,"di giovedì e venerdì prossimi, non può conti...",migrant
...,...,...
599,"tire, signor Presidente del Consiglio, anche ...",none
600,tività della UE». Questo lo vedremo nel quadr...,none
601,una decisione finale - l'accordo con il Canad...,none
602,whom it was £150. The charge was doubled by t...,none


In [16]:
corpus_filtered_binary

Unnamed: 0,Sentence,label_binary
0,"Dissento, però ne prendo atto. ParlaMint-IT_20...",yes
1,"ading, I believe that many of us on these Be...",no
2,alla NATO e all'Europa. E allora ci vuole un...,yes
3,"- credono di esserlo più dell'Italia, ma sono...",yes
4,"di giovedì e venerdì prossimi, non può conti...",yes
...,...,...
599,"tire, signor Presidente del Consiglio, anche ...",no
600,tività della UE». Questo lo vedremo nel quadr...,no
601,una decisione finale - l'accordo con il Canad...,no
602,whom it was £150. The charge was doubled by t...,no


### ADDING THE OTHER DATASET FOR BINARY CLASSIFICATION

In [17]:
# Second annotated corpus; it is merged into the binary dataset further down.
second_corpus_file = Path(corpus_path) / 'Annotated_Corpus_20_05_2023/Annotated_Corpus_02_10_2022.xlsx'
corpus_22 = pd.read_excel(second_corpus_file)

In [18]:
corpus_22

Unnamed: 0,Sentence,Metaphor
0,Considerate che il nostro sistema sanitario na...,
1,Allo stesso tempo dobbiamo salvaguardare la ca...,Yes
2,Un ringraziamento sia alle associazioni di cat...,
3,Dobbiamo approfittare per svolgere questa emer...,Yes
4,Informativa mi consente di offrire al parlamen...,Yes
...,...,...
223,There is no parallel between the cases of the ...,yes
224,It seemed at the Battery yesterday as if a ver...,yes
225,Immigration officials at the principal Atlanti...,yes
226,I am quite sure that these conditions could be...,yes


### Regularization

In [19]:
# Show the raw spellings before they are unified.
print(corpus_22['Metaphor'].value_counts())

# Unify the spellings to the yes/no scheme of the main corpus, then give the
# column the name the main binary dataset uses for its target.
unified_votes = corpus_22['Metaphor'].replace({'Yes': 'yes', 'none': 'no'})
corpus_22 = corpus_22.assign(Metaphor=unified_votes).rename(columns={'Metaphor': 'label'})

Metaphor
yes     165
none     47
Yes       7
Name: count, dtype: int64


In [20]:
corpus_22.label.value_counts()

label
yes    172
no      47
Name: count, dtype: int64

In [21]:
corpus_22[corpus_22.Sentence.duplicated()]

Unnamed: 0,Sentence,label
13,Pensate un po' chi se lo immaginava. è una pro...,no
60,Il premier italiano ha anche avanzato un'ipote...,yes
61,Gli americani possono permettersi di fare la v...,yes
62,Da febbraio ad ora siamo stati messi in guardi...,yes
63,"La precedente legge Bossi-Fini stabiliva, inve...",yes
64,"L'emergenza, dunque, valica il Mediterraneo e ...",yes
65,Fra gli arrestati per i disordini figurano anc...,yes
66,Oggi si disegnano piuttosto scenari da incubo:...,yes
67,"Le orde e poi le torme di immigrati, come defi...",yes
68,La prima questione riguarda il Centro di accog...,yes


In [22]:
# Align the target column name with corpus_22 so both frames stack cleanly.
main_binary = corpus_filtered_binary.rename(columns={'label_binary': 'label'})

# Stack the two corpora under a fresh index, then discard every row that lacks
# a sentence or a label.
stacked_binary = pd.concat([main_binary, corpus_22], ignore_index=True)
corpus_filtered_binary = stacked_binary.dropna()

In [23]:
corpus_filtered_binary.label.value_counts()

label
no     454
yes    328
Name: count, dtype: int64

### Deduplication

In [24]:
# Exact repeats (same sentence, same label) collapse to a single row.
corpus_filtered_binary = corpus_filtered_binary.drop_duplicates(keep='first')

# A sentence that is still repeated at this point carries conflicting labels.
# Drop every copy: keeping them would let the same text appear in two splits
# with different targets.
conflicting_label = corpus_filtered_binary.duplicated(subset='Sentence', keep=False)
corpus_filtered_binary = corpus_filtered_binary[~conflicting_label]

In [25]:
corpus_filtered_binary.label.value_counts()

label
no     453
yes    318
Name: count, dtype: int64

In [26]:
any(corpus_filtered_binary.Sentence.duplicated())

False

In [27]:
from sklearn.model_selection import train_test_split

# Separate the sentence text (features) from the binary metaphor label (target).
X_b = corpus_filtered_binary.drop('label', axis=1)
y_b = corpus_filtered_binary['label']

# 80% train / 20% held out, stratified so both parts keep the yes/no ratio.
# random_state is fixed: without it every kernel restart yields different
# splits, so the exported CSVs could not be reproduced.
X_train_b, X_temp, y_train_b, y_temp = train_test_split(
    X_b, y_b, test_size=0.2, stratify=y_b, random_state=42
)

# Halve the held-out part into validation and test sets, again stratified and seeded.
X_val_b, X_test_b, y_val_b, y_test_b = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [28]:
# Put each split's target back next to its sentences (matched on the row index),
# so every split is a single self-contained frame.
X_train_b = X_train_b.assign(labels=y_train_b)
X_val_b = X_val_b.assign(labels=y_val_b)
X_test_b = X_test_b.assign(labels=y_test_b)

In [29]:
X_train_b.labels.value_counts()

labels
no     362
yes    254
Name: count, dtype: int64

In [30]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler. random_state is fixed so the same
# minority-class rows are duplicated on every run and the export is reproducible.
ros = RandomOverSampler(random_state=42)

# Oversample the training split only; validation and test keep their natural class ratio.
X_resampled_b, y_resampled_b = ros.fit_resample(X_train_b.drop('labels', axis=1), X_train_b['labels'])

In [31]:
# Class balance of the training split after oversampling.
print(y_resampled_b.value_counts())

# Re-attach the target to the oversampled training sentences.
X_resampled_b['labels'] = y_resampled_b

# Tag every row with the split it belongs to before the frames are stacked.
for split_frame, split_name in ((X_resampled_b, 'train'), (X_val_b, 'dev'), (X_test_b, 'test')):
    split_frame['split'] = split_name

# One CSV holding all three splits; the row index is not written.
final_df_binary = pd.concat([X_resampled_b, X_val_b, X_test_b])
final_df_binary.to_csv('binary_oversampling_filtered_ds_remove_discrepancies.csv', index=False)


labels
yes    362
no     362
Name: count, dtype: int64


In [32]:
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the random undersampler. random_state is fixed so the same
# majority-class rows are discarded on every run and the export is reproducible.
rus = RandomUnderSampler(random_state=42)

# Undersample the training split only; validation and test keep their natural class ratio.
X_undersampled_b, y_undersampled_b = rus.fit_resample(X_train_b.drop('labels', axis=1), X_train_b['labels'])

In [33]:
# Class balance of the training split after undersampling.
print(y_undersampled_b.value_counts())

# Re-attach the target and tag the rows as training data.
for column_name, column_values in (('labels', y_undersampled_b), ('split', 'train')):
    X_undersampled_b[column_name] = column_values

# One CSV holding the undersampled train split plus the dev/test frames;
# the row index is not written.
binary_splits = [X_undersampled_b, X_val_b, X_test_b]
final_df_binary = pd.concat(binary_splits)
final_df_binary.to_csv('binary_undersampling_filtered_ds_remove_discrepancies.csv', index=False)

labels
no     254
yes    254
Name: count, dtype: int64


### Stratify

In [34]:
from sklearn.model_selection import train_test_split

# Separate the sentence text (features) from the topic label (target).
X = corpus_filtered.drop('label', axis=1)
y = corpus_filtered['label']

# 80% train / 20% held out, stratified so both parts keep the migrant/none ratio.
# random_state is fixed: without it every kernel restart yields different
# splits, so the exported CSVs could not be reproduced.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Halve the held-out part into validation and test sets, again stratified and seeded.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [35]:
# Put each split's target back next to its sentences (matched on the row index),
# so every split is a single self-contained frame.
X_train = X_train.assign(labels=y_train)
X_val = X_val.assign(labels=y_val)
X_test = X_test.assign(labels=y_test)

In [36]:
X_train

Unnamed: 0,Sentence,labels
369,"ni) la strada della migrazione, che non è un f...",none
397,Supporting Integration Through New Roles and W...,none
494,se ne siete ancora capaci. Andate nelle stazi...,none
405,completely these dangerous crossings facilit...,migrant
418,rmula review before the next election. Parla...,none
...,...,...
30,ce from being undercut and ensure the UK’s ec...,migrant
569,n ID cards but on the statutory instrument. T...,none
413,motivazione a essere protagoniste della ripr...,migrant
248,ia a ca ese»). Mi sono incuriosito e sono anda...,none


### Over/Under sampling

In [37]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler. random_state is fixed so the same
# minority-class rows are duplicated on every run and the export is reproducible.
ros = RandomOverSampler(random_state=42)

# Oversample the training split only; validation and test keep their natural class ratio.
X_resampled, y_resampled = ros.fit_resample(X_train.drop('labels', axis=1), X_train['labels'])

In [38]:
from collections import Counter
Counter(y_resampled)

Counter({'none': 326, 'migrant': 326})

In [39]:
# Re-attach the oversampled labels to the oversampled sentences so the
# training split is a single frame again.
X_resampled['labels'] = y_resampled

In [40]:
# Tag every row with the split it belongs to before the frames are stacked.
for split_frame, split_name in ((X_resampled, 'train'), (X_val, 'dev'), (X_test, 'test')):
    split_frame['split'] = split_name

# One CSV holding all three splits; the row index is not written.
multiclass_splits = [X_resampled, X_val, X_test]
final_df = pd.concat(multiclass_splits)
final_df.to_csv('oversampling_filtered_ds_remove_discrepancies.csv', index=False)

In [41]:
Counter(X_resampled.labels)

Counter({'none': 326, 'migrant': 326})

In [42]:
# Tag the original (not resampled) training split as well. This adds a 'split'
# column to X_train, which is still present when X_train is passed to the
# undersampler further down, because only 'labels' is dropped there.
X_train['split'] = 'train'
# X_val and X_test already received these tags when the oversampled dataset
# was exported; the assignments are repeated here with the same values.
X_val['split'] = 'dev'
X_test['split'] = 'test'

In [43]:
X_val.labels

178       none
586    migrant
61        none
154       none
307       none
344       none
66     migrant
359       none
124       none
414       none
320       none
493       none
341    migrant
443       none
391       none
46     migrant
521       none
82     migrant
390       none
522       none
571       none
108       none
527       none
439       none
357       none
575       none
343    migrant
40     migrant
580       none
508       none
337       none
552       none
438       none
112    migrant
484       none
366       none
463    migrant
114       none
516       none
198       none
33     migrant
0      migrant
175       none
229       none
131       none
568       none
309    migrant
505       none
538    migrant
398       none
316    migrant
59     migrant
524       none
408       none
342       none
558       none
Name: labels, dtype: object

In [44]:
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the random undersampler. random_state is fixed so the same
# majority-class rows are discarded on every run and the export is reproducible.
rus = RandomUnderSampler(random_state=42)

# Undersample the training split only; validation and test keep their natural class ratio.
X_resampled, y_resampled = rus.fit_resample(X_train.drop('labels', axis=1), X_train['labels'])

In [45]:
# Re-attach the target. fit_resample returns sentences and labels as two
# separate objects; without this step the undersampled training rows are
# exported with an empty 'labels' column. The values are assigned by position
# because X_resampled and y_resampled come back row-aligned from the same call.
X_resampled['labels'] = y_resampled.to_numpy()
X_resampled['split'] = 'train'

final_df = pd.concat([X_resampled, X_val, X_test])
# Same column order as the other exported datasets.
final_df = final_df[['Sentence', 'labels', 'split']]
final_df.to_csv('undersampling_filtered_ds_remove_discrepancies.csv', index=False)

In [46]:
final_df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
275    False
281    False
83     False
468    False
133    False
Length: 357, dtype: bool

### Check whether any sentence exceeds 512 tokens

In [30]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Token limit being checked against (taken from the section heading).
MAX_TOKENS = 512

# Check every sentence that ends up in an exported dataset. Looking at X_train
# alone would miss long sentences in the dev/test splits and in the binary
# dataset, which also contains the second corpus.
all_sentences = (
    pd.concat([corpus_filtered['Sentence'], corpus_filtered_binary['Sentence']])
    .dropna()
    .drop_duplicates()
)

# One 'danger' entry per sentence over the limit; an empty list means none.
# truncation=False keeps the full token sequence so the real length is measured.
['danger' for sentence in all_sentences
 if len(tokenizer(sentence, truncation=False)['input_ids']) > MAX_TOKENS]

[]