# NLP Coursework


## 0. Set-up

### Main imports

In [3]:
!pip install simpletransformers
!pip install sentence-transformers
!pip install tensorboardx



In [4]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch

from imblearn.under_sampling import TomekLinks
from sentence_transformers import SentenceTransformer

from collections import Counter
from ast import literal_eval
import string
import nltk
from nltk.corpus import stopwords

In [5]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [6]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


### Fetching the Don't Patronize Me! data manager module

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!wget https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_categories.tsv
!wget https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv

--2024-03-03 17:07:34--  https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_categories.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1342370 (1.3M) [text/plain]
Saving to: ‘dontpatronizeme_categories.tsv’


2024-03-03 17:07:34 (62.0 MB/s) - ‘dontpatronizeme_categories.tsv’ saved [1342370/1342370]

--2024-03-03 17:07:34--  https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK

In [9]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [10]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


In [11]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [12]:
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()
train_df=dpm.train_task1_df
train_df.head()

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0


# Load paragraph IDs

In [13]:
train_ids  = pd.read_csv('internal_train_par_ids.csv')
dev_ids  = pd.read_csv('internal_dev_par_ids.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

print(train_ids)
print(dev_ids)
print(teids)

      par_id
0       6477
1       4850
2       2473
3       5344
4       7070
...      ...
6276     679
6277    8333
6278     449
6279    2437
6280    6944

[6281 rows x 1 columns]
      par_id
0       4928
1       3847
2        279
3       1038
4       1879
...      ...
2089   10027
2090    6857
2091    5847
2092    3189
2093    6658

[2094 rows x 1 columns]
      par_id                  label
0       4046  [1, 0, 0, 1, 0, 0, 0]
1       1279  [0, 1, 0, 0, 0, 0, 0]
2       8330  [0, 0, 1, 0, 0, 0, 0]
3       4063  [1, 0, 0, 1, 1, 1, 0]
4       4089  [1, 0, 0, 0, 0, 0, 0]
...      ...                    ...
2089   10462  [0, 0, 0, 0, 0, 0, 0]
2090   10463  [0, 0, 0, 0, 0, 0, 0]
2091   10464  [0, 0, 0, 0, 0, 0, 0]
2092   10465  [0, 0, 0, 0, 0, 0, 0]
2093   10466  [0, 0, 0, 0, 0, 0, 0]

[2094 rows x 2 columns]


In [14]:
train_ids.par_id = train_ids.par_id.astype(str)
dev_ids.par_id = dev_ids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

data=dpm.train_task1_df

data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


# A. Random Undersampling

## Rebuild training set

In [15]:
rows = [] # will contain par_id, label and text
for idx in range(len(train_ids)):
  parid = train_ids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

traindf = pd.DataFrame(rows)
traindf

Unnamed: 0,par_id,community,text,label
0,6477,refugee,Tensions remain high at Australia 's island re...,0
1,4850,poor-families,"""Picariello , also known as """" Emperor Pic , ""...",0
2,2473,refugee,"""Irungu , who also serves as the head of the N...",0
3,5344,disabled,"Mrs. Aviva Dankner , owner of Castra Mall , al...",0
4,7070,hopeless,2 : The ' Check Engine ' and ' ABS ' lights ca...,0
...,...,...,...,...
6276,679,in-need,"""Many argue this is inevitable -- that rising ...",0
6277,8333,refugee,"The mother of 6 , including 3 adopted children...",0
6278,449,disabled,HSE says it ca n't pay for services to help di...,0
6279,2437,in-need,A second T-Home project is being launched in t...,0


In [16]:
traindf.label.value_counts()

0    5686
1     595
Name: label, dtype: int64

### Downsampling

In [17]:
from sklearn.utils import resample
torch.manual_seed(6)
traindf_majority = traindf[traindf['label'] == 0]
traindf_minority = traindf[traindf['label'] == 1]

# Downsample majority class
traindf_majority_downsampled = resample(traindf_majority,
                                   replace=False,  # sample without replacement
                                   n_samples=len(traindf_minority),  # match number in minority class
                                   random_state=42)  # reproducible results

traindf_downsampled = pd.concat([traindf_majority_downsampled, traindf_minority])

traindf_downsampled = traindf_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

traindf_downsampled

Unnamed: 0,par_id,community,text,label
0,4594,women,ISIS has justified the enslavement of non-Musl...,0
1,4133,women,""""""" Still , malaria remains one of the top cau...",0
2,7600,refugee,The UN Refugee Agency gave the caution in a st...,0
3,4761,disabled,A flawed system let a man accused of sexually ...,0
4,5501,refugee,In a new joint The Hollywood Reporter intervie...,1
...,...,...,...,...
1185,4960,in-need,The rehabilitation of the health center is und...,1
1186,7538,refugee,"""LONDON - Angelia Jolie has urged people to """"...",1
1187,9584,hopeless,"According to Betty-Ann Blaine , executive dire...",1
1188,3857,disabled,Six other nominated senators will be nominated...,1


In [18]:
traindf_downsampled.label.value_counts()

0    595
1    595
Name: label, dtype: int64

## Rebuild test set

In [19]:
rows = [] # will contain par_id, label and text
for idx in range(len(dev_ids)):
  parid = dev_ids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

devdf = pd.DataFrame(rows)
devdf

Unnamed: 0,par_id,community,text,label
0,4928,hopeless,""""""" I went to Vauxhall High School , graduated...",0
1,3847,refugee,""""""" These children were predominantly Burmese ...",0
2,279,women,A submission from the Irish Women 's Council o...,0
3,1038,homeless,But displaced people were not left homeless by...,1
4,1879,immigrant,Studies reveal that the core of immigrant entr...,0
...,...,...,...,...
2089,10027,vulnerable,"Here in Jamaica , it is an unfortunate fact th...",1
2090,6857,hopeless,The hospital has managed to find some extra wa...,0
2091,5847,homeless,"Last year , a record 85 homes were demolished ...",0
2092,3189,in-need,Persons in need of a wheelchair or know of any...,0


### Random Downsampling Performance

In [20]:
traindf_downsampled['label'].value_counts()

0    595
1    595
Name: label, dtype: int64

In [21]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_downsampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [22]:
Counter(preds_task1)

Counter({0: 1630, 1: 464})

In [23]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.33836206896551724
recall：  0.7889447236180904
f1_score:  0.47360482654600305


# B. Random Oversampling

In [24]:
torch.manual_seed(6)
traindf_majority = traindf[traindf['label'] == 0]
traindf_minority = traindf[traindf['label'] == 1]

# Oversample minority class
traindf_minority_oversampled = resample(traindf_minority,
                                   replace=True,
                                   n_samples=len(traindf_majority),
                                   random_state=42)

traindf_oversampled = pd.concat([traindf_majority, traindf_minority_oversampled])

traindf_oversampled = traindf_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

traindf_oversampled

Unnamed: 0,par_id,community,text,label
0,4567,refugee,"""Brussels - Top European Union official Donald...",0
1,8819,refugee,Who blame for this issue ? ? The system itself...,1
2,4960,in-need,The rehabilitation of the health center is und...,1
3,5852,hopeless,""""""" The people of Khyber Pakhtunkhwa are resil...",1
4,4754,women,NAN also reports that the initiative provides ...,0
...,...,...,...,...
11367,7332,refugee,UN app lets you know what life as a refugee is...,1
11368,1810,immigrant,"""Castillo is thrilled that her novel -- popula...",0
11369,2965,migrant,From refugee to runway : How migrants are tran...,0
11370,1594,migrant,"""During a fact-finding visit to Algeciras on S...",0


In [25]:
traindf_oversampled['label'].value_counts()

0    5686
1    5686
Name: label, dtype: int64

Random Oversampling Performance

In [26]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_oversampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/1422 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [27]:
Counter(preds_task1)

Counter({0: 1924, 1: 170})

In [28]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.5882352941176471
recall：  0.5025125628140703
f1_score:  0.5420054200542005


# C. Downsampling with Tomek Links

In [29]:
torch.manual_seed(6)
model = SentenceTransformer('all-distilroberta-v1')

# Generate embeddings for text data
text_embeddings = model.encode(traindf['text'].tolist())

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [30]:
text_embeddings

array([[-0.00795926, -0.00901499, -0.01933391, ...,  0.01620265,
        -0.05670098,  0.09961925],
       [ 0.01198695, -0.00150292, -0.01225506, ..., -0.05306108,
         0.03385483, -0.05027746],
       [ 0.03657614, -0.04963855, -0.02666638, ..., -0.00375456,
        -0.05430744,  0.13642639],
       ...,
       [ 0.03938799, -0.02133241,  0.00239186, ...,  0.01397417,
         0.02057148, -0.00600493],
       [ 0.00218787, -0.06948517,  0.00087385, ...,  0.02676315,
        -0.01474146, -0.02348447],
       [-0.01074873,  0.00194173, -0.02051295, ..., -0.04439187,
        -0.00441707,  0.04782172]], dtype=float32)

In [31]:
embeddings_df = pd.DataFrame(text_embeddings)

In [32]:
# Identify Tomek links
tomek_links = TomekLinks()
X_resampled, y_resampled = tomek_links.fit_resample(embeddings_df, traindf['label'].tolist())

In [33]:
traindf_resampled = traindf.iloc[X_resampled.index].reset_index(drop=True)
traindf_resampled

Unnamed: 0,par_id,community,text,label
0,6477,refugee,Tensions remain high at Australia 's island re...,0
1,4850,poor-families,"""Picariello , also known as """" Emperor Pic , ""...",0
2,2473,refugee,"""Irungu , who also serves as the head of the N...",0
3,5344,disabled,"Mrs. Aviva Dankner , owner of Castra Mall , al...",0
4,7070,hopeless,2 : The ' Check Engine ' and ' ABS ' lights ca...,0
...,...,...,...,...
6166,7570,in-need,TEAM Pentagon from Cavite State University was...,0
6167,6926,refugee,"""In Hyderabad , there are 3,800 Rohangiyas wit...",0
6168,1792,women,Wikipedia explains feminism as a range of poli...,0
6169,5977,disabled,""""""" Our life has completely changed from when ...",1


In [34]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_resampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/772 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [35]:
Counter(preds_task1)

Counter({0: 1922, 1: 172})

In [36]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.5581395348837209
recall：  0.4824120603015075
f1_score:  0.5175202156334232


Test baseline

In [37]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/786 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [38]:
Counter(preds_task1)

Counter({0: 1881, 1: 213})

In [39]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.49765258215962443
recall：  0.5326633165829145
f1_score:  0.5145631067961165


# D. Downsampling with Tomek links then Random Oversampling

In [40]:
torch.manual_seed(6)
model = SentenceTransformer('all-distilroberta-v1')

# Generate embeddings for text data
text_embeddings = model.encode(traindf['text'].tolist())
embeddings_df = pd.DataFrame(text_embeddings)

tomek_links = TomekLinks()
X_resampled, y_resampled = tomek_links.fit_resample(embeddings_df, traindf['label'].tolist())

traindf_resampled = traindf.iloc[X_resampled.index].reset_index(drop=True)

In [41]:
torch.manual_seed(6)
traindf_majority = traindf_resampled[traindf_resampled['label'] == 0]
traindf_minority = traindf_resampled[traindf_resampled['label'] == 1]

# Oversample minority class
traindf_minority_oversampled = resample(traindf_minority,
                                   replace=True,
                                   n_samples=len(traindf_majority),
                                   random_state=42)

traindf_combined = pd.concat([traindf_majority, traindf_minority_oversampled])

traindf_combined = traindf_combined.sample(frac=1, random_state=42).reset_index(drop=True)

traindf_combined

Unnamed: 0,par_id,community,text,label
0,10408,homeless,""""""" Most of them ( the homeless ) have the abi...",1
1,9913,in-need,The Government is trying to encourage more fam...,1
2,1360,in-need,"ASSISS , established under the Assunta Foundat...",0
3,1614,refugee,"""Byrs stressed """" the appalling situation """" i...",0
4,7947,migrant,Rohingya Muslim refugees from Myanmar have alr...,0
...,...,...,...,...
11159,7298,migrant,"""Despite his claims to having a """" great heart...",1
11160,1810,immigrant,"""Castillo is thrilled that her novel -- popula...",0
11161,2965,migrant,From refugee to runway : How migrants are tran...,0
11162,1594,migrant,"""During a fact-finding visit to Algeciras on S...",0


In [42]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_combined[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/1396 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [43]:
Counter(preds_task1)

Counter({0: 1958, 1: 136})

In [44]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.6176470588235294
recall：  0.4221105527638191
f1_score:  0.5014925373134328


# E. Edited Nearest Neighbours

In [80]:
from imblearn.under_sampling import EditedNearestNeighbours
torch.manual_seed(6)
model = SentenceTransformer('all-distilroberta-v1')

# Generate embeddings for text data
text_embeddings = model.encode(traindf['text'].tolist())
embeddings_df = pd.DataFrame(text_embeddings)

enn = EditedNearestNeighbours(kind_sel="mode")
X_resampled, y_resampled = enn.fit_resample(embeddings_df, traindf['label'].tolist())

traindf_resampled = traindf.iloc[X_resampled.index].reset_index(drop=True)

In [81]:
traindf_resampled

Unnamed: 0,par_id,community,text,label
0,6477,refugee,Tensions remain high at Australia 's island re...,0
1,4850,poor-families,"""Picariello , also known as """" Emperor Pic , ""...",0
2,2473,refugee,"""Irungu , who also serves as the head of the N...",0
3,5344,disabled,"Mrs. Aviva Dankner , owner of Castra Mall , al...",0
4,7070,hopeless,2 : The ' Check Engine ' and ' ABS ' lights ca...,0
...,...,...,...,...
6002,829,migrant,Brazilian media reports said there were also d...,0
6003,2809,immigrant,The company recalled employees it thought migh...,0
6004,2963,migrant,The spoken portion of the evening begins with ...,0
6005,1656,immigrant,The anti-immigrant Slovenian Democratic Party ...,0


In [82]:
traindf_resampled['label'].value_counts()

0    5428
1     579
Name: label, dtype: int64

In [83]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_resampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/751 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [84]:
Counter(preds_task1)

Counter({0: 1924, 1: 170})

In [85]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.5764705882352941
recall：  0.49246231155778897
f1_score:  0.5311653116531165


# F. Oversampling then Edited Nearest Neighbours

In [86]:
torch.manual_seed(6)
traindf_majority = traindf[traindf['label'] == 0]
traindf_minority = traindf[traindf['label'] == 1]

# Oversample minority class
traindf_minority_oversampled = resample(traindf_minority,
                                   replace=True,
                                   n_samples=len(traindf_majority),
                                   random_state=42)

traindf_oversampled = pd.concat([traindf_majority, traindf_minority_oversampled])

traindf_oversampled = traindf_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

traindf_oversampled

Unnamed: 0,par_id,community,text,label
0,4567,refugee,"""Brussels - Top European Union official Donald...",0
1,8819,refugee,Who blame for this issue ? ? The system itself...,1
2,4960,in-need,The rehabilitation of the health center is und...,1
3,5852,hopeless,""""""" The people of Khyber Pakhtunkhwa are resil...",1
4,4754,women,NAN also reports that the initiative provides ...,0
...,...,...,...,...
11367,7332,refugee,UN app lets you know what life as a refugee is...,1
11368,1810,immigrant,"""Castillo is thrilled that her novel -- popula...",0
11369,2965,migrant,From refugee to runway : How migrants are tran...,0
11370,1594,migrant,"""During a fact-finding visit to Algeciras on S...",0


In [87]:
torch.manual_seed(6)
model = SentenceTransformer('all-distilroberta-v1')

# Generate embeddings for text data
text_embeddings = model.encode(traindf_oversampled['text'].tolist())
embeddings_df = pd.DataFrame(text_embeddings)

enn = EditedNearestNeighbours(kind_sel="mode")
X_resampled, y_resampled = enn.fit_resample(embeddings_df, traindf_oversampled['label'].tolist())

traindf_combined = traindf_oversampled.iloc[X_resampled.index].reset_index(drop=True)

In [88]:
traindf_combined

Unnamed: 0,par_id,community,text,label
0,4567,refugee,"""Brussels - Top European Union official Donald...",0
1,8819,refugee,Who blame for this issue ? ? The system itself...,1
2,4960,in-need,The rehabilitation of the health center is und...,1
3,5852,hopeless,""""""" The people of Khyber Pakhtunkhwa are resil...",1
4,4754,women,NAN also reports that the initiative provides ...,0
...,...,...,...,...
11364,833,homeless,Park Up For Homes was started by a group of Ma...,0
11365,5460,hopeless,Once again the stateless Rohingya are on the r...,1
11366,10191,poor-families,""""""" What poor families really need is more mon...",1
11367,7332,refugee,UN app lets you know what life as a refugee is...,1


In [None]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_combined[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/22 [00:00<?, ?it/s]

In [61]:
Counter(preds_task1)

Counter({0: 1962, 1: 132})

In [62]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.6515151515151515
recall：  0.4321608040201005
f1_score:  0.5196374622356495


# G. ENN then RO

In [63]:
torch.manual_seed(1)
model = SentenceTransformer('all-distilroberta-v1')

# Generate embeddings for text data
text_embeddings = model.encode(traindf['text'].tolist())
embeddings_df = pd.DataFrame(text_embeddings)

enn = EditedNearestNeighbours(kind_sel="mode")
X_resampled, y_resampled = enn.fit_resample(embeddings_df, traindf['label'].tolist())

traindf_resampled = traindf.iloc[X_resampled.index].reset_index(drop=True)
traindf_resampled

Unnamed: 0,par_id,community,text,label
0,6477,refugee,Tensions remain high at Australia 's island re...,0
1,4850,poor-families,"""Picariello , also known as """" Emperor Pic , ""...",0
2,2473,refugee,"""Irungu , who also serves as the head of the N...",0
3,5344,disabled,"Mrs. Aviva Dankner , owner of Castra Mall , al...",0
4,7070,hopeless,2 : The ' Check Engine ' and ' ABS ' lights ca...,0
...,...,...,...,...
6002,829,migrant,Brazilian media reports said there were also d...,0
6003,2809,immigrant,The company recalled employees it thought migh...,0
6004,2963,migrant,The spoken portion of the evening begins with ...,0
6005,1656,immigrant,The anti-immigrant Slovenian Democratic Party ...,0


In [64]:
traindf_resampled['label'].value_counts()

0    5428
1     579
Name: label, dtype: int64

In [65]:
torch.manual_seed(6)
traindf_majority = traindf_resampled[traindf_resampled['label'] == 0]
traindf_minority = traindf_resampled[traindf_resampled['label'] == 1]

# Oversample minority class
traindf_minority_oversampled = resample(traindf_minority,
                                   replace=True,
                                   n_samples=len(traindf_majority),
                                   random_state=42)

traindf_combined = pd.concat([traindf_majority, traindf_minority_oversampled])

traindf_combined = traindf_combined.sample(frac=1, random_state=42).reset_index(drop=True)

traindf_combined

Unnamed: 0,par_id,community,text,label
0,4537,refugee,"By creating special observance days , the Unit...",0
1,7842,homeless,""""""" In the main it was a financial decision bu...",1
2,5274,vulnerable,The African Health Markets aims to increase co...,0
3,2798,homeless,Poverty has caused thousands of Jamaicans to b...,1
4,2788,immigrant,Immigration has always been a central part of ...,0
...,...,...,...,...
10851,2581,in-need,The ten wheelchairs were sought for children i...,1
10852,1810,immigrant,"""Castillo is thrilled that her novel -- popula...",0
10853,2965,migrant,From refugee to runway : How migrants are tran...,0
10854,1594,migrant,"""During a fact-finding visit to Algeciras on S...",0


In [66]:
traindf_combined['label'].value_counts()

0    5428
1    5428
Name: label, dtype: int64

In [67]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(traindf_combined[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/21 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/1357 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [68]:
Counter(preds_task1)

Counter({0: 1943, 1: 151})

In [69]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.5894039735099338
recall：  0.4472361809045226
f1_score:  0.5085714285714286


# Baseline RoBERTa

In [70]:
# downsample negative instances
torch.manual_seed(6)
pcldf = traindf[traindf.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,traindf[traindf.label==0][:npos*2]])

In [71]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(devdf.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [72]:
Counter(preds_task1)

Counter({0: 1712, 1: 382})

In [73]:
print("precision: ", precision_score(devdf.label.values, preds_task1))
print("recall： ", recall_score(devdf.label.values, preds_task1))
print("f1_score: ", f1_score(devdf.label.values, preds_task1))

precision:  0.387434554973822
recall：  0.7437185929648241
f1_score:  0.5094664371772806
