# NLP Coursework


## 0. Set-up

### Main imports

In [3]:
!pip install simpletransformers
!pip install tensorboardx



In [4]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
import numpy as np

from collections import Counter
from ast import literal_eval
import string
import nltk
from nltk.corpus import stopwords

In [5]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [6]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


### Fetching the Don't Patronize Me! data manager module

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!wget https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_categories.tsv
!wget https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv

--2024-03-04 18:55:20--  https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_categories.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1342370 (1.3M) [text/plain]
Saving to: ‘dontpatronizeme_categories.tsv’


2024-03-04 18:55:21 (6.55 MB/s) - ‘dontpatronizeme_categories.tsv’ saved [1342370/1342370]

--2024-03-04 18:55:21--  https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK

In [9]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [10]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


In [11]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [12]:
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()
train_df=dpm.train_task1_df
train_df.head()

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0


# Load paragraph IDs

In [13]:
trids  = pd.read_csv('internal_train_par_ids.csv')
teids  = pd.read_csv('internal_dev_par_ids.csv')

print(trids)
print(teids)

      par_id
0       6477
1       4850
2       2473
3       5344
4       7070
...      ...
6276     679
6277    8333
6278     449
6279    2437
6280    6944

[6281 rows x 1 columns]
      par_id
0       4928
1       3847
2        279
3       1038
4       1879
...      ...
2089   10027
2090    6857
2091    5847
2092    3189
2093    6658

[2094 rows x 1 columns]


In [14]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

data=dpm.train_task1_df

data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


## Rebuild training set

In [15]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]

  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  orig_label = int(data.loc[data.par_id == parid].orig_label.values[0])
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'orig_label': orig_label,
      'label':label
  })

trdf1 = pd.DataFrame(rows)
trdf1

Unnamed: 0,par_id,community,text,orig_label,label
0,6477,refugee,Tensions remain high at Australia 's island re...,0,0
1,4850,poor-families,"""Picariello , also known as """" Emperor Pic , ""...",0,0
2,2473,refugee,"""Irungu , who also serves as the head of the N...",0,0
3,5344,disabled,"Mrs. Aviva Dankner , owner of Castra Mall , al...",0,0
4,7070,hopeless,2 : The ' Check Engine ' and ' ABS ' lights ca...,0,0
...,...,...,...,...,...
6276,679,in-need,"""Many argue this is inevitable -- that rising ...",0,0
6277,8333,refugee,"The mother of 6 , including 3 adopted children...",0,0
6278,449,disabled,HSE says it ca n't pay for services to help di...,0,0
6279,2437,in-need,A second T-Home project is being launched in t...,0,0


## Rebuild test set

In [16]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]

  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  orig_label = int(data.loc[data.par_id == parid].orig_label.values[0])
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'orig_label': orig_label,
      'label':label
  })

import random
random.shuffle(rows)

tedf1 = pd.DataFrame(rows)
tedf1

Unnamed: 0,par_id,community,text,orig_label,label
0,2227,hopeless,"""Hopelessness worse the crisis , the collapse ...",0,0
1,4352,immigrant,The nascent Irish community prospered in the b...,0,0
2,1887,women,"Ramsey , who has worked in the profession for ...",0,0
3,857,vulnerable,Is there a spy camera in that bathroom ? in Se...,0,0
4,7178,migrant,"The meeting is being held in Cucuta , a Colomb...",0,0
...,...,...,...,...,...
2089,3890,women,The women 's doubles title was captured by Mar...,0,0
2090,6555,migrant,"""Migrations lead to violence . The immigrant i...",0,0
2091,3893,hopeless,"""YOU SAY &gt; BAA ( like a sheep ) -- ROO -- D...",0,0
2092,3048,homeless,"At Umuezeata , near Ekeata , Ikeduru Local Cou...",0,0


## RoBERTa baseline

In [31]:
trdf1['orig_label'].value_counts()

0    5131
1     555
3     276
4     226
2      93
Name: orig_label, dtype: int64

In [32]:
# downsample negative instances
pcldf = trdf1[trdf1['orig_label'] == 1]

# 2. Count the number of instances with label 1
npos = len(pcldf)

# 3. Select a subset of instances with label 0 that matches the count of instances with label 1
ncldf = trdf1[trdf1['orig_label'] == 0].sample(n=npos, random_state=42)

# 4. Keep all instances with labels other than 0 and 1
other_labels_df = trdf1[~trdf1['orig_label'].isin([0, 1])]

# 5. Concatenate the selected instances of label 0, instances of label 1, and instances with other labels
training_set1 = pd.concat([pcldf, ncldf, other_labels_df])

# 6. Shuffle the resulting dataframe if needed
training_set1 = training_set1.sample(frac=1, random_state=42)

# Display the value counts to verify downsampling
print(training_set1['orig_label'].value_counts())

0    555
1    555
3    276
4    226
2     93
Name: orig_label, dtype: int64


In [33]:
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)

task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=5,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'orig_label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [34]:
Counter(preds_task1)

Counter({1: 726, 0: 1093, 2: 18, 4: 40, 3: 217})

In [35]:
Counter(tedf1.label.values)

Counter({0: 1895, 1: 199})

In [36]:
preds_task1

array([1, 1, 0, ..., 1, 0, 4])

In [37]:
preds = np.array([0 if num <= 1 else 1 for num in preds_task1])

In [38]:
Counter(preds)

Counter({0: 1819, 1: 275})

In [39]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

print("Accuracy: ", accuracy_score(tedf1.label.values, preds))
print("Precision: ", precision_score(tedf1.label.values, preds))
print("Recall: ", recall_score(tedf1.label.values, preds))
print("F1 Score: ", f1_score(tedf1.label.values, preds))

Accuracy:  0.9006685768863419
Precision:  0.48363636363636364
Recall:  0.6683417085427136
F1 Score:  0.5611814345991563


In [26]:
preds_play, _ = task1_model.predict(["The immigrants arrived in 2021", "Immigrants are reinforcing the economy so we must help them"])
preds_play

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

array([0, 0])

# Original model

In [40]:
# downsample negative instances
torch.manual_seed(6)
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [41]:
print(training_set1['label'].value_counts())

0    1190
1     595
Name: label, dtype: int64


In [28]:
torch.manual_seed(6)
task1_model_args = ClassificationArgs(num_train_epochs=10,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [29]:
Counter(preds_task1)

Counter({1: 382, 0: 1712})

In [30]:
print("precision: ", precision_score(tedf1.label.values, preds_task1))
print("recall： ", recall_score(tedf1.label.values, preds_task1))
print("f1_score: ", f1_score(tedf1.label.values, preds_task1))

precision:  0.387434554973822
recall：  0.7437185929648241
f1_score:  0.5094664371772806
