## 0. Set-up

### Main imports

In [1]:
!pip install simpletransformers
!pip install tensorboardx

Collecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting wandb>=0.10.32 (from simpletransformers)
  

In [2]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch

from collections import Counter
from ast import literal_eval
import string
import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [3]:
# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  False


In [4]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

### Fetching the Don't Patronize Me! data manager module

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!wget https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_categories.tsv
!wget https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv

--2024-03-04 21:25:52--  https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_categories.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1342370 (1.3M) [text/plain]
Saving to: ‘dontpatronizeme_categories.tsv’


2024-03-04 21:25:53 (16.4 MB/s) - ‘dontpatronizeme_categories.tsv’ saved [1342370/1342370]

--2024-03-04 21:25:53--  https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK

In [7]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [8]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


In [9]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [10]:
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()
train_df=dpm.train_task1_df
train_df.head()

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0


# Load paragraph IDs

In [11]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

print(trids)
print(teids)

      par_id                  label
0       4341  [1, 0, 0, 1, 0, 0, 0]
1       4136  [0, 1, 0, 0, 0, 0, 0]
2      10352  [1, 0, 0, 0, 0, 1, 0]
3       8279  [0, 0, 0, 1, 0, 0, 0]
4       1164  [1, 0, 0, 1, 1, 1, 0]
...      ...                    ...
8370    8380  [0, 0, 0, 0, 0, 0, 0]
8371    8381  [0, 0, 0, 0, 0, 0, 0]
8372    8382  [0, 0, 0, 0, 0, 0, 0]
8373    8383  [0, 0, 0, 0, 0, 0, 0]
8374    8384  [0, 0, 0, 0, 0, 0, 0]

[8375 rows x 2 columns]
      par_id                  label
0       4046  [1, 0, 0, 1, 0, 0, 0]
1       1279  [0, 1, 0, 0, 0, 0, 0]
2       8330  [0, 0, 1, 0, 0, 0, 0]
3       4063  [1, 0, 0, 1, 1, 1, 0]
4       4089  [1, 0, 0, 0, 0, 0, 0]
...      ...                    ...
2089   10462  [0, 0, 0, 0, 0, 0, 0]
2090   10463  [0, 0, 0, 0, 0, 0, 0]
2091   10464  [0, 0, 0, 0, 0, 0, 0]
2092   10465  [0, 0, 0, 0, 0, 0, 0]
2093   10466  [0, 0, 0, 0, 0, 0, 0]

[2094 rows x 2 columns]


In [12]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

data=dpm.train_task1_df

data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


## Rebuild training set

In [13]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

trdf1 = pd.DataFrame(rows)
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


## Rebuild test set

In [14]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

import random
random.shuffle(rows)

tedf1 = pd.DataFrame(rows)
tedf1

Unnamed: 0,par_id,community,text,label
0,10043,migrant,"Better wages , a healthier lifestyle and bette...",0
1,2453,disabled,""""""" My daughter , who was a physiotherapist , ...",1
2,7004,in-need,We have the opportunity to give the gift of lo...,1
3,10279,migrant,This branch of the military polices the most p...,0
4,8972,migrant,"""The expression of xenophobia explicit in an o...",0
...,...,...,...,...
2089,9236,hopeless,I say so is because one day they seem to be do...,0
2090,9349,vulnerable,"Between the 1950s and 1990s more than 100,000 ...",0
2091,9657,homeless,Two families in the northern New Territories a...,0
2092,8944,women,"""As difficult as things sometimes get between ...",0


## BoW baseline

In [15]:
tedf1.label.value_counts()

0    1895
1     199
Name: label, dtype: int64

In [16]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])
training_set1.label.value_counts()

0    1588
1     794
Name: label, dtype: int64

In [17]:
class BoWClassifier:
    def __init__(self):
        self.pipeline = Pipeline([
            ('preprocessor', Preprocessor()),  # Custom preprocessor
            ('vectorizer', CountVectorizer(binary=False)),  # Convert text to BoW representation
            ('classifier', LogisticRegression(max_iter=1000))  # Logistic Regression classifier
        ])

    def train(self, df):
        X = df['text']
        y = df['label']

        self.pipeline.fit(X, y)

    def predict(self, texts):
        return self.pipeline.predict(texts)

class Preprocessor:
    def __init__(self):
        nltk.download('stopwords')
        nltk.download('punkt')
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        # Convert text to lowercase
        text = text.lower()
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Tokenization
        tokens = nltk.word_tokenize(text)
        # Remove stopwords
        tokens = [token for token in tokens if token not in self.stop_words]
        # Join tokens back into a string
        preprocessed_text = ' '.join(tokens)
        return preprocessed_text

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.preprocess_text(text) for text in X]


bow_classifier = BoWClassifier()
bow_classifier.train(training_set1)

# Example prediction for new texts
preds_task1 = bow_classifier.predict(tedf1["text"])
print(preds_task1)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[0 1 1 ... 0 0 0]


In [18]:
Counter(preds_task1)

Counter({0: 1757, 1: 337})

In [19]:
print("accuracy: ", accuracy_score(tedf1.label.values, preds_task1))
print("precision: ", precision_score(tedf1.label.values, preds_task1))
print("recall： ", recall_score(tedf1.label.values, preds_task1))
print("f1_score: ", f1_score(tedf1.label.values, preds_task1))

accuracy:  0.8261700095510984
precision:  0.2551928783382789
recall：  0.4321608040201005
f1_score:  0.3208955223880597


Misclassifications

In [28]:
preds_task1[24]

0

In [29]:
tedf1['predicted_label'] = preds_task1
tedf1['prediction_match'] = [int(label == pred) for label, pred in zip(tedf1['label'], preds_task1)]
tedf1[tedf1['prediction_match']==0]

Unnamed: 0,par_id,community,text,label,prediction_match,predicted_label
12,2196,refugee,""""""" It 's the largest humanitarian tragedy of ...",1,0,0
24,3861,homeless,"""In time , when the housing backlog for the lo...",1,0,0
34,6234,poor-families,The World Health Organization did not give a r...,1,0,0
37,823,in-need,"""He said : """" We need improved security for ci...",1,0,0
48,8653,vulnerable,""""""" There are hundreds of charities across thi...",0,0,1
...,...,...,...,...,...,...
2055,1674,disabled,"Kyle really your a pig , lol youre also very i...",1,0,0
2058,9028,in-need,"The now-73-year-old man , who hails from Const...",0,0,1
2074,9560,poor-families,"Now , that 's all behind her , and she 's wise...",0,0,1
2075,4023,refugee,A highlight of the week will be a public lectu...,1,0,0


In [31]:
matched_df = tedf1[tedf1['prediction_match'] == 1]
mismatched_df = tedf1[tedf1['prediction_match'] == 0]

In [33]:
mismatched_df

Unnamed: 0,par_id,community,text,label,prediction_match,predicted_label
12,2196,refugee,""""""" It 's the largest humanitarian tragedy of ...",1,0,0
24,3861,homeless,"""In time , when the housing backlog for the lo...",1,0,0
34,6234,poor-families,The World Health Organization did not give a r...,1,0,0
37,823,in-need,"""He said : """" We need improved security for ci...",1,0,0
48,8653,vulnerable,""""""" There are hundreds of charities across thi...",0,0,1
...,...,...,...,...,...,...
2055,1674,disabled,"Kyle really your a pig , lol youre also very i...",1,0,0
2058,9028,in-need,"The now-73-year-old man , who hails from Const...",0,0,1
2074,9560,poor-families,"Now , that 's all behind her , and she 's wise...",0,0,1
2075,4023,refugee,A highlight of the week will be a public lectu...,1,0,0


In [34]:
from collections import Counter

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Define the preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Join tokens back into a string is not necessary for count calculation
    return tokens

# Function to apply preprocessing and create a token count DataFrame
def create_token_count_df(df):
    # Apply preprocessing to the text column
    preprocessed_texts = df['text'].apply(preprocess_text)

    # Flatten the list of tokens and count occurrences
    all_tokens = [token for sublist in preprocessed_texts for token in sublist]
    token_counts = Counter(all_tokens)

    # Create a DataFrame from the token counts
    token_count_df = pd.DataFrame(token_counts.items(), columns=['Token', 'Count']).sort_values(by='Count', ascending=False)

    return token_count_df

# Assuming 'text' is the column in your DataFrames that contains the text to be preprocessed
# Apply the function to each DataFrame
matched_token_count_df = create_token_count_df(matched_df)
mismatched_token_count_df = create_token_count_df(mismatched_df)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
matched_token_count_df

Unnamed: 0,Token,Count
242,women,348
375,said,345
103,immigrants,250
31,people,236
51,need,198
...,...,...
6057,transmit,1
6055,stepchildren,1
6053,parent,1
6052,greatgrandparents,1


In [36]:
mismatched_token_count_df

Unnamed: 0,Token,Count
20,people,106
27,families,105
61,children,95
59,poor,85
62,need,75
...,...,...
2128,sportpesa,1
2129,add,1
2130,beyond,1
2132,advertising,1


In [57]:
top_matched_tokens = matched_token_count_df.head(5)['Token'].tolist()
top_mismatched_tokens = mismatched_token_count_df.head(5)['Token'].tolist()

unique_mismatched_tokens = [token for token in top_mismatched_tokens if token not in top_matched_tokens]

# Print or use the list of unique tokens
print(unique_mismatched_tokens)

['families', 'children', 'poor']


In [56]:
idx = 11
print(mismatched_df["text"].iloc[idx])
print(mismatched_df["label"].iloc[idx])

The complaint says the victims should be treated properly and the government should announce compensation for them as they mostly come from poor families .
0


Dummy model

In [20]:
len_dummy_preds = len(tedf1)
dummy_preds = [0]*len_dummy_preds
Counter(dummy_preds)

Counter({0: 2094})

In [21]:
print("accuracy: ", accuracy_score(tedf1.label.values, dummy_preds))
print("precision: ", precision_score(tedf1.label.values, dummy_preds))
print("recall： ", recall_score(tedf1.label.values, dummy_preds))
print("f1_score: ", f1_score(tedf1.label.values, dummy_preds))

accuracy:  0.9049665711556829
precision:  0.0
recall：  0.0
f1_score:  0.0


  _warn_prf(average, modifier, msg_start, len(result))
