In [36]:
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from transformers import RobertaTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import nlpaug.augmenter.word as naw
import inspect
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

In [37]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [38]:
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()
original_train_data = dpm.train_task1_df
original_train_data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


In [39]:
def split_train_test(original_train_data, train_csv_path, test_csv_path):
    train_semeval_parids = pd.read_csv(train_csv_path)
    dev_semeval_parids = pd.read_csv(test_csv_path)

    train_semeval_parids['par_id'] = train_semeval_parids['par_id'].astype(str)
    dev_semeval_parids['par_id'] = dev_semeval_parids['par_id'].astype(str)

    train_data_df = pd.merge(train_semeval_parids['par_id'], original_train_data, on='par_id', how='left')
    test_data_df = pd.merge(dev_semeval_parids['par_id'], original_train_data, on='par_id', how='left')

    return train_data_df, test_data_df
    
train_csv_path = 'train_semeval_parids-labels.csv'
test_csv_path = 'dev_semeval_parids-labels.csv'

train_data_df, test_data_df = split_train_test(original_train_data, train_csv_path, test_csv_path)

In [40]:
def aug_demo(aug, texts):
    aug = naw.SynonymAug(aug_src='wordnet')

    for text in texts:
        augmented_text = aug.augment(text)[0]
        print(f"Original : {text}")
        print(f"Augmented: {augmented_text}")

example_texts = train_data_df[train_data_df['label'] == 1]['text'].values[:5]

In [41]:
aug_demo(naw.SynonymAug(), example_texts)

Original : The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .
Augmented: The scheme saw an estimated 150, 000 children from hapless families being send to part of the British Empire between 1920 and 1974, by religious orders and charities who say they would lead better lives.
Original : Durban 's homeless communities reconciliation lunch
Augmented: Durban ' s homeless person communities reconciliation luncheon
Original : The next immediate problem that cropped up was how to assist the unfortunate couple , as neither of them possessed a birth certificate , a marriage certificate , or even an identity card . The Samurdhi Officer Dhanapala lamented explaining how agonizing it was for him to bear , when he came across the majority of poor families in the village did not possess even an ID to assist them officially .
Augmented: The next imm

In [42]:
aug_demo(naw.AntonymAug(), example_texts)

Original : The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .
Augmented: The scheme saw an estimated 150, 000 tike from poor families being sent to parts of the British Imperium between 1920 and 1974, by spiritual orders and polemonium caeruleum world health organization said they would lead better lives.
Original : Durban 's homeless communities reconciliation lunch
Augmented: Durban ' due south homeless person communities reconciliation dejeuner
Original : The next immediate problem that cropped up was how to assist the unfortunate couple , as neither of them possessed a birth certificate , a marriage certificate , or even an identity card . The Samurdhi Officer Dhanapala lamented explaining how agonizing it was for him to bear , when he came across the majority of poor families in the village did not possess even an ID to assist the

In [43]:
aug_demo(naw.RandomWordAug(), example_texts)

Original : The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .
Augmented: The scheme saw an estimated cl, 000 children from poor families being sent to parts of the British Imperium between 1920 and 1974, by religious orders and greek valerian world health organization said they would lead better living.
Original : Durban 's homeless communities reconciliation lunch
Augmented: Durban ' second homeless person communities reconciliation lunch
Original : The next immediate problem that cropped up was how to assist the unfortunate couple , as neither of them possessed a birth certificate , a marriage certificate , or even an identity card . The Samurdhi Officer Dhanapala lamented explaining how agonizing it was for him to bear , when he came across the majority of poor families in the village did not possess even an ID to assist them offici

In [44]:
aug_demo(naw.WordAugmenter(action='insert'), example_texts)

Original : The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .
Augmented: The scheme saw an estimated 150, 000 children from pathetic families being post to parts of the British Empire between 1920 and 1974, by religious order and charities world health organization aver they would lead better lives.
Original : Durban 's homeless communities reconciliation lunch
Augmented: Durban ' atomic number 16 homeless community reconciliation tiffin
Original : The next immediate problem that cropped up was how to assist the unfortunate couple , as neither of them possessed a birth certificate , a marriage certificate , or even an identity card . The Samurdhi Officer Dhanapala lamented explaining how agonizing it was for him to bear , when he came across the majority of poor families in the village did not possess even an ID to assist them official

In [45]:
aug_demo(naw.WordAugmenter(action='substitute'), example_texts)

Original : The scheme saw an estimated 150,000 children from poor families being sent to parts of the British Empire between 1920 and 1974 , by religious orders and charities who said they would lead better lives .
Augmented: The scheme go out an estimated 150, 000 children from poor families equal send to parts of the British Imperium between 1920 and 1974, by spiritual fiat and charities who say they would lead better lives.
Original : Durban 's homeless communities reconciliation lunch
Augmented: Durban ' sulphur homeless communities reconciliation luncheon
Original : The next immediate problem that cropped up was how to assist the unfortunate couple , as neither of them possessed a birth certificate , a marriage certificate , or even an identity card . The Samurdhi Officer Dhanapala lamented explaining how agonizing it was for him to bear , when he came across the majority of poor families in the village did not possess even an ID to assist them officially .
Augmented: The next imm

In [7]:
def augment_positive_samples(df):
    augmenters = [
        naw.SynonymAug(),
        naw.AntonymAug(),
    ]

    augmented_dfs = [df]

    for aug in augmenters:
        aug_samples = df[df['label'] == 1].copy()
        aug_samples['text'] = aug_samples['text'].apply(lambda x: aug.augment(x)[0])
        augmented_dfs.append(aug_samples)

    result_df = pd.concat(augmented_dfs, ignore_index=True)
    return result_df

augmented_train_data_df = augment_positive_samples(train_data_df)

In [8]:
augmented_train_data_df

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,4341,@@17139403,poor-families,gb,"The scheme saw an estimated 150,000 children f...",1,4
1,4136,@@22273328,homeless,za,Durban 's homeless communities reconciliation ...,1,2
2,10352,@@21102155,poor-families,lk,The next immediate problem that cropped up was...,1,4
3,8279,@@21220476,vulnerable,nz,Far more important than the implications for t...,1,2
4,1164,@@14727121,poor-families,gh,To strengthen child-sensitive social protectio...,1,4
...,...,...,...,...,...,...,...
9958,873,@@20374243,poor-families,sg,Citing the fact that these kids who be born at...,1,2
9959,10070,@@15573661,disabled,ng,Fern? ndez differ a badly - ignore philanthrop...,1,2
9960,6484,@@2559173,homeless,lk,"Touched much by their plight, Commanding Offic...",1,2
9961,6249,@@1947926,women,gh,She reiterated her ministry ' s commitment to ...,1,2


In [18]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreitudor/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreitudor/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreitudor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [35]:
def preprocess_text(text):
    
    # Removing punctuation
    text = re.sub(f'[{string.punctuation}]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '[NUM]', text)
    
    # Removing stopwords
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]
    
    # Reduce words to root form
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    text = ' '.join(tokens)
    return text

train_data_df['text'] = train_data_df['text'].apply(preprocess_text)
train_data_df


Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,4341,@@17139403,poor-families,gb,scheme saw estimated NUM child poor family sen...,1,4
1,4136,@@22273328,homeless,za,Durban homeless community reconciliation lunch,1,2
2,10352,@@21102155,poor-families,lk,next immediate problem cropped assist unfortun...,1,4
3,8279,@@21220476,vulnerable,nz,Far important implication Economy Gods dollar ...,1,2
4,1164,@@14727121,poor-families,gh,strengthen childsensitive social protection sy...,1,4
...,...,...,...,...,...,...,...
8370,8380,@@3172947,refugee,gb,Rescue team search survivor rubble building fo...,0,0
8371,8381,@@23593795,hopeless,ke,launch Happy Birthday took place last Saturday...,0,0
8372,8382,@@9222273,homeless,pk,unrest left least NUM people dead Nigeria made...,0,0
8373,8383,@@25979256,hopeless,pk,see perspective may journalist strictest sense...,0,0
