# Binary Classification of the "Don't Patronize Me!" Dataset

Perform Binary Classification to predict whether a text contains patronizing and condescending language. The task was task 4 (subtask 1) in the SemEval 2022 competition.

## Initialize Environment

In [48]:
# Library imports
import pandas as pd
import matplotlib.pyplot as plt

from nltk import PorterStemmer, WordNetLemmatizer
import codecs

import torch
from torch.utils.data import DataLoader, Dataset, Sampler

import re

# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F

def fix_seed(seed=420.69):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
#   np.random.seed(seed)
#   random.seed(seed)

fix_seed()

data_path = '../data'
embeddings_path = '../word_embeddings'


In [3]:
if not torch.cuda.is_available():
  print('WARNING: You may want to change the runtime to GPU for faster training!')
  DEVICE = 'cpu'
else:
  DEVICE = 'cuda:0'

print(DEVICE)

cuda:0


In [4]:
h_params = {
    'model': 'None',
    'batch_size': 128
}

In [50]:
# Download the pre-trained model for word_embeddings
# This part is similar to the tutorial

# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip 1/glove.6B.zip

UsageError: Line magic function `%%bash` not found.


## Data

### Loading

Load the data into pandas dataframes

In [5]:
# Import Data
train_data_path = f'{data_path}/dontpatronizeme_pcl.tsv'
test_data_path  = f'{data_path}/task4_test.tsv'

train_data = pd.read_csv(train_data_path, delimiter='\t', skiprows=4, header=None, names=['par_id','art_id','keyword','country_code', 'text','label'])
test_data  = pd.read_csv(test_data_path,  delimiter='\t', skiprows=4, header=None, names=['par_id','art_id','keyword','country_code', 'text'])

train_data = train_data.drop(['art_id'], axis=1)
test_data = test_data.drop(['art_id'], axis=1)

In [6]:
# Concatenate label information to train data
dev_label_path   = f'{data_path}/dev_semeval_parids-labels.csv'
train_label_path = f'{data_path}/train_semeval_parids-labels.csv'

dev_label   = pd.read_csv(dev_label_path, delimiter=',')
train_label = pd.read_csv(train_label_path, delimiter=',')

detailed_labels = pd.concat([dev_label, train_label], ignore_index=True, join='inner', names=['simple', 'detailed'])
train_data = pd.merge(train_data, detailed_labels, on='par_id')
train_data = train_data.rename(columns={'label_x': 'label', 'label_y': 'label_detailed'})

train_data = train_data.drop('par_id', axis=1)
test_data = test_data.drop('par_id', axis=1)

In [7]:
# Add Binary Classification column for ease of checking
train_data.loc[:, 'is_patronizing'] = False
train_data.loc[train_data['label'].isin([2,3,4]), 'is_patronizing'] = True

In [8]:
train_data = train_data.dropna(subset = 'text')
test_data = test_data.dropna(subset = 'text')

### DataLoaders

In [45]:
class TrainDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

        self.stemming = False
        self.lemmatization = False
        self.word_embeddings = False

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        text = list(filter(None, re.split('[\s,/.\']', self.data['text'].iloc[idx])))

        if self.stemming:           
            porter = PorterStemmer()
            text = list(map(porter.stem, text))

        if self.lemmatization:
            wordnet_lemmatizer = WordNetLemmatizer()
            text = list(map(wordnet_lemmatizer.lemmatize, text))

        if self.word_embeddings:
            text = apply_word_embeddings(text)

        data = {'keyword'     : self.data['keyword'].iloc[idx],
                'country_code': self.data['country_code'].iloc[idx],
                'text'        : text}
        
        label = {'label'        : self.data['label'].iloc[idx],
                'label_detailed': self.data['label_detailed'].iloc[idx],
                'is_patronizing': self.data['is_patronizing'].iloc[idx]}

        return data, label
    
    def enable_stemming(self):
        self.stemming = True
        return self
    
    def enable_lemmatization(self):
        self.lemmatization = True
        return self
    
    def enable_word_embeddings(self):
        self.word_embeddings = True
        return self

    def apply_word_embeddings(text: str):
        w2i = [] # word2index
        i2w = [] # index2word
        wvecs = [] # word vectors

        # this is a large file, it will take a while to load in the memory!
        with codecs.open(f'{embeddings_path}/glove.6B.50d.txt', 'r','utf-8') as f:
            index = 0
            for line in tqdm(f.readlines()):
                # Ignore the first line - first line typically contains vocab, dimensionality
                if len(line.strip().split()) > 3:

                (word, vec) = (line.strip().split()[0],
                                list(map(float,line.strip().split()[1:])))

                wvecs.append(vec)
                w2i.append((word, index))
                i2w.append((index, word))
                index += 1

        w2i = dict(w2i)
        i2w = dict(i2w)
        wvecs = np.array(wvecs)

        return None

    def collate_fn(data_batch, label_batch):
        from torch.nn.utils.rnn import pad_sequence

        # Pad text sequences in the batch
        padded_text = pad_sequence([torch.tensor(data['text']) for data in data_batch], batch_first=True, padding_value=0)

        data_batch_padded = {
            'keyword': torch.tensor([data['keyword'] for data in data_batch]),
            'country_code': torch.tensor([data['country_code'] for data in data_batch]),
            'text': padded_text
        }

        return data_batch_padded, label_batch

        

# Assuming train_data is your DataFrame
custom_dataset = TrainDataset(train_data).enable_lemmatization()
train_loader = DataLoader(dataset=custom_dataset, batch_size=h_params['batch_size'], shuffle=True, collate_fn=custom_dataset.collate_fn)

for test, label in train_loader:
    print(len(test))
    print(len(label))
    print(test['text'])

TypeError: tuple indices must be integers or slices, not str