# Binary Classification of the "Don't Patronize Me!" Dataset

Perform binary classification to predict whether a text contains patronizing and condescending language (PCL). This was Task 4 (Subtask 1) of the SemEval-2022 competition.

## Initialize Environment

In [17]:
# Library imports
import pandas as pd
import matplotlib.pyplot as plt

from nltk import PorterStemmer, WordNetLemmatizer
import codecs

import torch
from torch.utils.data import DataLoader, Dataset, Sampler
from tqdm import tqdm
import torch.nn as nn

import re

import numpy as np
from numpy import dot
from numpy.linalg import norm



# import torch.optim as optim
# import torch.nn.functional as F

def fix_seed(seed=420.69):
  """Seed every random number generator this notebook can draw from.

  Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) so
  that shuffling and weight initialisation repeat across runs.

  Args:
    seed: value used for all generators. It is truncated to an integer
      first, because `np.random.seed` does not accept floats.
  """
  import random  # local import: `random` is not imported at the top of the notebook

  seed = int(seed)
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seed all GPUs, not only the current one.
  torch.cuda.manual_seed_all(seed)

# Seed the random number generators once, using the default seed.
fix_seed()

# Relative locations of the task data and of the pre-trained word embeddings.
data_path = '../data'
embeddings_path = '../word_embeddings'


In [2]:
# Use the first CUDA device when one is visible; otherwise fall back to the CPU and warn.
if torch.cuda.is_available():
  DEVICE = 'cuda:0'
else:
  print('WARNING: You may want to change the runtime to GPU for faster training!')
  DEVICE = 'cpu'

print(DEVICE)

cuda:0


In [22]:
# Hyper-parameters read by later cells (batch size for the loaders, GloVe vector width).
h_params = {
    'model': 'None',
    'batch_size': 128,
    'embedding_dimensions': 50,
}

## Raw Data

### Loading

Load the data into pandas dataframes

In [5]:
# Import Data
train_data_path = f'{data_path}/dontpatronizeme_pcl.tsv'
test_data_path  = f'{data_path}/task4_test.tsv'

# Both files are tab-separated and read without a header row after skipping their first
# four lines; the article id is dropped immediately. The test file carries no label column.
train_data = pd.read_csv(
    train_data_path,
    delimiter='\t',
    skiprows=4,
    header=None,
    names=['par_id','art_id','keyword','country_code', 'text','label'],
).drop(columns=['art_id'])

test_data = pd.read_csv(
    test_data_path,
    delimiter='\t',
    skiprows=4,
    header=None,
    names=['par_id','art_id','keyword','country_code', 'text'],
).drop(columns=['art_id'])

In [6]:
# Concatenate label information to train data
dev_label_path   = f'{data_path}/dev_semeval_parids-labels.csv'
train_label_path = f'{data_path}/train_semeval_parids-labels.csv'

dev_label   = pd.read_csv(dev_label_path, delimiter=',')
train_label = pd.read_csv(train_label_path, delimiter=',')

# Stack the dev and train label tables into one frame, keeping only their shared columns.
detailed_labels = pd.concat([dev_label, train_label], ignore_index=True, join='inner', names=['simple', 'detailed'])

# Join on the paragraph id. The two clashing `label` columns come back suffixed _x / _y and are
# renamed to `label` (from the TSV) and `label_detailed` (from the CSVs); the id is then dropped.
train_data = (
    train_data
    .merge(detailed_labels, on='par_id')
    .rename(columns={'label_x': 'label', 'label_y': 'label_detailed'})
    .drop(columns='par_id')
)
test_data = test_data.drop(columns='par_id')

In [7]:
# Add Binary Classification column for ease of checking
# A paragraph counts as patronizing when its label is 2, 3 or 4; every other label maps to False.
train_data['is_patronizing'] = train_data['label'].isin([2, 3, 4])

In [8]:
# Discard every row whose text is missing, in both the training and the test frame.
train_data = train_data.dropna(axis='index', how='any', subset=['text'])
test_data = test_data.dropna(axis='index', how='any', subset=['text'])

## Pre-Processing

#### Standardize Input

Sentences vary in length, but the networks implemented below need input tensors of constant dimensions, so sentences that are too short must be padded. We therefore provide an alternative implementation of the DataLoaders that also pre-processes the data.

In [None]:
# 1. Tokenize input
def get_tokenized_corpus(corpus):
  """Split every sentence of `corpus` on single spaces.

  Args:
    corpus: iterable of strings.

  Returns:
    A list with one list of tokens per sentence. Consecutive spaces yield
    empty-string tokens, because the split is on the literal ' ' character.
  """
  return [list(sentence.split(' ')) for sentence in corpus]

In [None]:
# 2. Create mappings from word to index
def get_word2idx(tokenized_corpus):
  """Build a token -> integer index mapping for a tokenized corpus.

  Args:
    tokenized_corpus: iterable of sentences, each an iterable of tokens.

  Returns:
    A dict that maps each distinct token to an index starting at 2, in order
    of first appearance. '<pad>' is mapped to 0 and '<unknown>' to 1; if
    either string also occurs as a corpus token, the reserved index wins.
  """
  # A dict de-duplicates while keeping first-seen order. Membership checks are
  # hash lookups, so this avoids scanning a growing list once per token.
  vocabulary = dict.fromkeys(
      token for sentence in tokenized_corpus for token in sentence
  )

  word2idx = {w: idx for idx, w in enumerate(vocabulary, start=2)}
  word2idx['<pad>'] = 0      # padding tokens assigned to index 0.
  word2idx['<unknown>'] = 1  # unknown tokens are assigned to index 1. (we can, during training, mask some of the inputs to <unknown> so the model knows what to do with these tokens also)

  return word2idx

In [None]:
class PreProcessor(Dataset):
    """Applies a fixed chain of transformation functions to data.

    NOTE(review): the original cell ended in an unfinished `def ` statement,
    which made it unparsable. `__call__` below implements the behaviour the
    constructor docstring describes; confirm it matches the intended design.
    """

    def __init__(self, chain_of_transformations):
        """PreProcessor which takes a list of functions which are chainable in the order provided that will transform the data."""
        self.chain_of_transformations = chain_of_transformations

    def __call__(self, data):
        """Feed `data` through every transformation, in the order provided.

        Each function receives the previous function's output. An empty chain
        returns `data` unchanged.
        """
        for transformation in self.chain_of_transformations:
            data = transformation(data)
        return data
    

In [24]:
class LabelledDataset(Dataset):
    """Dataset view over a labelled dataframe.

    Indexing returns a `(data, label)` pair of dicts. `data` holds the
    'keyword', 'country_code' and 'text' values of the row at position `idx`;
    `label` holds its 'label', 'label_detailed' and 'is_patronizing' values.
    """

    # Columns copied into each half of an item, in output order.
    _DATA_COLUMNS = ('keyword', 'country_code', 'text')
    _LABEL_COLUMNS = ('label', 'label_detailed', 'is_patronizing')

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional lookup (`iloc`), so the dataframe index need not be 0..n-1.
        def pick(columns):
            return {column: self.data[column].iloc[idx] for column in columns}

        return pick(self._DATA_COLUMNS), pick(self._LABEL_COLUMNS)
    
class WithheldDataset(Dataset):
    """Dataset view over a dataframe without labels.

    Indexing returns a dict with the 'keyword', 'country_code' and 'text'
    values of the row at position `idx`.
    """

    # Columns copied into each item, in output order.
    _DATA_COLUMNS = ('keyword', 'country_code', 'text')

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional lookup (`iloc`), so the dataframe index need not be 0..n-1.
        return {column: self.data[column].iloc[idx] for column in self._DATA_COLUMNS}

# Training batches are drawn in shuffled order.
train_loader = DataLoader(dataset=LabelledDataset(train_data), batch_size=h_params['batch_size'], shuffle=True)
# The withheld loader wraps the unlabelled test frame (it previously wrapped `train_data` by
# mistake). It is not shuffled, so predictions stay aligned with the rows of `test_data`.
test_loader = DataLoader(dataset=WithheldDataset(test_data), batch_size=h_params['batch_size'], shuffle=False)

### Pre-trained embeddings

In [25]:
# Define Embedding Class
class Encoder(nn.Module):
    """Embedding layer that turns token indices into dense vectors.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dimension: length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dimension):
        # nn.Module.__init__ must run before any sub-module is assigned. The original
        # `super(Encoder, self)` only built the proxy object and never called it.
        super().__init__()
        # create embedding object to store mappings from words to embedded vectors;
        # index 0 is reserved as the padding index
        self.tokenEmbeddings = nn.Embedding(num_embeddings=vocab_size
                                          , embedding_dim=embedding_dimension
                                          , padding_idx=0)

    def forward(self, x):
        """Embed a batch of token indices.

        Args:
            x: integer tensor of token indices, e.g. of size
               (batch_size, max_length of training sample).

        Returns:
            The embeddings of `x` with an extra axis of size 1 inserted at
            position 1, i.e. (batch_size, 1, max_length, embedding_dimension)
            for a 2-D input.
        """
        # The layer is stored as `tokenEmbeddings`; the original read the
        # non-existent attribute `self.embedding`.
        embedded = self.tokenEmbeddings(x)
        embedded = embedded.unsqueeze(1)

        return embedded

## Model

In [23]:
# Download the pre-trained model for word_embeddings
# This part is similar to the tutorial

# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip 1/glove.6B.zip

In [None]:
# Load Pre-trained Embeddings from GloVe

# TODO(review): an unfinished `word2idx = ` assignment stood here and made the whole cell
# unparsable. It presumably should be built with get_word2idx(get_tokenized_corpus(...));
# confirm the intended corpus and restore it.

w2i = {}    # word2index
i2w = {}    # index2word
wvecs = []  # word vectors

# The outer quotes differ from the ones around the dict key: reusing the same quote character
# inside an f-string is a syntax error before Python 3.12.
# NOTE(review): the file is now looked up under `embeddings_path`, mirroring how `data_path` is
# used for the task data; the original opened it from the working directory - confirm.
glove_path = f"{embeddings_path}/glove.6B.{h_params['embedding_dimensions']}d.txt"

# this is a large file, it will take a while to load in the memory!
with open(glove_path, 'r', encoding='utf-8') as f:
  index = 0
  # Iterate over the file handle so lines are streamed rather than all read up front.
  for line in tqdm(f):
    fields = line.strip().split()
    # Skip header-like lines (a first line typically contains vocab, dimensionality):
    # only lines with more than three fields are treated as "word v1 v2 ..." entries.
    if len(fields) > 3:
      word = fields[0]
      vec = [float(value) for value in fields[1:]]

      wvecs.append(vec)
      w2i[word] = index
      i2w[index] = word
      index += 1

wvecs = np.array(wvecs)

