In [6]:
import os
import re

import numpy as np
import pandas as pd

In [3]:
def _stack_labeled(pairs):
    """Turn a frame of sentence pairs into one long (text, label) frame.

    Every value in ``pairs['biased']`` becomes a row labelled 1 and every
    value in ``pairs['unbiased']`` becomes a row labelled 0; the biased rows
    come first. The original row index is kept, so each index value appears
    twice in the result.
    """
    biased = pd.DataFrame({'text': pairs['biased'], 'label': 1})
    unbiased = pd.DataFrame({'text': pairs['unbiased'], 'label': 0})
    return pd.concat([biased, unbiased])


def read_process_data(train_path, dev_path):
    """Load the train and dev TSV files and reshape each into (text, label) rows.

    Both files are read as tab-separated with no header row and must have
    exactly seven columns, which are named ``id, annot_old, annot_new, biased,
    unbiased, tags, roots``. Only ``biased`` and ``unbiased`` are used.

    Parameters
    ----------
    train_path, dev_path : str or path-like
        Locations of the training and development files.

    Returns
    -------
    (train_all, dev_all) : tuple of pandas.DataFrame
        Each frame has a ``text`` column and an integer ``label`` column
        (1 = taken from ``biased``, 0 = taken from ``unbiased``), with all
        biased rows followed by all unbiased rows. The index is not reset.

    Raises
    ------
    ValueError
        If either file does not parse into exactly seven columns.
    """
    columns = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']

    train = pd.read_csv(train_path, sep='\t', header=None)
    dev = pd.read_csv(dev_path, sep='\t', header=None)

    print("Train length: {}".format(len(train)))
    print("Dev length: {}".format(len(dev)))

    # Assigning the names directly doubles as a shape check: pandas raises
    # ValueError when the number of names and columns differ.
    train.columns = columns
    dev.columns = columns

    return _stack_labeled(train), _stack_labeled(dev)

In [4]:
# Load the WNC biased-word train/dev splits as (text, label) frames.
train, dev = read_process_data(
    train_path='bias_data/WNC/biased.word.train',
    dev_path='bias_data/WNC/biased.word.dev',
)

Train length: 53803
Dev length: 700


In [11]:
def strip_punc(s):
    """Return ``s`` lower-cased with every non-word character replaced by a space.

    A "word character" is whatever the regex class ``\\w`` matches (letters,
    digits and underscore), so underscores are kept. Each removed character
    becomes its own space; runs of spaces are not collapsed.
    """
    # Raw string: "\W" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r"\W", " ", s).lower()

In [14]:
# Store a lower-cased, punctuation-free copy of every sentence next to the original.
train['strip_text'] = train['text'].apply(strip_punc)
dev['strip_text'] = dev['text'].apply(strip_punc)

In [15]:
# Keep only the cleaned text and its label, in that order.
train = train.loc[:, ['strip_text', 'label']]
dev = dev.loc[:, ['strip_text', 'label']]

In [17]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('processed_data', exist_ok=True)

# Write both splits without the index column.
train.to_csv('processed_data/train_ann.csv', index=False)
dev.to_csv('processed_data/dev_ann.csv', index=False)