In [1]:
import pandas as pd
import numpy as np
import re

In [None]:
# Scratch check for the data directory.
# NOTE(review): the pattern contains no wildcard, so this can only match a path named
# exactly 'biaseddata'; the files loaded below live under 'bias_data/WNC/' — confirm
# which path was intended.
import glob
glob.glob('biaseddata')

In [30]:
# Column names given to the seven tab-separated fields of each raw split file.
_RAW_COLUMNS = ['id', 'annot_old', 'annot_new', 'biased', 'unbiased', 'tags', 'roots']


def _to_labeled_frame(split, column, label):
    """Return a (`text`, `label`) frame built from one sentence column of `split`."""
    return pd.DataFrame({'text': split[column], 'label': label})


def read_process_data(train_path, dev_path, test_path):
    """Load the three split files and turn each into a labelled sentence table.

    Each file is read as a header-less, tab-separated table and its columns are
    named ``id, annot_old, annot_new, biased, unbiased, tags, roots``. Assigning
    those names fails if a file does not have exactly seven columns.

    For every split, the ``biased`` sentences are labelled 1 and the ``unbiased``
    sentences are labelled 0, and the two halves are stacked (biased first).

    Note: the index is NOT reset when stacking, so every original row label
    appears twice in each returned frame.

    Parameters
    ----------
    train_path, dev_path, test_path : str
        Paths of the train, dev and test files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_all, dev_all, test_all)``, each with columns ``text`` and ``label``.
    """
    # Read all three files before printing anything, so a failed read produces no
    # partial length report.
    splits = [
        pd.read_csv(path, sep='\t', header=None)
        for path in (train_path, dev_path, test_path)
    ]

    for split_name, split in zip(('Train', 'Dev', 'Test'), splits):
        print("{} length: {}".format(split_name, len(split)))

    combined = []
    for split in splits:
        split.columns = _RAW_COLUMNS
        biased = _to_labeled_frame(split, 'biased', 1)
        unbiased = _to_labeled_frame(split, 'unbiased', 0)
        # Index deliberately left as-is: later cells rely on the repeated labels.
        combined.append(pd.concat([biased, unbiased]))

    train_all, dev_all, test_all = combined
    return train_all, dev_all, test_all

In [31]:
# Load the WNC word-level splits as (text, label) frames.
train, dev, test = read_process_data(
    'bias_data/WNC/biased.word.train',
    'bias_data/WNC/biased.word.dev',
    'bias_data/WNC/biased.word.test',
)

Train length: 53803
Dev length: 700
Test length: 1000


In [32]:
def strip_punc(s):
    """Replace every non-word character in `s` with a space and lower-case the result.

    Word characters (letters, digits, underscore) are kept. Each other character
    is replaced one-for-one, so a run of punctuation or whitespace becomes a run
    of spaces of the same length.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The lower-cased text with non-word characters blanked out.
    """
    # Raw string: in a normal literal the backslash-W is an invalid escape sequence,
    # which Python warns about at compile time.
    return re.sub(r"\W", " ", s).lower()

In [33]:
# Derive the normalised `strip_text` column on every split.
for split in (train, dev, test):
    split['strip_text'] = split['text'].apply(strip_punc)

In [34]:
# Keep only the model inputs. `.copy()` makes each result an independent frame, so the
# column assignment done later in the notebook does not write into a selection of the
# wider frame (the situation pandas reports as SettingWithCopyWarning).
train = train[['strip_text', 'label']].copy()
dev = dev[['strip_text', 'label']].copy()
test = test[['strip_text', 'label']].copy()

In [35]:
# Write all three splits without the row index so the files share the same
# two-column layout (strip_text, label).
train.to_csv('processed_data/train_ann.csv', index=False)
dev.to_csv('processed_data/dev_ann.csv', index=False)
test.to_csv('processed_data/test_ann.csv', index=False)

In [36]:
# Preview the first rows of the processed training frame.
train.head()

Unnamed: 0,strip_text,label
0,chloroform the molecular lifesaver an articl...,1
1,the free software gnu classpath project is onl...,1
2,other campaigners especially the controversia...,1
3,vocalist rob halford s performance is consider...,1
4,the proud general is a chinese animated featur...,1


In [37]:
#Add start and stop tokens. 
train['strip_text'] = train['strip_text'].apply(lambda x: "<start> " + x + " <stop>")
dev['strip_text'] = dev['strip_text'].apply(lambda x: "<start> " + x + " <stop>")
test['strip_text'] = test['strip_text'].apply(lambda x: "<start> "+ x + " <stop>")

In [38]:
# First processed training sentence, selected by position. The index labels repeat
# (biased and unbiased halves were stacked without resetting the index), so a label
# lookup such as [0] returns several rows rather than one value.
train['strip_text'].iloc[0]

'<start> chloroform  the molecular lifesaver  an article at oxford university providing interesting facts about chloroform  <stop>'

In [39]:
# Restrict every split to the two columns that get written out.
train, dev, test = (
    frame[['strip_text', 'label']] for frame in (train, dev, test)
)

In [40]:
# Write all three token-wrapped splits without the row index so the files share the
# same two-column layout (strip_text, label).
train.to_csv('processed_data/train_lstm.csv', index=False)
dev.to_csv('processed_data/dev_lstm.csv', index=False)
test.to_csv('processed_data/test_lstm.csv', index=False)

In [43]:
# Read the written training file back and show its last rows as a sanity check.
df = pd.read_csv('processed_data/train_lstm.csv')
df.tail(20)

Unnamed: 0,strip_text,label
107586,<start> the usb 3 0 computer cable standard ha...,0
107587,<start> alfredo oliva concert comedian con...,0
107588,<start> uak was occupied by the greek army bet...,0
107589,<start> karen taylor comedian born 1979 b...,0
107590,<start> babylon 5 is an american science ficti...,0
107591,<start> these books are considered to be part ...,0
107592,<start> in terms of public transport cumberna...,0
107593,<start> like the rest of nature man s mind is...,0
107594,<start> the band are popular for their 3d anim...,0
107595,<start> al najah secondary school is a seconda...,0
