In [411]:
import sys, os, collections, copy, re
import nltk, codecs
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from util import *

In [89]:
# Placeholder that second_token returns when a token list holds exactly one token.
UNKNOWN_WORD = ':-)'

In [71]:
# Directory holding the WikiQA TSV files, and one path per split.
data_dir = 'data/'
fn = {
    'trn': data_dir + 'WikiQA-train.tsv',
    'dev': data_dir + 'WikiQA-dev.tsv',
    'test': data_dir + 'WikiQA-test.tsv',
}

In [272]:
#read_data = lambda fn : pd.read_csv(fn, sep='\t', header=0, index_col='QuestionID')
def read_data(f):
    """Load one tab-separated split and normalise its text columns.

    Parameters
    ----------
    f : path or file-like object accepted by ``pd.read_csv``; the table must
        contain the columns QuestionID, SentenceID, Question and Sentence.

    Returns
    -------
    (data, questions)
        ``data`` is the frame with an added ``QAID`` column
        (``QuestionID + '*' + SentenceID``) that is also used as the index,
        and with Question/Sentence lower-cased as text.
        ``questions`` maps each QuestionID to its normalised question.
    """
    def _to_lower_text(value):
        # Decode first, then lower-case: lower-casing the raw bytes would
        # leave non-ASCII capitals untouched. Values that are already text
        # are only lower-cased.
        if isinstance(value, bytes):
            value = codecs.decode(value, 'UTF-8')
        return value.lower()

    data = pd.read_csv(f, sep='\t', header=0)
    data['QAID'] = data['QuestionID'] + '*' + data['SentenceID']
    data.index = data['QAID']
    for col in ['Question', 'Sentence']:
        # A real list (not a lazy map object) so the column assignment is
        # well-defined regardless of interpreter version.
        data[col] = [_to_lower_text(x) for x in data[col]]
    questions = dict(zip(data['QuestionID'], data['Question']))
    return data, questions

In [273]:
data, questions = {}, {}
for k in fn:
    data[k], questions[k] = read_data(fn[k])
    print k, data[k].shape, len(set(data[k]['QAID']))

test (2349, 7) 2349
dev (1130, 8) 1130
trn (20347, 8) 20347


In [377]:
# Short aliases for convenience; they refer to the same frames stored in `data`.
data_trn = data['trn']
data_dev = data['dev']
data_test = data['test']

In [383]:
# distribution of number of answers each question has
for k in ['trn', 'dev']:
    grp = data[k]['Label'].groupby(data[k]['QuestionID'])
    print k, '\n', grp.agg(lambda v : v.sum()).value_counts()

trn 
0    1245
1     744
2     103
3      20
7       2
5       1
6       1
4       1
Name: Label, dtype: int64
dev 
1    115
2      9
4      1
3      1
Name: Label, dtype: int64


In [274]:
t = set(questions['trn']), set(questions['dev']), set(questions['test'])
print len(t[0] | t[1] | t[2]), sum([len(v) for v in t])

2486 2486


In [242]:
# qtokens = {k:{qid:nltk.word_tokenize(q) for qid, q in questions[k].items() } for k in fn }

In [275]:
regex_tokenizer = RegexpTokenizer(r'\w+')
lancaster = nltk.LancasterStemmer()
# Built once: the stop-word list used to be re-evaluated for every single word.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def sentence2words(sent):
    """Break a sentence into several token views.

    Parameters
    ----------
    sent : the sentence text.

    Returns
    -------
    dict with the keys
        'tokens'         -- output of nltk.word_tokenize(sent)
        'words'          -- matches of the r'\\w+' regexp tokenizer
        'words_not_stop' -- 'words' without English stop words
        'stems'          -- Lancaster stems of 'words_not_stop'
    All values are lists.
    """
    words = regex_tokenizer.tokenize(sent)
    # List comprehension (not filter) so the result is always a reusable list.
    words_not_stop = [word for word in words if word not in _ENGLISH_STOPWORDS]
    return {
        'tokens': nltk.word_tokenize(sent),
        'words': words,
        'words_not_stop': words_not_stop,
        'stems': [lancaster.stem(w) for w in words_not_stop],
    }

In [276]:
# Token views (see sentence2words) keyed by question ID and by QAID.
sent_words = {}

In [277]:
# Tokenise every question (keyed by QuestionID) and every candidate sentence (keyed by QAID).
for k in fn:
    for qid, q in questions[k].items():
        sent_words[qid] = sentence2words(q)
    d = data[k]
    # Walk index and column together instead of a per-row DataFrame.get_value
    # lookup, which is deprecated and absent from newer pandas releases.
    for qaid, sentence in zip(d.index, d['Sentence']):
        sent_words[qaid] = sentence2words(sentence)

In [281]:
#first_word = lambda sent : sent[:sent.find(' ')]
def first_token(tokens):
    """Return the first element of a token list."""
    return tokens[0]


def second_token(tokens):
    """Return the second element, or UNKNOWN_WORD when there is exactly one token."""
    if len(tokens) == 1:
        return UNKNOWN_WORD
    return tokens[1]


def first_two_tokens(tokens):
    """Return the first two tokens joined by a single space."""
    leading = tokens[:2]
    return ' '.join(leading)

In [282]:
# Add question-prefix columns (first token, second token, first two tokens) to every row.
# Token lists are read from sent_words[qid]['tokens'] (nltk.word_tokenize of the question);
# the previous version read them from `qtokens`, which no live cell defines.
for k in fn:
    for col, func in [('token1', first_token), ('token2', second_token), ('token12', first_two_tokens)]:
        # One value per question, then broadcast to the rows via their QuestionID.
        per_question = {qid: func(sent_words[qid]['tokens']) for qid in questions[k]}
        data[k][col] = data[k]['QuestionID'].map(per_question)

In [354]:
# Preview the first training rows, including the token1/token2/token12 columns.
data['trn'].head()

Unnamed: 0_level_0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,QAID,token1,token2,token12
QAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Q1*D1-0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0,Q1*D1-0,how,are,how are
Q1*D1-1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,the ice facade is approximately 60 m high,0,Q1*D1-1,how,are,how are
Q1*D1-2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0,Q1*D1-2,how,are,how are
Q1*D1-3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1,Q1*D1-3,how,are,how are
Q1*D1-4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"glacier caves are often called ice caves , but...",0,Q1*D1-4,how,are,how are


In [284]:
for k in fn:
    print k, '\n'
    print data[k]['token1'].value_counts(), data[k]['token2'].value_counts().head(10), '\n'

test 

what     1245
how       381
who       272
where     245
when      206
Name: token1, dtype: int64 is       674
did      205
are      171
was      157
does     134
many     118
year      82
wrote     51
do        42
much      41
Name: token2, dtype: int64 

dev 

what        635
how         136
when        135
where       113
who         109
whatever      2
Name: token1, dtype: int64 is      252
did     143
are     126
was      99
many     76
does     46
year     40
won      31
part     25
kind     25
Name: token2, dtype: int64 

trn 

what     10107
how       4246
who       2559
when      1879
where     1556
Name: token1, dtype: int64 is         5678
was        1736
did        1602
many       1528
are        1324
does       1243
do          492
much        355
year        199
country     189
Name: token2, dtype: int64 



In [346]:
# One initially column-less feature frame per split, sharing the QAID index of the data frame.
features = {key: DataFrame(index=data[key].index) for key in fn}
# Aliases to the same frame objects (not copies).
features_trn, features_dev, features_test = (features[key] for key in ('trn', 'dev', 'test'))

In [347]:
# num_cooccur: count_word_cooccurrence applied to the stems of each candidate
# sentence and the stems of its question.
for k in fn:
    d = data[k]
    # Pair each QAID with its QuestionID directly; this replaces a lambda that
    # closed over the loop global `d` and used the deprecated DataFrame.get_value.
    cooccur = [
        count_word_cooccurrence(sent_words[qaid]['stems'], sent_words[qid]['stems'])
        for qaid, qid in zip(d['QAID'], d['QuestionID'])
    ]
    # Wrap in a Series indexed like `d` so the assignment aligns on QAID even
    # if features[k] has been re-ordered in the meantime.
    features[k]['num_cooccur'] = Series(cooccur, index=d.index)

In [353]:
# Name the split explicitly rather than relying on `k` left over from an earlier loop.
# NOTE(review): 'test' chosen because the recorded output lists no Label column - confirm.
data['test'].columns

Index([u'QuestionID', u'Question', u'DocumentID', u'DocumentTitle',
       u'SentenceID', u'Sentence', u'QAID', u'token1', u'token2', u'token12'],
      dtype='object')

In [424]:
# Copy the identifying columns (and Label, except for the test split) into the
# feature frames, rank the candidates of each question, and sort by that rank.
for k in fn:
    for col in ['QuestionID', 'SentenceID']:
        features[k][col] = data[k][col]
    # NOTE(review): the test split is skipped here, presumably because it has no Label column - confirm.
    if k != 'test':
        features[k]['Label'] = data[k]['Label']
    # Within each question, the largest num_cooccur gets rank 1; ties share the lowest rank (method='min').
    features[k]['rank'] = features[k].groupby('QuestionID')['num_cooccur'].rank(method='min', ascending=False).astype(int)
    # Sorted in place on purpose: features_trn / features_dev / features_test refer to these same objects.
    features[k].sort_values(['QuestionID', 'rank'], inplace=True)

In [425]:
# First six rows of the training features, ordered by QuestionID and rank.
features_trn.head(6)

Unnamed: 0_level_0,num_cooccur,QuestionID,Label,rank,SentenceID
QAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q1*D1-2,3,Q1,0,1,D1-2
Q1*D1-3,3,Q1,1,1,D1-3
Q1*D1-0,2,Q1,0,3,D1-0
Q1*D1-4,2,Q1,0,3,D1-4
Q1*D1-1,0,Q1,0,5,D1-1
Q10*D10-0,3,Q10,0,1,D10-0


In [429]:
for k in ['trn', 'dev']:
    ft = features[k]
    grp = ft.Label.groupby(ft.QuestionID)
    print k, MAP(grp), MRR(grp)

trn 0.272069444101 0.27696773753
dev 0.678348481325 0.686779930828


In [427]:
# Write the dev ranking as tab-separated QuestionID / SentenceID / rank, without header or index.
rank_columns = ['QuestionID', 'SentenceID', 'rank']
features_dev[rank_columns].to_csv('rank.txt', sep='\t', header=False, index=False)