In [33]:
import sys, os, collections, copy, re
import nltk, codecs
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from util import *
from PairWiseRanker import *

In [2]:
from IPython.display import clear_output

In [3]:
UNKNOWN_WORD = ':-)'

In [4]:
data_dir = 'data/'
fn = {'trn':data_dir + 'WikiQA-train.tsv', 'dev':data_dir + 'WikiQA-dev.tsv', 'test':data_dir + 'WikiQA-test.tsv'}

In [5]:
#read_data = lambda fn : pd.read_csv(fn, sep='\t', header=0, index_col='QuestionID')
def read_data(f):
    data = pd.read_csv(f, sep='\t', header=0)
    data['QAID'] = data['QuestionID'] + '*' + data['SentenceID']
    data.index = data['QAID']
    for col in ['Question', 'Sentence']:
        data[col] = map(lambda x : codecs.decode(x.lower(), 'UTF-8'), data[col])
    questions = dict(zip(data['QuestionID'], data['Question']))
    return data, questions

In [6]:
data, questions = {}, {}
for k in fn:
    data[k], questions[k] = read_data(fn[k])
    print k, data[k].shape, len(set(data[k]['QAID']))

test (2349, 7) 2349
dev (1130, 8) 1130
trn (20347, 8) 20347


In [7]:
data_trn, data_dev, data_test = data['trn'], data['dev'], data['test'] # for convienience

In [8]:
# distribution of number of answers each question has
for k in ['trn', 'dev']:
    grp = data[k]['Label'].groupby(data[k]['QuestionID'])
    print k, '\n', grp.agg(lambda v : v.sum()).value_counts()

trn 
0    1245
1     744
2     103
3      20
7       2
5       1
6       1
4       1
Name: Label, dtype: int64
dev 
1    115
2      9
4      1
3      1
Name: Label, dtype: int64


In [9]:
t = set(questions['trn']), set(questions['dev']), set(questions['test'])
print len(t[0] | t[1] | t[2]), sum([len(v) for v in t])

2486 2486


In [10]:
# qtokens = {k:{qid:nltk.word_tokenize(q) for qid, q in questions[k].items() } for k in fn }

In [11]:
regex_tokenizer = RegexpTokenizer(r'\w+')
lancaster = nltk.LancasterStemmer()
def sentence2words(sent):
    res = {}
    res['tokens'] = nltk.word_tokenize(sent)
    res['words'] = regex_tokenizer.tokenize(sent)
    res['stems'] = [lancaster.stem(w) for w in res['words']]
    res['words_not_stop'] = filter(lambda word : word not in stopwords.words('english'), res['words'])
    res['stems_not_stop'] = [lancaster.stem(w) for w in res['words_not_stop']]
    return res

In [12]:
tps = ['words', 'stems', 'words_not_stop', 'stems_not_stop']

In [13]:
sent_words = dict()

In [14]:
for k in fn:
    for qid, q in questions[k].items():
        sent_words[qid] = sentence2words(q)
    d = data[k]
    for qaid in d.index:
        sent_words[qaid] = sentence2words(d.get_value(qaid, 'Sentence'))

In [15]:
#first_word = lambda sent : sent[:sent.find(' ')]
first_token = lambda tokens : tokens[0]
second_token = lambda tokens : UNKNOWN_WORD if len(tokens) == 1 else tokens[1]
first_two_tokens = lambda tokens : ' '.join(tokens[:2])

In [16]:
for k in fn:
    for col, func in [('token1', first_token), ('token2', second_token), ('token12', first_two_tokens)]:
        qids = list(set(data[k]['QuestionID']))
        tmp = Series(map(func, map(lambda qid : sent_words[qid]['words'], qids)), index=qids)
        data[k][col] = tmp[data[k]['QuestionID']].values

In [17]:
def get_question_type(data_x):
    qtype = Series(data_x['token1'], index=data_x.index)
    qtype[data_x['token12'] == 'what year'] = 'time'
    qtype[data_x['token1'] == 'when'] = 'time'
    qtype[data_x['token12'] == 'how many'] = 'number'
    return qtype

In [18]:
for k in fn:
    data[k]['qtype'] = get_question_type(data[k])

In [19]:
for k in fn:
    print k, '\n'
    for tk in ['token1', 'token12']:
        print data[k][tk].value_counts().head(10) / float(data[k].shape[0]) * 100
    print '\n'

test 

what      49.510430
time      12.260536
who       11.579395
how       11.196254
where     10.429970
number     5.023414
Name: token1, dtype: float64
what is      22.307365
what are      5.832269
when did      5.704555
how many      5.023414
what year     3.490847
where was     2.937420
where is      2.894849
what does     2.724564
how did       2.256279
who is        2.256279
Name: token12, dtype: float64


dev 

what        52.654867
time        15.486726
where       10.000000
who          9.646018
number       6.725664
how          5.309735
whatever     0.176991
Name: token1, dtype: float64
what is      14.424779
what are     10.265487
when did      9.203540
how many      6.725664
where is      5.132743
what does     4.070796
what was      3.982301
what year     3.539823
who is        2.743363
when was      2.743363
Name: token12, dtype: float64


trn 

what      48.695139
how       13.358235
who       12.576793
time      10.212808
where      7.647319
number     7.509707
Name:

In [20]:
features = {key:DataFrame({}, index=data[key].index) for key in fn}
features_trn = features['trn']
features_dev = features['dev']
features_test = features['test']

In [21]:
# words, stems
for k in fn:
    d = data[k]
    for tp in tps:
        func = lambda qaid : count_num_cooccur(sent_words[qaid][tp], sent_words[d.get_value(qaid, 'QuestionID')][tp])
        features[k][tp] = d['QAID'].apply(func)

In [22]:
# question type
for k in fn:
    f, d = features[k], data[k]
    f['qtype_time'] = (d['qtype'] == 'time') + 0
    f['qtype_number'] = (d['qtype'] == 'number') + 0
    f['year'] = map(lambda sent : int(re.search('[0-9]{4}', sent) != None), d.Sentence)
    f['number'] = map(lambda sent : int(re.search('[0-9]+', sent) != None), d.Sentence)
    f['number'] = f['number'] - f['year']

In [23]:
print features_trn.year.value_counts(), '\n', features_trn.number.value_counts()

0    15788
1     4559
Name: year, dtype: int64 
0    17958
1     2389
Name: number, dtype: int64


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [77]:
from learning2rank.rank import RankNet, ListNet
reload(RankNet)
reload(ListNet)

<module 'learning2rank.rank.ListNet' from 'learning2rank/rank/ListNet.py'>

In [185]:
models = {}

In [186]:
indiv = [49, 'entropy', 13, 3, 5]
models['rf'] = model_rf = RandomForestClassifier(n_estimators=indiv[0], criterion=indiv[1], 
                                                 max_depth=indiv[2], min_samples_split=indiv[3], 
                                                 min_samples_leaf=indiv[4], n_jobs=-1) # 

In [187]:
models['lr_l1'] = model_lr_l1 = LogisticRegression(penalty='l1', n_jobs=-1)
models['lr_l2'] = model_lr_l2 = LogisticRegression(penalty='l2', n_jobs=-1)

In [29]:
models['svm'] = model_svm = SVC(probability=True)

In [87]:
rk_models = {}

In [88]:
rk_models['ranknet'] = rk_model_ranknet = RankNet.RankNet(silent=True)

In [89]:
#rk_models['listnet'] = rk_model_listnet = ListNet.ListNet(silent=True)

In [205]:
my_models = {}
pair_ranker = PairWiseRanker(data_trn.QuestionID, features_trn, data_trn.Label)

In [215]:
indiv = [746, 'gini', 7, 6, 4] # 0.679 
my_models['my_rf'] = my_models_rf = RandomForestClassifier(n_estimators=indiv[0], criterion=indiv[1], 
                                                 max_depth=indiv[2], min_samples_split=indiv[3], 
                                                 min_samples_leaf=indiv[4], n_jobs=-1)

In [60]:
rk_models['listnet'] = rk_model_listnet = ListNet.ListNet()

In [188]:
for mn, md in models.items():
    md.fit(pd.get_dummies(features_trn).values, data_trn.Label)

In [90]:
# %%capture
for mn in rk_models: # ['listnet']: # 
    args = {}
    if mn == 'ranknet':
        args = {} # {'batchsize':100, 'n_iter':5000, 'n_units1':512, 'n_units2':128, 'tv_ratio':0.95}
    elif mn == 'listnet':
        args = {'batchsize':100, 'n_epoch':1, 'n_units1':32, 'n_units2':16, 'tv_ratio':0.9}
    rk_models[mn].fit(pd.get_dummies(features_trn).values, data_trn.Label, **args)

# clear_output()

  0%|          | 7/5000 [00:00<01:15, 65.70it/s]

load dataset
('The number of data, train:', 19329, 'validate:', 1018)
prepare initialized model!


100%|██████████| 5000/5000 [00:36<00:00, 135.83it/s]


save the model
save the optimizer


In [216]:
for mn in my_models:
    pair_ranker.fit(my_models[mn], mn)

In [47]:
#Series(model_rf.feature_importances_, index=features_trn.columns).sort_values(ascending=False)

In [192]:
sfn = ['dev']

In [193]:
preds = {k:{mn:md.predict_proba(pd.get_dummies(features[k])) for mn, md in models.items()} for k in sfn}

In [194]:
rk_preds = {k:{mn:md.predict(pd.get_dummies(features[k]).values) for mn, md in rk_models.items()} for k in sfn}

In [217]:
my_preds = {k:{mn:pair_ranker.predict(data[k]['QuestionID'], features[k], mn) for mn in my_models} for k in sfn}

In [218]:
#rk_preds['dev']['listnet']

In [219]:
ranks = {k:copy.deepcopy(features[k]) for k in fn}
rank_trn, rank_dev, rank_test = ranks['trn'], ranks['dev'], ranks['test']
for k in sfn:
    for col in ['QuestionID', 'SentenceID']:
        ranks[k][col] = data[k][col]
    if k != 'test':
        ranks[k]['Label'] = data[k]['Label']
    for mn in models:
        ranks[k][mn] = preds[k][mn][:, 1] # larger is better
    for mn in rk_models:
        ranks[k][mn] = rk_preds[k][mn]
    for mn in my_models:
        ranks[k][mn] = my_preds[k][mn]
    grp = ranks[k].groupby('QuestionID')
    for col in tps + models.keys() + rk_models.keys() + my_models.keys():
        ranks[k]['rank_' + col] = grp[col].rank(method='min', ascending=False).astype(int)    

In [220]:
for k in sfn:
    print k
    rk = ranks[k]
    for tp in tps + models.keys() + rk_models.keys() + my_models.keys():
        rk.sort_values(['QuestionID', 'rank_' + tp], inplace=True)
        grp = rk.Label.groupby(rk.QuestionID)
        print '\t', '{: <15}'.format(tp), np.round([MAP(grp), MRR(grp), MAP(grp, keep_no_ans=False), MRR(grp, keep_no_ans=False)], 5)

dev
	words           [ 0.66772  0.67436  0.66772  0.67436]
	stems           [ 0.66973  0.67913  0.66973  0.67913]
	words_not_stop  [ 0.65747  0.66276  0.65747  0.66276]
	stems_not_stop  [ 0.67028  0.67904  0.67028  0.67904]
	rf              [ 0.6447   0.65302  0.6447   0.65302]
	lr_l1           [ 0.65321  0.66196  0.65321  0.66196]
	lr_l2           [ 0.65321  0.66196  0.65321  0.66196]
	ranknet         [ 0.65613  0.66154  0.65613  0.66154]
	my_rf           [ 0.67083  0.67863  0.67083  0.67863]


In [222]:
col = 'rank_' + 'my_rf'
rank_dev[['QuestionID', 'SentenceID', col]].sort_values(['QuestionID', col]).to_csv('rank.txt', sep='\t', header=False, index=False)

In [104]:
class t(object):
    x = 1
    y = [x]
    def __init__(self, v):
        print v + t.y[0]

In [105]:
t(10)

11


<__main__.t at 0x7f7b73608790>

In [108]:
x = collections.Counter()

<dictionary-keyiterator at 0x7f7b75c5f2b8>

In [110]:
y = data_dev.Label

In [111]:
d = DataFrame({}, index=data_dev.index)

In [112]:
d['x'] = range(len(data_dev.index))

In [113]:
d['y']  = y

In [118]:
y.head()

QAID
Q11*D11-0    0
Q11*D11-1    0
Q11*D11-2    0
Q11*D11-3    1
Q11*D11-4    1
Name: Label, dtype: int64

In [115]:
d.sort_values('y', inplace=True)

In [116]:
d.head()

Unnamed: 0_level_0,x,y
QAID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q11*D11-0,0,0
Q1833*D1734-6,707,0
Q1833*D1734-7,708,0
Q1833*D1734-8,709,0
Q1840*D1740-1,711,0


In [251]:
reload(base)
reload(creator)
reload(tools)

<module 'deap.tools' from '/home/pkuas/chenqy/anaconda2/lib/python2.7/site-packages/deap/tools/__init__.pyc'>

In [262]:
# g = RFGA(data_trn.QuestionID, features_trn, data_trn.Label, data_dev.QuestionID, features_dev, data_dev.Label)

In [264]:
#g.run(NPOP=10, NGEN=10, CXPB=0.5, MUTPB=0.5)