In [1]:
import pandas as pd
import numpy as np
import codecs 
import json
import matplotlib.pyplot as plt 
%matplotlib inline

In [3]:
dicts = []
with codecs.open('/home/agrigorev/git-projects/allen-qa/lucene-features-2.json', 'r', 'utf-8') as f:
    for line in f:
        dicts.append(json.loads(line))


In [4]:
questions = pd.DataFrame(dicts)

In [5]:
def extract_doc_features(docs):
    top = docs[:3]
    names = [d['title'] for d in top]
    all_scores = [d['score'] for d in docs]
    if not all_scores:
        all_scores = [0]

    top_scores = [d['score'] for d in top]
    if not top_scores:
        top_scores = [0]

    median_score = np.median(all_scores)
    mean_score = np.mean(all_scores)
    min_score = np.min(all_scores)

    score_1 = np.max(top_scores)
    score_2 = np.median(top_scores)
    score_3 = np.min(top_scores)

    return pd.Series((names, median_score, mean_score, min_score, score_1, score_2, score_3))

def names(pref):
    res = ['names', 'median_score', 'mean_score', 'min_score', 'score_1', 'score_2', 'score_3']
    return [pref + '_' + n for n in res]

In [6]:
questions.columns

Index([u'answer', u'answerLetter', u'ck12EbookAnswerResult', u'ck12EbookBothQADocCount', u'ck12EbookBothQADocCountAMustHave', u'ck12EbookKendallTauCorr', u'ck12EbookQuestionResult', u'ck12EbookSpearmanCorr', u'ck12WikiAnswerResult', u'ck12WikiBothQADocCount', u'ck12WikiBothQADocCountAMustHave', u'ck12WikiKendallTauCorr', u'ck12WikiQuestionResult', u'ck12WikiSpearmanCorr', u'label', u'ngramsAnswer', u'ngramsCk12EbookAnswerResult', u'ngramsCk12EbookBothQADocCount', u'ngramsCk12EbookBothQADocCountAMustHave', u'ngramsCk12EbookKendallTauCorr', u'ngramsCk12EbookQuestionResult', u'ngramsCk12EbookSpearmanCorr', u'ngramsCk12WikiAnswerResult', u'ngramsCk12WikiBothQADocCount', u'ngramsCk12WikiBothQADocCountAMustHave', u'ngramsCk12WikiKendallTauCorr', u'ngramsCk12WikiQuestionResult', u'ngramsCk12WikiSpearmanCorr', u'ngramsQuestion', u'question', u'questionId', u'rawAnswer', u'rawQuestion', u'source', u'type', u'word2vecAnswer', u'word2vecCosine', u'word2vecMissing', u'word2vecQuestion'], dtype='ob

In [7]:
questions[names('ck_12_answer')] = questions.ck12EbookAnswerResult.apply(extract_doc_features)
questions[names('ck_12_question')] = questions.ck12EbookQuestionResult.apply(extract_doc_features)

In [8]:
questions[names('wiki_answer')] = questions.ck12WikiAnswerResult.apply(extract_doc_features)
questions[names('wiki_question')] = questions.ck12WikiQuestionResult.apply(extract_doc_features)

In [19]:
questions[names('ck_12_ngram_answer')] = questions.ngramsCk12EbookAnswerResult.apply(extract_doc_features)
questions[names('ck_12_ngram_question')] = questions.ngramsCk12EbookQuestionResult.apply(extract_doc_features)

In [20]:
# something is wrong with ngram wiki, skipping it for now

In [22]:
questions.word2vecAnswer = questions.word2vecAnswer.apply(lambda x: np.array(x).astype('float'))
questions.word2vecQuestion = questions.word2vecQuestion.apply(lambda x: np.array(x).astype('float'))
questions.word2vecCosine = questions.word2vecCosine.astype('float')

In [21]:
def NaNs_to_zeros(vec):
    if np.isnan(vec).all():
        return np.zeros_like(vec)
    else:
        return vec

In [23]:
questions.word2vecAnswer = questions.word2vecAnswer.apply(NaNs_to_zeros)
questions.word2vecQuestion = questions.word2vecQuestion.apply(NaNs_to_zeros)
questions.word2vecCosine[questions.word2vecCosine.isnull()] = 0.0

In [28]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy import sparse
from collections import Counter

In [29]:
train = questions[questions.source == 'TRAIN']

In [56]:
train_f1 = list(train.apply(lambda x:'%s' % (x['question']),axis=1))
train_f2 = list(train.apply(lambda x:'%s' % (x['answer']),axis=1))
train_f3 = list(train.apply(lambda x:'%s' % (x['type']),axis=1))

In [57]:
tfv1 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 2), max_df=1.0, min_df=1)
train_f1 = tfv1.fit_transform(train_f1)

tfv2 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1)
train_f2 = tfv2.fit_transform(train_f2)

tfv3 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1)
train_f3 = tfv3.fit_transform(train_f3)

In [58]:
X_w2v = np.column_stack((np.vstack(train.word2vecQuestion), 
                     np.vstack(train.word2vecAnswer),
                     train.word2vecCosine))

X = sparse.hstack((X_w2v, train_f1, train_f2, train_f3))

In [33]:
y = (train.label == 'true').astype(int)

In [34]:
aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.665064106667
1 0.657302613333
2 0.684780373333
3 0.677547946667
4 0.66137088
5 0.682840746667
6 0.679188053333
7 0.67444736
8 0.687162453333
9 0.67910272


(0.67488072533333343, 0.0097147205101174184)

In [35]:
from sklearn.decomposition import randomized_svd
from sklearn.decomposition import NMF
from sklearn.random_projection import GaussianRandomProjection

In [36]:
f3 = sparse.hstack((train_f1, train_f2))
f3.shape

In [38]:
from sklearn.preprocessing import Normalizer

def svd(X, K):
    normalizer = Normalizer(copy=False)
    _, _, Vt = randomized_svd(X, n_components=K)
    X_red = X.dot(Vt.T)
    X_red = normalizer.fit_transform(X_red)
    return Vt, normalizer, X_red

In [46]:
_, _, f12_lsa = svd(f3, 100)

In [47]:
X_2 = sparse.hstack((X_w2v, train_f1, train_f2, train_f3, f12_lsa))

In [48]:
aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X_2, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.665908906667
1 0.660366933333
2 0.677654613333
3 0.6459136
4 0.668356693333
5 0.654816853333
6 0.663981226667
7 0.680082773333
8 0.6553152
9 0.659414613333


(0.66318114133333339, 0.0099102097107178558)

LSA is not useful

In [52]:
def identity_analyzer(x):
    return x

In [53]:
tf_ck12_wiki_a = TfidfVectorizer(analyzer=identity_analyzer, min_df=1)
train_ck12_wiki_ans_docs = tf_ck12_wiki_a.fit_transform(train.ck_12_answer_names)

tf_ck12_wiki_q = TfidfVectorizer(analyzer=identity_analyzer, min_df=1)
train_ck12_wiki_q_docs = tf_ck12_wiki_q.fit_transform(train.ck_12_question_names)

In [59]:
X_3 = sparse.hstack((X_w2v, train_f1, train_f2, train_f3, train_ck12_wiki_ans_docs, train_ck12_wiki_q_docs))

In [60]:
aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X_3, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.6610752
1 0.673678933333
2 0.67387136
3 0.693143466667
4 0.687525546667
5 0.663442773333
6 0.674038186667
7 0.6738944
8 0.664942933333
9 0.6731008


(0.67387136000000003, 0.0095431860244120301)

In [73]:
X_4 = sparse.hstack((train_ck12_wiki_ans_docs, train_ck12_wiki_q_docs))

In [74]:
aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X_4, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.621911893333
1 0.629597866667
2 0.651472213333
3 0.62739584
4 0.64610944
5 0.629339306667
6 0.644045226667
7 0.629289813333
8 0.61450624
9 0.620079786667


(0.6313747626666667, 0.011448613946852457)

In [84]:
ck12_doc_features = sparse.hstack([train_ck12_wiki_ans_docs, train_ck12_wiki_q_docs])
_, _, ck12_doc_lsa = svd(ck12_doc_features, 100)

In [77]:
aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(ck12_doc_lsa, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.637835946667
1 0.628809386667
2 0.6250176
3 0.630808746667
4 0.648206506667
5 0.641179733333
6 0.617064533333
7 0.641742506667
8 0.618552746667
9 0.620676266667


(0.63098939733333337, 0.010286674028220841)

In [82]:
ck_12_num_feature_names = names('ck_12_question')[1:] + names('ck_12_answer')[1:] 
ck_12_num_features = train[ck_12_num_feature_names]

In [80]:
X_5 = np.hstack([ck12_doc_lsa, ck_12_num_features.values])

aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X_5, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.61666816
1 0.63328896
2 0.632398506667
3 0.609560746667
4 0.624852906667
5 0.622763093333
6 0.623492693333
7 0.63824512
8 0.6222592
9 0.61336704


(0.6236896426666666, 0.0086001907230712753)

In [85]:
X_5 = np.hstack([X_w2v, ck12_doc_lsa, ck_12_num_features.values])

aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X_5, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.609606826667
1 0.637312853333
2 0.608776533333
3 0.60228736
4 0.62870784
5 0.619857493333
6 0.612011946667
7 0.615172693333
8 0.62423168
9 0.627469226667


(0.61854344533333339, 0.010309686616197565)

In [87]:
X_5 = sparse.hstack([train_f1, train_f2, ck12_doc_lsa, ck_12_num_features.values])

aucs = []

for i in range(10):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X_5, y, stratify=y)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    clf.fit(xtrain, ytrain)
    auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])
    aucs.append(auc)
    print i, auc

np.mean(aucs), np.std(aucs)

0 0.719573333333
1 0.717218986667
2 0.73941376
3 0.73192832
4 0.725570133333
5 0.728451413333
6 0.719953493333
7 0.728755626667
8 0.7187456
9 0.742381653333


(0.72719923200000003, 0.0083092323366117122)