In [1]:
import pandas as pd
import numpy as np
import codecs 
import json
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
dicts = []
with codecs.open('/home/agrigorev/git-projects/allen-qa/lucene-features-2.json', 'r', 'utf-8') as f:
    for line in f:
        dicts.append(json.loads(line))


In [3]:
questions = pd.DataFrame(dicts)

In [4]:
def extract_doc_features(docs):
    top = docs[:3]
    names = [d['title'] for d in top]
    all_scores = [d['score'] for d in docs]
    if not all_scores:
        all_scores = [0]

    top_scores = [d['score'] for d in top]
    if not top_scores:
        top_scores = [0]

    median_score = np.median(all_scores)
    mean_score = np.mean(all_scores)
    min_score = np.min(all_scores)

    score_1 = np.max(top_scores)
    score_2 = np.median(top_scores)
    score_3 = np.min(top_scores)

    return pd.Series((names, median_score, mean_score, min_score, score_1, score_2, score_3))

def names(pref):
    res = ['names', 'median_score', 'mean_score', 'min_score', 'score_1', 'score_2', 'score_3']
    return [pref + '_' + n for n in res]

In [10]:
questions.columns

Index([u'answer', u'answerLetter', u'ck12EbookAnswerResult', u'ck12EbookBothQADocCount', u'ck12EbookBothQADocCountAMustHave', u'ck12EbookKendallTauCorr', u'ck12EbookQuestionResult', u'ck12EbookSpearmanCorr', u'ck12WikiAnswerResult', u'ck12WikiBothQADocCount', u'ck12WikiBothQADocCountAMustHave', u'ck12WikiKendallTauCorr', u'ck12WikiQuestionResult', u'ck12WikiSpearmanCorr', u'label', u'ngramsAnswer', u'ngramsCk12EbookAnswerResult', u'ngramsCk12EbookBothQADocCount', u'ngramsCk12EbookBothQADocCountAMustHave', u'ngramsCk12EbookKendallTauCorr', u'ngramsCk12EbookQuestionResult', u'ngramsCk12EbookSpearmanCorr', u'ngramsCk12WikiAnswerResult', u'ngramsCk12WikiBothQADocCount', u'ngramsCk12WikiBothQADocCountAMustHave', u'ngramsCk12WikiKendallTauCorr', u'ngramsCk12WikiQuestionResult', u'ngramsCk12WikiSpearmanCorr', u'ngramsQuestion', u'question', u'questionId', u'rawAnswer', u'rawQuestion', u'source', u'type', u'word2vecAnswer', u'word2vecCosine', u'word2vecMissing', u'word2vecQuestion', u'ck_12_na

In [7]:
questions[names('ck_12_answer')] = questions.ck12EbookAnswerResult.apply(extract_doc_features)
questions[names('ck_12_question')] = questions.ck12EbookQuestionResult.apply(extract_doc_features)

In [9]:
questions[names('wiki_answer')] = questions.ck12WikiAnswerResult.apply(extract_doc_features)
questions[names('wiki_question')] = questions.ck12WikiQuestionResult.apply(extract_doc_features)

In [70]:
questions.word2vecMissing

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
...
42513    False
42514    False
42515    False
42516    False
42517    False
42518    False
42519    False
42520    False
42521    False
42522    False
42523    False
42524    False
42525    False
42526    False
42527    False
Name: word2vecMissing, Length: 42528, dtype: bool

In [77]:
def NaNs_to_zeros(vec):
    if np.isnan(vec).all():
        return np.zeros_like(vec)
    else:
        return vec
    

In [42]:
questions.word2vecAnswer = questions.word2vecAnswer.apply(lambda x: np.array(x).astype('float'))
questions.word2vecQuestion = questions.word2vecQuestion.apply(lambda x: np.array(x).astype('float'))
questions.word2vecCosine = questions.word2vecCosine.astype('float')

In [79]:
questions.word2vecAnswer = questions.word2vecAnswer.apply(NaNs_to_zeros)
questions.word2vecQuestion = questions.word2vecQuestion.apply(NaNs_to_zeros)

In [14]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy import sparse
from collections import Counter

In [80]:
train = questions[questions.source == 'TRAIN']

In [18]:
train_f1 = list(train.apply(lambda x:'%s' % (x['question']),axis=1))
train_f2 = list(train.apply(lambda x:'%s' % (x['answer']),axis=1))
train_f3 = list(train.apply(lambda x:'%s' % (x['type']),axis=1))

In [54]:
tfv1 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 2), max_df=1.0, min_df=1)
train_f1 = tfv1.fit_transform(train_f1)

tfv2 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1)
train_f2 = tfv2.fit_transform(train_f2)

tfv3 = TfidfVectorizer(input=u'content', encoding=u'utf-8', decode_error=u'strict', 
                       strip_accents=None, lowercase=True, preprocessor=None, 
                       tokenizer=None, analyzer=u'word', stop_words=None, 
                       token_pattern=u'(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1)
train_f3 = tfv3.fit_transform(train_f3)



In [109]:
train.word2vecCosine[train.word2vecMissing] = 0.0

In [110]:
X = np.column_stack((np.vstack(train.word2vecQuestion), 
                     np.vstack(train.word2vecAnswer),
                     train.word2vecCosine))

In [112]:
X = sparse.hstack((X, train_f1, train_f2, train_f3))

In [113]:
X

<10000x25911 sparse matrix of type '<type 'numpy.float64'>'
	with 4203613 stored elements in COOrdinate format>

In [114]:
y = (train.labela == 'true').astype(int)

AttributeError: 'DataFrame' object has no attribute 'labela'

In [115]:
xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X, y, stratify=y)

In [116]:
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
clf.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [117]:
roc_auc_score(ytest, clf.predict_proba(xtest)[:,0])

0.66734293333333339