In [None]:
import json
import numpy as np
import os
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# Show the kernel's working directory: the data paths used in this notebook
# are relative ("../data/...") and are resolved against it.
os.getcwd()

In [None]:
# Directory holding the JSON-lines dump; every regular file inside it is loaded.
DATA_DIR = "../data/universal-trending-20171129-1548/"

docs = []
# Sort the listing: os.listdir returns names in arbitrary order, and the order
# of `docs` determines the row positions stored in doc_index.
for file_name in sorted(os.listdir(DATA_DIR)):
    path = os.path.join(DATA_DIR, file_name)
    if not os.path.isfile(path):
        continue  # sub-directories cannot be opened as text files
    with open(path) as f:
        for line in f:
            if line.strip():  # tolerate blank lines instead of failing in json.loads
                docs.append(json.loads(line))

# doc_ids[i] is the id of docs[i]; doc_index maps an id back to its position.
# If an id occurs more than once, the last occurrence wins in doc_index.
doc_ids = [doc["id"] for doc in docs]
doc_index = {doc_id: position for position, doc_id in enumerate(doc_ids)}
len(docs)

In [None]:
def makeSubjectToken(s):
    """Return "subject" followed by `s` with every space and comma removed."""
    kept = [ch for ch in s if ch not in (" ", ",")]
    return "subject" + "".join(kept)
# TF-IDF over unigrams and bigrams of purely alphabetic tokens (2+ letters),
# with English stop words removed. min_df / max_df prune the vocabulary;
# NOTE(review): per scikit-learn, an int min_df is a document count and a
# float max_df is a fraction of the corpus -- confirm these thresholds suit
# the corpus size.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    token_pattern="[a-zA-Z]{2,}",
    min_df=100,
    max_df=0.1,
)
# One text per document: title and abstract joined by a single space.
corpus = [" ".join([doc["title"], doc["abstract"]]) for doc in docs]
vectors = vectorizer.fit_transform(corpus)
vectors.shape

In [None]:
# for v in vectorizer.vocabulary_.keys():
#     if "subject" in v:
#         print(v)

In [None]:
# Ten terms with the highest IDF.
# idf_ is ordered by feature index, whereas iterating vocabulary_ (a
# term -> index dict) yields terms in dict order, so the terms must be put
# in index order before they are paired with their IDF values.
terms_by_index = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
sorted(zip(vectorizer.idf_, terms_by_index), key=lambda x: -x[0])[:10]

In [None]:
import random
# Default noise level for dither(): the noise standard deviation is log(epsilon).
DITHERING_EPSILON = 10
def dither(results, epsilon=None):
    """Return `results` randomly re-ordered, biased towards the original order.

    Each item at 0-based rank r gets the score log1p(r) + log(epsilon) * N(0, 1)
    and the items are returned sorted by that score, so items near the front
    tend to stay near the front while the tail gets shuffled more freely.

    Parameters:
        results: ranked sequence (best first) of arbitrary items.
        epsilon: noise level; defaults to the module-level DITHERING_EPSILON,
            read at call time. epsilon=1 gives log(1) == 0, i.e. no noise.

    Returns:
        A new list holding the very same item objects in dithered order.
    """
    results = list(results)
    if epsilon is None:
        epsilon = DITHERING_EPSILON
    ranks = np.arange(len(results))
    ditherScores = np.log1p(ranks) + (np.log(epsilon) * np.random.randn(len(results)))
    # Index into the original list rather than into np.array(results):
    # converting (id, score) tuples to an ndarray coerces every field to one
    # common dtype (e.g. scores become strings next to string ids).
    return [results[i] for i in np.argsort(ditherScores)]

# Quick visual check: dither ten (index, value) pairs that start in sorted order.
dither(list(enumerate(range(10))))

In [None]:
from __future__ import print_function
from ipywidgets import interact, fixed

# Linear classifier trained incrementally, one labelled document at a time.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" --
# confirm against the installed version.
model = SGDClassifier(loss="log")
# Ids of the documents that have already been shown to the user.
alreadySeen = set()
# TODO: get batches
# TODO: explore/exploit tradeoff
# TODO: dithering
# Weight given to a training example, keyed by its label.
sampleWeights = {0: 0.01, 1: 1.0}
# Chance that selectCandidate ranks by closeness to 0.5 rather than by score.
exploreProbability = 0.33

def selectCandidate(poolSize=100):
    """Choose the next document to present to the user.

    While nothing has been shown yet, or the model has not been fitted yet,
    an unseen document is drawn uniformly at random. Otherwise every document
    is scored with the model's class-1 probability and ranked either by
    closeness to 0.5 (with probability `exploreProbability`) or by descending
    score. The best `poolSize` unseen documents are dithered and the first
    one is returned.

    Parameters:
        poolSize: positive number of top-ranked unseen documents to dither.

    Returns:
        The chosen document dict, or None when every document has been seen.
    """
    # alreadySeen alone is not a reliable "model is trained" signal: a document
    # is recorded as seen before the user labels it, so also check for the
    # coefficients that only exist once the model has been fitted.
    if not alreadySeen or not hasattr(model, "coef_"):
        unseenDocs = [doc for doc in docs if doc["id"] not in alreadySeen]
        if not unseenDocs:
            print("No unseen documents found")
            return None
#         print("--> Getting a random candidate")
        return random.choice(unseenDocs)
    scores = model.predict_proba(vectors)[:,1]
    if (random.random() < exploreProbability):
        print("--> explore uncertian examples")
        scoreFunc = lambda x: np.abs(0.5-x)
    else:
        print("--> exploit our best guess")
        scoreFunc = lambda x: -x
    # Drop seen documents BEFORE truncating to the pool. Truncating first would
    # report "nothing left" as soon as the top of the ranking had been shown,
    # even though unseen documents remain further down.
    unseen = [(docId, score) for docId, score in zip(doc_ids, scores)
              if docId not in alreadySeen]
    if not unseen:
        print("No unseen documents found")
        return None
    pool = sorted(unseen, key=lambda x: scoreFunc(x[1]))[:poolSize]
    docId, score = dither(pool)[0]
    print("--> score %s" % score)
    return docs[doc_index[docId]]
    
def handleResponse(x, docId):
    """Widget callback: learn from the label `x` given to document `docId`.

    A value of 0.5 is treated as "no answer" and ignored. Any other value is
    used as the class label for one incremental training step on that
    document's vector, weighted by sampleWeights[x], after which the next
    candidate is presented.
    """
    if x == 0.5:
        return
#     print(docId, ":", x)
    features = vectors[doc_index[docId]]
    label = np.array([x])
    weight = np.array([sampleWeights[x]])
    model.partial_fit(features, label,
                      classes=np.array([0,1]), sample_weight=weight)
    getFeedback()

def getFeedback():
    """Show the next candidate document and attach a labelling widget to it.

    Prints the candidate's title and abstract, records its id in alreadySeen
    and creates an interact() control that calls handleResponse with the
    chosen value and the candidate's id. Does nothing when selectCandidate
    has no document left to offer.
    """
    candidate = selectCandidate()
    if candidate is None:
        # selectCandidate has already reported that nothing is left; stop here
        # instead of failing on candidate["title"].
        return
    print(candidate["title"])
    print(candidate["abstract"])
#     print(candidate["subjectArea"])
    alreadySeen.add(candidate["id"])
    # (0.0, 1, 0.5) is the (min, max, step) abbreviation for a slider;
    # handleResponse ignores the value 0.5.
    interact(handleResponse, x=(0.0,1,0.5), docId=fixed(candidate["id"]))

# Start the feedback loop with the first candidate.
getFeedback()

In [None]:
# (term, feature index) pairs in ascending index order, so that position i
# lines up with column i of the model's coefficient vector.
sorted_vocab = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])

def _ranked_features(sort_key, n, vocab, coefs):
    """Pair each term with its coefficient and return the first n pairs under sort_key."""
    if vocab is None:
        vocab = sorted_vocab
    if coefs is None:
        coefs = model.coef_[0]
    pairs = [(term, coef) for (term, _), coef in zip(vocab, coefs)]
    # sorted() is stable, so terms with equal coefficients keep their index order.
    return sorted(pairs, key=sort_key)[:n]

def top_features(n=20, vocab=None, coefs=None):
    """Return the n (term, coefficient) pairs with the largest coefficients.

    Parameters:
        n: number of pairs to return (default 20).
        vocab: (term, index) pairs in index order; defaults to sorted_vocab.
        coefs: one coefficient per vocab entry; defaults to model.coef_[0].
    """
    return _ranked_features(lambda pair: -pair[1], n, vocab, coefs)

def bottom_features(n=20, vocab=None, coefs=None):
    """Return the n (term, coefficient) pairs with the smallest coefficients.

    Parameters:
        n: number of pairs to return (default 20).
        vocab: (term, index) pairs in index order; defaults to sorted_vocab.
        coefs: one coefficient per vocab entry; defaults to model.coef_[0].
    """
    return _ranked_features(lambda pair: pair[1], n, vocab, coefs)
# Print the strongest positive features first, then the strongest negative
# ones, one (term, coefficient) pair per line.
for rank in (top_features, bottom_features):
    for feature in rank():
        print(feature)

In [None]:
# Same rankings again, each printed as one list.
for rank in (top_features, bottom_features):
    print(rank())

In [None]:
def subject_features():
    """Return (term, coefficient) pairs for every term starting with "subject".

    Terms are taken from sorted_vocab and paired positionally with
    model.coef_[0]. The prefix test also matches ordinary words that merely
    begin with "subject".
    """
    matches = []
    for (term, _), weight in zip(sorted_vocab, model.coef_[0]):
        if term.startswith("subject"):
            matches.append((term, weight))
    return matches

In [None]:
# Display the current coefficients of all terms that start with "subject".
subject_features()

In [None]:
# Peek at the document that would be offered next; unlike getFeedback, this
# does not add it to alreadySeen.
selectCandidate()