In [39]:
import json
import numpy as np
import os
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
os.getcwd()

'/Users/pettitb/Dev/hackdays/tinder-style-recommender/notebooks'

In [40]:
docs = []
for file in os.listdir("../data/universal-trending-20171129-1548/"):
    with open("../data/universal-trending-20171129-1548/" + file) as f:
        for line in f:
            docs.append(json.loads(line))
doc_index = dict(zip([doc["id"] for doc in docs], range(len(docs))))
doc_ids = [doc["id"] for doc in docs]
len(docs)

273585

In [45]:
def makeSubjectToken(s):
    return "subject" + s.replace(" ", "").replace(",", "")
vectorizer = TfidfVectorizer(stop_words = 'english',
                         ngram_range = (1,2),
                         token_pattern = "[a-zA-Z]{2,}",
                         min_df = 100,
                         max_df = 0.1)
vectors = vectorizer.fit_transform([" ".join([doc["title"], doc["abstract"]])  for doc in docs])
vectors.shape

(273585, 33537)

In [46]:
# for v in vectorizer.vocabulary_.keys():
#     if "subject" in v:
#         print(v)

In [47]:
sorted(zip(vectorizer.idf_, vectorizer.vocabulary_), key=lambda x: -x[0])[:10]

[(8.9042507769894215, 'tests performed'),
 (8.9042507769894215, 'including participants'),
 (8.9042507769894215, 'based control'),
 (8.9042507769894215, 'numerical study'),
 (8.9042507769894215, 'mixed methods'),
 (8.9042507769894215, 'pools'),
 (8.9042507769894215, 'role self'),
 (8.9042507769894215, 'macroscale'),
 (8.9042507769894215, 'science policy'),
 (8.9042507769894215, 'archaeological')]

In [121]:
import random
DITHERING_EPSILON = 100
def dither(results):
    ranks = np.arange(len(results))
    ditherScores = np.log1p(ranks) + (np.log(DITHERING_EPSILON) * np.random.normal(len(results)))
    print(np.argsort(ditherScores))
    return list(np.array(results)[np.argsort(ditherScores)])

dither(range(10))

[0 1 2 3 4 5 6 7 8 9]


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [114]:
from __future__ import print_function
from ipywidgets import interact, fixed

model = SGDClassifier(loss="log")
alreadySeen = set()
# TODO: get batches
# TODO: explore/exploit tradeoff
# TODO: diterhing
sampleWeights = {
    0:0.01,
    1:1.0
}
exploreProbability = .33

def selectCandidate():
    if not alreadySeen:
#         print("--> Getting a random candidate")
        return random.choice(docs)
    scores = model.predict_proba(vectors)[:,1]
    if (random.random() < exploreProbability):
        print("--> explore uncertian examples")
        scoreFunc = lambda x: np.abs(0.5-x)
    else:
        print("--> exploit our best guess")
        scoreFunc = lambda x: -x
    rankedDocs = dither(sorted(zip(doc_ids, scores), key=lambda x: scoreFunc(x[1])))
    for docId, score in rankedDocs:
        if docId not in alreadySeen:
            print("--> score %.6f" % score)
            return docs[doc_index[docId]]
    print("No unseen documents found")
    
def handleResponse(x, docId):
    if x != 0.5:
#         print(docId, ":", x)
        model.partial_fit(vectors[doc_index[docId]], np.array([x]),
                          classes=np.array([0,1]), sample_weight=np.array([sampleWeights[x]]))
        getFeedback()

def getFeedback():
    candidate = selectCandidate()
    print(candidate["title"])
    print(candidate["abstract"])
#     print(candidate["subjectArea"])
    alreadySeen.add(candidate["id"])
    interact(handleResponse, x=(0.0,1,0.5), docId=fixed(candidate["id"]));
getFeedback()

Cyanobacteria and algae blooms: Review of health and environmental data from the harmful algal bloom-related illness surveillance system (HABISS) 2007–2011
Algae and cyanobacteria are present in all aquatic environments. We do not have a good sense of the extent of human and animal exposures to cyanobacteria or their toxins, nor do we understand the public health impacts from acute exposures associated with recreational activities or chronic exposures associated with drinking water. We describe the Harmful Algal Bloom-related Illness Surveillance System (HABISS) and summarize the collected reports describing bloom events and associated adverse human and animal health events. For the period of 2007–2011, Departments of Health and/or Environment from 11 states funded by the National Center for Environmental Health (NCEH), Centers for Disease Control and Prevention contributed reports for 4534 events. For 2007, states contributed 173 reports from historical data. The states participating 

In [56]:
sorted_vocab = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])

def top_features():
    return [(f[0][0], f[1]) for f in sorted(zip(sorted_vocab, model.coef_[0]), key=lambda x: -x[1])[:20]]

def bottom_features():
    return [(f[0][0], f[1]) for f in sorted(zip(sorted_vocab, model.coef_[0]), key=lambda x: x[1])[:20]]
for f in top_features():
    print(f)
for f in bottom_features():
    print(f)

('innovative', 2.4075283082853125)
('leadership', 1.8250595564858823)
('ceos', 1.6824675795221748)
('behaviour', 1.4455119929203006)
('transactional', 1.2439652338065885)
('transformational', 1.1725944849550154)
('managers', 1.0802177708537488)
('founding', 0.86515121711602516)
('intervening', 0.80451741227399431)
('styles', 0.69591467646889527)
('climate', 0.66515008425351618)
('transformational leadership', 0.42090540933211318)
('mediates relationship', 0.41727783937249036)
('examine relationships', 0.41568784510436185)
('mediating effect', 0.41516881335257)
('develop test', 0.41314459562204564)
('equation model', 0.40408965824048754)
('findings contribute', 0.3951875694314424)
('ceo', 0.38673707871686158)
('relationship', 0.37935645837300747)
('crowding', -0.16579221788162998)
('visual', -0.032012902401800748)
('conservation', -0.026531380429607596)
('vision', -0.026226904701879925)
('zno', -0.025924545319670044)
('wires', -0.025891096403493359)
('object', -0.024904291976844423)
('f

In [None]:
print(top_features())
print(bottom_features())

In [None]:
def subject_features():
    return [(vocab[0], coef) for vocab, coef in zip(sorted_vocab, model.coef_[0]) if vocab[0].startswith("subject")]

In [None]:
subject_features()

In [17]:
selectCandidate()

--> Returning doc 64dc2349-0441-3fbe-b859-8cebf8e30365 with score 0.475345


{'abstract': 'In the ongoing debates about the role of immaterial labor in digital media economics, the work of feminist researchers into affective labor performed in the home— " women\'s work " —has barely featured. This article is an attempt to address this gap in the dominant framework for discussing consumer labor in digital contexts. It draws on feminist frameworks, particularly the work of Fortunati, in arguing that affective, immaterial labor has a variable and often indirect relationship to capitalist exchange. This indirect relationship allows the products of such work to retain their use-values while nevertheless remaining implicated in systems of exchange. This in turn draws attention to the immaterial product of reproductive labor, which is the social order itself, and the importance of the disciplining function of reproductive labor. The " spectral presence " (Staples 2007) of women and feminist thought haunts the growing range of theories and analyses of the free, unpaid,

SystemError: <built-in function where> returned a result with an error set