In [243]:
import os
import json

import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.base import TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

In [190]:
input_path = 'data/python_tweets.json'
label_path = 'data/python_classes.json'

In [197]:
tweets = []

with open(input_path) as f:
    for line in f:
        if len(line.strip()) == 0:
            continue
        tweets.append(json.loads(line)) # For marking the data, remove the access to 'text'.

In [192]:
len(tweets)

150

In [193]:
labels = []

if os.path.exists(label_path):
    with open(label_path) as f:
        labels = json.load(f)

In [194]:
len(labels)

150

In [195]:
def get_next_tweet():
    return tweets[len(labels)]['text']

In [196]:
get_next_tweet()

IndexError: list index out of range

In [None]:
%%javascript

function handle_output(out) {
    $('div#tweet_text').html(out);
}

function execute_python(python_method, callback) {
    var kernel = IPython.notebook.kernel;
    var callbacks = {
        iopub: {
            output: (out) => callback(out.content.text.trim())
        }
    };
    kernel.execute(python_method, callbacks, {silent: false});
}

function load_next_tweet() {
    execute_python('get_next_tweet()', handle_output)
}

function set_label(label) {
    var kernel = IPython.notebook.kernel;
    kernel.execute('labels.append(' + label + ')');
    load_next_tweet();
}

In [None]:
%%html

<div name="tweetbox">
    Instructions: Click in textbox. Enter a 1 if the tweet is relevant, enter 0 otherwise.
    <br> Tweet: 
    <div id="tweet_text" value="text"></div>
    <br> 
    <input type=text id="capture"></input>
    <br> 
</div>

<script>
    $('input#capture').keypress(function(e) {
        // Number 0.
        if (e.which === 48) {
            set_label(0)
            $('input#capture').val('')
        } else if (e.which === 49) {
        // Number 1.
            set_label(1)
            $('input#capture').val('')
        }
    });
    load_next_tweet();
</script>

In [None]:
len(labels)

In [None]:
if os.path.exists(label_path):
    print(f'{label_path} already exists')
    pass
else:
    with open(label_path, 'w') as f:
        json.dumps(labels, f)

In [None]:
# %tkinter inline
# import tkinter

# window = tkinter.Tk()
# # to rename the title of the window
# window.title("GUI")
# # pack is used to show the object in the window
# label = tkinter.Label(window, text = "Hello World!").pack()
# window.mainloop()

In [246]:
stop_words = set(stopwords.words('english'))
class NltkBow(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [{word: True 
                 for word in word_tokenize(document)
                 if len(word) > 3
                 and word not in stop_words}
                for document in X]

In [247]:
pipeline = Pipeline([
    ('bow', NltkBow()),
    ('vectorizer', DictVectorizer()),
#     ('vectorizer2', TfidfVectorizer(lowercase=True)),
    ('classifier', BernoulliNB())
])

In [248]:
scores = cross_val_score(pipeline, [tweet['text'] for tweet in tweets], labels, scoring='f1', cv=5)
np.mean(scores)

0.8591436633541896

In [249]:
model = pipeline.fit([tweet['text'] for tweet in tweets], labels)
clf = model.named_steps['classifier']

In [250]:
feature_log_probabilities = clf.feature_log_prob_
top_features = np.argsort(-feature_log_probabilities[1])[:50]
top_features

array([310, 193, 385, 211,  98, 369, 368, 367, 347,  93, 317, 312, 107,
       283,   0, 195,   4, 237, 153, 172,  94, 327, 330, 332, 334, 229,
        71,  67, 364, 214, 241, 245, 203, 249, 275, 141, 276, 137, 280,
       252, 131, 286, 128, 379, 118, 110, 251, 105, 104, 253])

In [251]:
vectorizer = model.named_steps['vectorizer']

In [252]:
for i, feature_index in enumerate(top_features):
    print(f'{i:2d}', 
          '{:20s}'.format(vectorizer.feature_names_[feature_index]), 
          '{:.3f}'.format(np.exp(feature_log_probabilities[1][feature_index])))

 0 https                0.663
 1 Python               0.564
 2 python               0.277
 3 Science              0.089
 4 Data                 0.089
 5 pathlib.Path.mkdir   0.069
 6 pathlib.Path         0.069
 7 pathlib              0.069
 8 method               0.069
 9 Create               0.069
10 import               0.069
11 htt…                 0.069
12 Developer            0.069
13 directories          0.069
14 '/my/new/directory   0.069
15 PythonWeekly         0.069
16 .mkdir               0.069
17 With                 0.059
18 Learning             0.050
19 Movie-Ratings        0.050
20 Credit-Card          0.050
21 kdnuggets            0.050
22 language             0.050
23 learn                0.050
24 libsvm               0.050
25 Transactions         0.050
26 Artificial           0.050
27 Analysis             0.050
28 parents=Tr…          0.050
29 Server               0.050
30 accelerated          0.050
31 answer               0.050
32 Reinforcement        0.050
33 author 