In [163]:
import twitter
import os
import json

from sklearn.base import TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import numpy as np

In [2]:
def load_config(input_file):
    with open(input_file, "r") as f:
        return json.load(f)

In [3]:
# Load the credentials from config file
config = load_config('config.json')

In [4]:
authorization = twitter.OAuth(config['access_token'], 
                              config['access_token_secret'],
                              config['consumer_key'],
                              config['consumer_secret'])

In [5]:
t = twitter.Twitter(auth=authorization)

In [6]:
def write_search_tweet(query, output_filename):
    '''
    A function that search for 100 tweets of a particular
    keyword and writes it to json
    '''
    if not os.path.exists(output_filename):
        with open(output_filename, 'a') as output_file:
            print('writing to file')
            search_results = t.search.tweets(q=query, count=100)['statuses']
            for tweet in search_results:
                if 'text' in tweet:
                    output_file.write(json.dumps(tweet))
                    output_file.write('\n\n')

output_filename = 'data/python_tweets.json'
search_query = 'python'

write_search_tweet(search_query, output_filename)

In [8]:
def parse_json(input_filename):
    '''
    A function that loads a file and parse the input
    as json
    '''
    tweets = []
    with open(input_filename) as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            tweets.append(json.loads(line))
    return tweets

In [57]:
# Loading and classifying the dataset
input_filename = 'data/python_tweets.json'
tweets = parse_json(input_filename)
len(tweets)

150

In [46]:
def load_labels(input_filename):
    labels = []
    if os.path.exists(input_filename):
        with open(input_filename) as f:
            labels = json.load(f)
    return labels

In [81]:
labels_filename = 'data/python_classes.json'
labels = load_labels(labels_filename)
if labels:
    print(len(labels))

[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
150


In [49]:
def get_next_tweet():
    return tweets[len(labels)]['text']

In [50]:
%%js

function load_next_tweet() {
    var code_input = 'get_next_tweet()';
    var kernel = IPython.notebook.kernel;
    var callbacks = { 'iopub': {'output': handle_output }};
    kernel.execute(code_input, callbacks, {silent: false});
}

function set_label(label) {
    var kernel = IPython.notebook.kernel
    kernel.execute('labels.append(' + label + ')')
    load_next_tweet();
}

function handle_output(out) {
    var res = out.content.data['text/plain'];
    $('#tweet_text').html(res);
}

<IPython.core.display.Javascript object>

In [51]:
%%html

<div name='tweetbox'>
    Instructions: Click in textbox. Enter a 1 if the tweet is relevant, enter 0 otherwise.<br>
    Tweet: 
    <div id='tweet_text' value='text'>Weird</div><br>
    <input type='text' id='capture'></input><br>
</div>

<script>

$('#capture').keypress(function (e) {
    if (e.which === 48) {
        set_label(0)
        $('#capture').val('')
    } else if (e.which === 49) {
        set_label(1)
        $('#capture').val('')
    }
});

load_next_tweet();
</script>

In [74]:
print(len(labels))

# Write the labels to the file
def write_labels(input_filename):
    with open(input_filename, 'w') as outf:
        json.dump(labels, outf)

# write_labels(labels_filename)

151


In [170]:
stop_words = set(stopwords.words('english'))
class NLTK_Bow(TransformerMixin):
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        return [{ word.lower(): True for word in word_tokenize(document) 
                if len(word) > 1 and word not in stop_words} 
                for document in X]

In [172]:
pipeline = Pipeline([('bag-of-words', NLTK_Bow()),
                     ('vectorizer', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())
                    ])

In [173]:
def parse_json(input_filename):
    '''
    A function that loads a file and parse the input
    as json
    '''
    tweets = []
    with open(input_filename) as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            tweets.append(json.loads(line)['text'])
    return tweets

In [168]:
input_filename = 'data/python_tweets.json'
tweets = parse_json(input_filename)
tweets[0]

'RT @PyLadiesMadrid: ¿Quieres aprender Python desde cero? ¡Esta es tu oportunidad! \nEn PyLadies Madrid arrancamos el año con un nuevo tipo d…'

In [174]:
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
scores

array([ 0.82666667,  0.84057971,  0.82191781])

In [175]:
print('score is: {:.3f}'.format(np.mean(scores)))

score is: 0.830


In [176]:
model = pipeline.fit(tweets, labels)

In [177]:
# You can access individual steps through the named_steps attributes
nb = model.named_steps['naive-bayes']

In [178]:
feature_probabilities = nb.feature_log_prob_
top_features = np.argsort(-feature_probabilities[1])[:50]
top_features

array([378, 237, 408, 149, 412,   0, 151, 344, 248, 343, 101,  80, 140,
       239, 170, 342, 167, 163, 382, 326,   8, 304, 496,  85, 417, 282,
       276, 415, 315, 273, 271, 100, 193, 277, 153, 156,  73, 160, 380,
       367, 174, 158,  93,  81,  90, 399, 398,  99, 435, 267])

In [179]:
dv = model.named_steps['vectorizer']

In [181]:
for i, feature_index in enumerate(top_features):
    print(i + 1, dv.feature_names_[feature_index], np.exp(feature_probabilities[1][feature_index]))

1 python 0.762376237624
2 https 0.663366336634
3 rt 0.524752475248
4 data 0.108910891089
5 science 0.0891089108911
6 '/my/new/directory 0.0693069306931
7 datascience 0.0693069306931
8 pathlib.path.mkdir 0.0693069306931
9 import 0.0693069306931
10 pathlib.path 0.0693069306931
11 bigdata 0.0693069306931
12 amp 0.0693069306931
13 create 0.0693069306931
14 htt… 0.0693069306931
15 django 0.0693069306931
16 pathlib 0.0693069306931
17 directories 0.0693069306931
18 developer 0.0693069306931
19 pythonweekly 0.0693069306931
20 new 0.0693069306931
21 .mkdir 0.0693069306931
22 method 0.0693069306931
23 with 0.0594059405941
24 answer 0.049504950495
25 server 0.049504950495
26 libsvm 0.049504950495
27 learn 0.049504950495
28 scripting 0.049504950495
29 movie-ratings 0.049504950495
30 language 0.049504950495
31 know 0.049504950495
32 big 0.049504950495
33 exploratory 0.049504950495
34 learning 0.049504950495
35 days 0.049504950495
36 deep_learningz 0.049504950495
37 accelerated 0.049504950495
38 det