In [17]:
import spacy
from sklearn.base import TransformerMixin
# Load the spaCy language pipeline once at module level; BagOfWords.transform
# calls it to tokenize every document.
# NOTE(review): 'en' is a shortcut model name — presumably tied to an older
# spaCy install; confirm it still resolves in the target environment.
nlp = spacy.load('en')
class BagOfWords(TransformerMixin):
    """Stateless transformer mapping raw documents to word-presence dicts.

    Each document is tokenized with the module-level ``nlp`` pipeline and
    turned into a ``{token_text: True}`` dictionary.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Convert an iterable of documents into a list of presence dicts.

        Parameters
        ----------
        X : iterable
            Documents, each passed as-is to ``nlp``.

        Returns
        -------
        list of dict
            One dict per document, in input order. Keys are token texts that
            are non-empty after stripping whitespace; every value is ``True``.
        """
        return [
            {token.text: True for token in nlp(document) if token.text.strip()}
            for document in X
        ]

In [12]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
from sklearn.naive_bayes import BernoulliNB

In [13]:
import os
# Both data files sit in the same "twitter" folder under the user's home
# directory; only the base name differs, so build the two paths together.
input_filename, labels_filename = (
    os.path.join(os.path.expanduser("~"), "Datamining/ch6/Data", "twitter", basename)
    for basename in ("python_tweets.json", "python_classes.json")
)

In [18]:
import json
# Read the tweet texts (one JSON object per line) and the label list.
# The encoding is given explicitly: JSON text is UTF-8, and the tweets contain
# non-ASCII characters, so relying on the platform's default locale encoding
# could raise UnicodeDecodeError or silently mis-decode the text.
tweets = []
with open(input_filename, encoding='utf-8') as inf:
    for line in inf:
        if not line.strip():
            continue  # skip blank lines between records
        tweets.append(json.loads(line)['text'])
with open(labels_filename, encoding='utf-8') as inf:
    labels = json.load(inf)
# Keep only as many tweets as there are labels
# (presumably the labels cover the leading tweets — confirm against the labelling step).
tweets = tweets[:len(labels)]
assert len(tweets) == len(labels), \
    "expected one label per tweet: {} tweets vs {} labels".format(len(tweets), len(labels))

In [19]:
from sklearn.pipeline import Pipeline
# Three chained stages: documents -> word-presence dicts -> feature matrix
# -> Bernoulli naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('bag-of-words', BagOfWords()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
])

In [20]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Cross-validate the whole pipeline with the F1 metric and report the mean
# of the per-fold scores.
scores = cross_val_score(pipeline, tweets, y=labels, scoring='f1')
print("Score: {:.3f}".format(scores.mean()))



Score: 0.756


In [21]:
# Fit the pipeline on all tweets and labels so its fitted steps can be inspected.
model = pipeline.fit(tweets, labels)

In [22]:
# Fetch the fitted classifier step from the pipeline by its step name.
nb = model.named_steps['naive-bayes']
# Despite the variable name, these values are log-probabilities: they are
# passed through np.exp before being printed. Indexed as [class][feature]
# where used — presumably one row per class; confirm.
feature_probabilities = nb.feature_log_prob_

In [23]:
# Negating the row makes the ascending argsort yield the largest
# log-probabilities first; keep at most the first 50 feature indices of row 1.
top_features = (-nb.feature_log_prob_[1]).argsort()[:50]

In [24]:
# Fitted vectorizer step; its feature_names_ list is used to turn feature
# column indices back into token strings.
dv = model.named_steps['vectorizer']

In [25]:
# Print rank, token and probability for each selected feature; np.exp turns
# the stored log-probability (row 1) back into a probability.
for rank, column in enumerate(top_features):
    token = dv.feature_names_[column]
    probability = np.exp(feature_probabilities[1, column])
    print(rank, token, probability)

0 … 0.8235294117647056
1 : 0.5294117647058824
2 RT 0.47058823529411753
3 # 0.3529411764705882
4 Python 0.3529411764705882
5 . 0.3529411764705882
6 AI 0.2941176470588235
7 in 0.2941176470588235
8 IoT 0.2352941176470588
9 ; 0.2352941176470588
10 the 0.2352941176470588
11 BigData 0.2352941176470588
12 & 0.2352941176470588
13 Analytics 0.2352941176470588
14 a 0.2352941176470588
15 using 0.2352941176470588
16 Data 0.2352941176470588
17 and 0.2352941176470588
18 , 0.2352941176470588
19 of 0.2352941176470588
20 @gp_pulipaka 0.2352941176470588
21 ( 0.2352941176470588
22 Engineer 0.2352941176470588
23 DataScience 0.2352941176470588
24 MachineLearning 0.2352941176470588
25 Best 0.17647058823529413
26 for 0.17647058823529413
27 https://t.co/PNehSWqM0L 0.17647058823529413
28 Learning 0.17647058823529413
29 JavaのGUIウィジェットツールキットであるSwingやJython、C++で書かれており、プラグ 0.17647058823529413
30 Courses 0.17647058823529413
31 Deep 0.17647058823529413
32 Distilling 0.17647058823529413
33 IIo 0.17647058823529413
34 