Работа с ранее загруженными постами с использованием <br>Scikit-learn
-------------------

In [90]:
# coding=utf-8
import functools
import time

import numpy as np
from elasticsearch import Elasticsearch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer




Функция, чтобы засекать время выполнения метода

In [91]:
def timeit(method):
    """Decorator that prints how long each call to ``method`` takes.

    Parameters
    ----------
    method : callable
        The function to wrap.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``method``, prints ``func: '<name>' -> <seconds> sec`` and returns
        ``method``'s result unchanged. If ``method`` raises, the exception
        propagates and nothing is printed.
    """
    # Preserve the wrapped function's __name__ and __doc__ so decorated
    # functions still introspect as themselves instead of as "timed".
    @functools.wraps(method)
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        # A single parenthesised expression prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print('func: %r -> %2.2f sec' % (method.__name__, te - ts))

        return result

    return timed

In [92]:
@timeit
def get_indeces_by_prefix(database, prefix):
    """Return the names of all indices whose name starts with ``prefix``.

    Parameters
    ----------
    database : object
        Client exposing ``indices.get_aliases()``; the keys of the mapping it
        returns are treated as the index names.
    prefix : str
        Leading part of the index name to match.

    Returns
    -------
    list
        Matching index names, in the order the client reported them.
    """
    # NOTE(review): `get_aliases` may be unavailable in newer client
    # releases -- confirm against the installed elasticsearch-py version.
    index_names = database.indices.get_aliases().keys()

    # Match at the start of the name only; a plain substring test would also
    # select indices that merely contain `prefix` somewhere in the middle.
    return [name for name in index_names if name.startswith(prefix)]

In [93]:
@timeit
def clean_documents(docs):
    """Drop missing and empty documents, preserving the original order.

    Parameters
    ----------
    docs : iterable
        Documents to filter.

    Returns
    -------
    list
        Every element of ``docs`` that is not ``None`` and is truthy
        (so empty strings are removed as well).
    """
    kept = []

    for doc in docs:
        # Missing document.
        if doc is None:
            continue
        # Present but empty document.
        if not doc:
            continue
        kept.append(doc)

    return kept

In [94]:
@timeit
def get_docmunets_by_index(database, index, size=5000):
    """Fetch the ``message`` field of the documents stored in one index.

    Parameters
    ----------
    database : object
        Client exposing ``search(index=..., size=..., body=...)``.
    index : str
        Name of the index to query.
    size : int, optional
        Maximum number of hits requested from the search call
        (default 5000). Only one request is issued, so no more than this
        many documents can be returned; whether the index holds more is not
        checked here.

    Returns
    -------
    list
        One entry per returned hit: the hit's ``_source['message']`` value,
        or ``None`` when the hit has no ``_source`` or no ``message``.
    """
    result = database.search(index=str(index),
                             size=size,
                             body={"query": {"match_all": {}}})

    # Tolerate a response without a hit list instead of iterating over None.
    records = result['hits'].get('hits') or []

    # A hit without `_source` yields None, the same as a hit without
    # `message`, rather than raising AttributeError.
    documents = [(r.get('_source') or {}).get('message') for r in records]

    return documents

In [95]:
@timeit
def get_documents_by_indices(database, indices):
    """Concatenate the documents of several indices into one flat list.

    Parameters
    ----------
    database : object
        Client passed through to ``get_docmunets_by_index``.
    indices : iterable
        Index names, queried one after another in iteration order.

    Returns
    -------
    list
        Documents of the first index, followed by those of the second, etc.
    """
    return [
        document
        for index_name in indices
        for document in get_docmunets_by_index(database, index_name)
    ]

In [96]:
# Create the Elasticsearch client. No arguments are passed, so the client
# library's default connection settings apply.
database = Elasticsearch()

In [97]:
# Select the indices that hold the posts (names matched against
# "fb_group_posts_") and display them as the cell result.
indices = get_indeces_by_prefix(database, "fb_group_posts_")
indices

func: 'get_indeces_by_prefix' -> 0.00 sec


[u'fb_group_posts_newsone_122177661170978',
 u'fb_group_posts_vice_news_235852889908002',
 u'fb_group_posts_american_news_179035672287016',
 u'fb_group_posts_yahoo_news_338028696036',
 u'fb_group_posts_mtv_7245371700',
 u'fb_group_posts_citynews_toronto_32204506174',
 u'fb_group_posts_cnet_7155422274']

In [98]:
# Load the documents of every selected index, then drop the missing and
# empty ones before reporting how many remain.
documents = get_documents_by_indices(database, indices)
documents = clean_documents(documents)
print "Получено документов:", len(documents)

func: 'get_docmunets_by_index' -> 0.07 sec
func: 'get_docmunets_by_index' -> 0.05 sec
func: 'get_docmunets_by_index' -> 0.05 sec
func: 'get_docmunets_by_index' -> 0.07 sec
func: 'get_docmunets_by_index' -> 0.04 sec
func: 'get_docmunets_by_index' -> 0.06 sec
func: 'get_docmunets_by_index' -> 0.05 sec
func: 'get_documents_by_indices' -> 0.39 sec
func: 'clean_documents' -> 0.02 sec
Получено документов: 15810


In [99]:
# Count vectorizer with all-default settings; the bare name on the last line
# displays its configuration as the cell result.
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Подсчитаем кол-во вхождений некоторых токенов в нашем корпусе

In [100]:
def get_words_count(token, vocabulary, x):
    """Return the total of the matrix column that belongs to ``token``.

    Parameters
    ----------
    token : str
        Token to look up.
    vocabulary : dict
        Mapping from token to column index of ``x``.
    x : matrix-like
        Two-dimensional matrix that supports ``x[:, index]`` and ``.sum()``
        (for example a SciPy sparse matrix or a NumPy array).

    Returns
    -------
    number
        Sum of all values in the token's column, or ``0`` when the token is
        not present in ``vocabulary``.
    """
    index = vocabulary.get(token)

    if index is None:
        return 0

    # Let the matrix sum its own column: the builtin sum() would add the rows
    # one by one in Python and fails on an empty matrix.
    return x[:, index].sum()

In [125]:
# Tokens whose totals are looked up in the corpus.
words = [
    "is", "the", "data", "man", "women", "computer", "police", "bus", "love", "me"
]

# Learn the vocabulary from the corpus and keep the transformed matrix.
matrix = vectorizer.fit_transform(documents)

# Print one "token -> total" line per token; the token is padded to 8 characters.
for token in words:
    print "{:8}".format(token), "->", get_words_count(token, vectorizer.vocabulary_, matrix)

is       -> 5281
the      -> 17127
data     -> 41
man      -> 520
women    -> 251
computer -> 36
police   -> 853
bus      -> 40
love     -> 308
me       -> 622
i        -> 0


Всего уникальных токенов в корпусе

In [102]:
# Number of entries in the fitted count vectorizer's vocabulary.
len(vectorizer.vocabulary_)

29600

# Использование Tfidf векторизатора

In [103]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
vectorizer2 = TfidfVectorizer(ngram_range=(1, 2))

# Only the fitted state is needed in this cell, so fit() is used instead of
# fit_transform(), whose returned matrix would be thrown away.
vectorizer2.fit(documents)
vectorizer2

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

Кол-во токенов корпуса

In [105]:
# Number of entries in the fitted TF-IDF vectorizer's vocabulary
# (unigrams and bigrams, given ngram_range=(1, 2)).
len(vectorizer2.vocabulary_)

207401

In [110]:
# Bigrams together with the single words they are made of.
words2 = [
    "are ok", "are", "ok", "classroom interface", "classroom", "interface"
]

# Fit the TF-IDF vectorizer on the corpus and keep the transformed matrix.
matrix2 = vectorizer2.fit_transform(documents)

# get_words_count sums a matrix column, so with the TF-IDF matrix each printed
# value is a sum of weights rather than an occurrence count.
for token in words2:
    print "{:8}".format(token), "->", get_words_count(token, vectorizer2.vocabulary_, matrix2)

are ok   -> 0.243814053519
are      -> 126.778929193
ok       -> 2.01064107243
classroom interface -> 0.188727170574
classroom -> 0.929868587847
interface -> 0.858251103706


In [111]:
# Preview a small sample of the token -> column-index mapping; displaying the
# whole vocabulary would flood the notebook output.
dict(list(vectorizer2.vocabulary_.items())[:40])

{u'scandalous photos': 152123,
 u'it absolutely': 93367,
 u'announcing who': 16704,
 u'russia syria': 150131,
 u'regard http': 145309,
 u'nine days': 120054,
 u'approach the': 17984,
 u'woods': 199775,
 u'her our': 80984,
 u'all working': 11086,
 u'obscene acts': 122878,
 u'comically': 42076,
 u'mitzie hunter': 113316,
 u'and artwork': 13806,
 u'taliyah was': 167529,
 u'sprague': 161672,
 u'sexual assault': 154923,
 u'changes in': 37684,
 u'or just': 128938,
 u'delegates platform': 50207,
 u'protest roots': 140717,
 u'fish bathe': 66233,
 u'all class': 10781,
 u'game customize': 71627,
 u'of justin': 124239,
 u'pythagorean theorem': 141825,
 u'caught clinton': 36620,
 u'magic show': 107385,
 u'bakirk\xf6y cevizlik': 23378,
 u'truly appreciate': 184199,
 u'trojan': 183949,
 u'education best': 57249,
 u'news could': 119221,
 u'touristes': 182753,
 u'lies her': 102108,
 u'jesus catches': 95082,
 u'dissed the': 53434,
 u'competition with': 42778,
 u'bringing': 31545,
 u'waters they': 19341