In [2]:
import lxml.html as html
import re
import time
import io
import nltk
import math
import string
from collections import defaultdict

## News Parsing

In [3]:
topics = ['politics', 'sport']
news_url = {'http://izvestia.ru/archive/15?type=1&p=':'politics',  'http://izvestia.ru/archive/21?type=1&p=':'sport'}
basic_url = 'http://izvestia.ru'

Получаем ссылки на новости

In [4]:
news_urls = defaultdict(list)
for url in news_url:
    for i in range(1,150):
        news = url + str(i)
        news_html = html.parse(news)
        found_urls = news_html.getroot().xpath('//div[@class="items_list_text"]/h3/a')
        for page_url in found_urls:
            href = page_url.get('href')
            news_urls[news_url[url]].append(basic_url + href)

In [5]:
#urls count
print(len(news_urls['politics']))
print(len(news_urls['sport']))

1463
1476


Парсим тексты новостей

In [6]:
news_corpus = defaultdict(list)
for topic in topics:
    count = 0
    for news_page in  news_urls[topic]:
        page_html = html.parse(news_page)
        text = page_html.getroot().xpath('//div[@class="text_block"]/p/text()')
        head = page_html.getroot().xpath('//h1[@itemprop="headline"]/text()')
        if not text:
            continue
        news_text = ''.join(head) + ''.join(text)
        news_corpus[topic].append(news_text)
        count += 1
        print("Topic: %s Parsing progress: %d out of %d " % (topic, count, len(news_urls[topic])), end = '\r' )    



Размер корпуса

In [7]:
print(len(news_corpus['politics']))
print(len(news_corpus['sport']))

1411
1079


Считываем биграммы и триграммы

In [8]:
def load_ngrams(input_file):
    input_f = io.open(input_file, "r", encoding='utf8')
    n_dict = {}
    for line in input_f:
        line = line.replace('\n', '').split('\t')
        n_dict[tuple(line[0:])] = int(line[0])
    input_f.close()
    return n_dict

In [9]:
bigrams = load_ngrams('2grams-3.txt')
print('2grams count: ', len(bigrams.keys()))

threegrams = load_ngrams('3grams-3.txt')
print('3grams count: ', len(threegrams.keys()))

2grams count:  6750525
3grams count:  4655170


Токенизируем корпус

In [10]:
tokenized_corpus = defaultdict(list)
for key in news_corpus.keys():
    for text in news_corpus[key]:
        text_str = ''.join(text)
        line = nltk.word_tokenize(text_str)
        line = [word.encode("utf-8").decode("utf-8").lower() for word in line if word not in string.punctuation
                        and word not in [u'\u2012', u'\u2013', u'\u2014', u'\u2015', u'``', u"''"]]
        tokenized_corpus[key] += line

In [14]:
for key in tokenized_corpus.keys():
    print(key, len(tokenized_corpus[key]))

politics 740534
sport 463327


Находим перплексию

In [15]:
def find_perplexity(category, n3_gram, n2_gram):
    sum_probability = 0
    unknown_words = 0
    min_probability = 1
    sum_log = 0
    
    current_probability= {}
    
    for i in range(2, len(category)):
        try:
            current_probability[i] = (n3_gram[(category[i-2], category[i-1], category[i])] + 0.0) 
            / n2_gram[(category[i-2],category[i-1])]
            sum_probability += current_probability
            if current_probability[i] < min_probability:
                min_probability = current_probability[i]
        except KeyError:
            unknown_words +=1
    print (unknown_words, len(category), sum_probability)
    known_words = len(category) - unknown_words
    min_probability /= 10.0
    probability_to_substract = (unknown_words * min_probability + 0.0) / known_words
    for i in range(2, len(category)):
        try:
            sum_log += math.log(current_probability[i] - probability_to_substract, 2)
        except KeyError:
            sum_log += math.log(min_probability, 2)

    return sum_log

In [20]:
for key in tokenized_corpus.keys():
    perplexity = find_perplexity(tokenized_corpus[key], bigrams, threegrams)
    perplexity = 2**(-perplexity / len(tokenized_corpus[key]))
    print(key,' perplexity: ', perplexity, '\n')

740532 740534 0
politics  perplexity:  9.999937813028556 

463325 463327 0
sport  perplexity:  9.999900606772984 



Вероятностностная модель корпуса категории новостей "Спорт" обладает лучшей предсказательной способностью, чем модель корпуса категории "Политика".