In [128]:
import io
import math
import nltk
import string
import lxml.html as html

In [116]:
# parse autonews.ru
def parse_autonews(pages):
    autonews = 'http://www.autonews.ru'
    for page in range(1, pages):

        # get urls for page
        main_page = html.parse('http://www.autonews.ru/news/list/' + str(page) + '/')
        urls = main_page.getroot().xpath('//ul[@class="news-list__list"]//a[@href]/@href')
        articles_list_autonews = []

        try:
            for url in urls:
                article = html.parse(autonews + url)

                # get text for each url and encode it
                texts = article.getroot().xpath('//*[contains(@class, "article__")]/text()')
                texts = [''.join(t).encode('utf-8', 'ignore').decode('utf-8') for t in texts]
                page_texts = ''.join(texts)

                # remove '\n' and spaces
                page_texts = page_texts.replace('\n', ' ').replace('\r', '')
                page_texts = " ".join(page_texts.split())
                articles_list_autonews.append(page_texts)
        except IOError:
            continue
    return articles_list_autonews

In [160]:
# parse ria.ru/science
def parse_science(date):
    science = 'http://ria.ru'
    for page in range(1, date):
        if page < 10:
            page = '0' + str(page)

        try:
            # get urls for page
            main_page_science = html.parse('http://ria.ru/science/201605' + str(page) + '/')
            urls = main_page_science.getroot().xpath('//div[@class="b-list"]//a[@href]/@href')
            articles_list_science = []
        
            for url in urls:
                article = html.parse(science + url)

                # get text for each url and encode it
                texts = article.getroot().xpath('//*[contains(@class, "b-article__body")]/p/text()')
                page_texts = ''.join(texts)

                # remove '\n' and spaces
                page_texts = page_texts.replace('\n', ' ').replace('\r', '')
                page_texts = " ".join(page_texts.split())
                articles_list_science.append(page_texts)
        except IOError:
            continue
    return articles_list_science

In [65]:
# read 2-grams and 3-grams
def load_ngrams(in_file):
    input_f = io.open(in_file, "r", encoding='utf8')
    n_dict = {}
    for line in input_f:
        line = line.replace('\n', '').split('\t')
        n_dict[tuple(line[0:])] = int(line[0])
    input_f.close()
    return n_dict

In [143]:
# split corpus on words
def tokenize(corpus):
    news_list = corpus
    words = []
    for text in news_list:
        text_str = ''.join(text)
        line = nltk.word_tokenize(text_str)
        line = [word.encode("utf-8").decode("utf-8").lower() for word in line if word not in string.punctuation
                        and word not in [u'\u2012', u'\u2013', u'\u2014', u'\u2015', u'``', u"''"]]
        words += line
    return words

In [151]:
def compute_perplexity(category, n3_gram, n2_gram):
    sum_probability = 0
    unknown_words = 0
    min_probability = 1
    sum_log = 0
    
    current_probability= {}
    
    for i in range(2, len(category)):
        try:
            current_probability[i] = (n3_gram[(category[i-2], category[i-1], category[i])] + 0.0) 
            / n2_gram[(category[i-2],category[i-1])]
            sum_probability += current_probability
            if current_probability[i] < min_probability:
                min_probability = current_probability[i]
        except KeyError:
            unknown_words +=1
    print (unknown_words, len(category), sum_probability)
    known_words = len(category) - unknown_words
    min_probability /= 10.0
    probability_to_substract = (unknown_words * min_probability + 0.0) / known_words
    for i in range(2, len(category)):
        try:
            sum_log += math.log(current_probability[i] - probability_to_substract, 2)
        except KeyError:
            sum_log += math.log(min_probability, 2)

    return sum_log

In [152]:
n2_gram = load_ngrams("2grams-3/2grams-3.txt")
print len(n2_gram.keys())

6750525


In [68]:
n3_gram = load_ngrams("3grams-3/3grams-3.txt")
print len(n3_gram.keys())

4655170


In [157]:
autonews = parse_autonews(20)

In [161]:
science = parse_science(20)

In [162]:
auto =  tokenize(autonews)
scn = tokenize(science)

In [163]:
auto_perplexity = compute_perplexity(auto, n3_gram, n2_gram)
print("Perplexity for autonews = ",2**(-sum_log_2 / len(auto)))

(5096, 5098, 0)
('Perplexity for autonews = ', 6.123155440306526)


In [164]:
scn_perplexity = compute_perplexity(scn, n3_gram, n2_gram)
print("Perplexity for science = ",2**(-sum_log_2 / len(scn)))

(4761, 4763, 0)
('Perplexity for science = ', 6.955466527532807)


Perplexity for autonews is less then perplexity for science. It means that autonews corpus adapted better then science corpus for current model.