# Перплексия новостей политики и спорта

In [1]:
import nltk
import string
import math

### Формирование словарей частот 2- и 3-грамм

In [90]:
def _load_ngram_counts(path, token_columns):
    """Read a tab-separated n-gram frequency file into a dict of counts.

    Column 0 of every line holds the integer frequency; the tokens of the
    n-gram are taken from the columns listed in ``token_columns``.  Counts of
    lines that yield the same token tuple are summed.

    Fields are not stripped, so a token taken from the last column of a line
    keeps its trailing newline.  Lookups elsewhere in the notebook append
    "\\n" to the last token to match these keys.

    Raises IndexError for a line with too few columns and ValueError when the
    frequency column is not an integer.
    """
    counts = {}
    # `with` guarantees the file is closed even if a malformed line raises.
    with open(path) as ngram_file:
        for line in ngram_file:
            fields = line.split('\t')
            key = tuple(fields[column] for column in token_columns)
            counts[key] = counts.get(key, 0) + int(fields[0])
    return counts


# Frequencies of 3-grams and 2-grams, keyed by tuples of tokens.
grams_3 = _load_ngram_counts('3grams-3.txt', (1, 3, 5))
grams_2 = _load_ngram_counts('2grams-3.txt', (1, 3))

### Чтение ранее распарсенных новостей (примерно по 1200 новостей в каждой теме)

In [18]:
# Build `politics`: the flat list of lower-cased word tokens of the politics
# news file, with punctuation tokens removed.

# Tokens dropped in addition to ASCII punctuation: Unicode dashes and the
# `` / '' tokens (presumably the tokenizer's quote markers).
extra_punctuation = (u'\u2012', u'\u2013', u'\u2014', u'\u2015', u'``', u"''")

politics = []
# `with` closes the file even if decoding or tokenizing a line fails.
with open("texts_politics.txt") as txt:
    for line in txt:
        # Decode once at the I/O boundary; the tokens stay unicode from here on.
        tokens = nltk.word_tokenize(line.decode("utf-8"))
        # NOTE: `word not in string.punctuation` is a substring test against
        # the punctuation string, not a per-character check.
        # extend() appends in place instead of copying the whole list per line.
        politics.extend(
            word.lower() for word in tokens
            if word not in string.punctuation and word not in extra_punctuation
        )

In [85]:
# Build `sport`: the flat list of lower-cased word tokens of the sport news
# file, with punctuation tokens removed.

# Tokens dropped in addition to ASCII punctuation: Unicode dashes and the
# `` / '' tokens (presumably the tokenizer's quote markers).
extra_punctuation = (u'\u2012', u'\u2013', u'\u2014', u'\u2015', u'``', u"''")

sport = []
# `with` closes the file even if decoding or tokenizing a line fails.
with open("texts_sport.txt") as txt:
    for line in txt:
        # Decode once at the I/O boundary; the tokens stay unicode from here on.
        tokens = nltk.word_tokenize(line.decode("utf-8"))
        # NOTE: `word not in string.punctuation` is a substring test against
        # the punctuation string, not a per-character check.
        # extend() appends in place instead of copying the whole list per line.
        sport.extend(
            word.lower() for word in tokens
            if word not in string.punctuation and word not in extra_punctuation
        )

## Подсчет перплексии

#### Условная вероятность каждого отдельного слова при условии предыдущих двух $p(w_i | w_{i-1}, w_{i-2})$ рассчитывается как отношение частоты 3-граммы $(w_{i-2}, w_{i-1}, w_i)$ к частоте 2-граммы $(w_{i-2}, w_{i-1})$, то есть $p(w_i | w_{i-1}, w_{i-2}) = \frac{f(w_{i-2}, w_{i-1}, w_i)}{f(w_{i-2}, w_{i-1})}$, где $f$ -- частота.
#### Перплексия отдельного корпуса рассчитывается по формуле $2^{-\frac{1}{N}\sum_{i = 3}^N\log_2p(w_i | w_{i-1}, w_{i-2})}$. (Пренебрегаем первыми двумя словами первого текста, а также границами между предложениями и текстами.)
#### Условная вероятность неизвестной 3-граммы полагается равной одной десятой минимальной известной вероятности. Сумма значений условных вероятностей для неизвестных 3-грамм равномерно "отбирается" от известных вероятностей.

In [87]:
# Sum of log2 conditional trigram probabilities over the "politics" corpus.
#
# Reads:  politics (token list), grams_3 and grams_2 (n-gram count dicts).
# Writes: unknown_words, known_words, min_probability,
#         probability_to_substract, sum_log and sum_log_politics.

# Encode every token once instead of on each dictionary lookup.
encoded_words = [word.encode("utf-8") for word in politics]

# p(w_i | w_{i-2}, w_{i-1}) for each position i >= 2, or None when the trigram
# or its bigram prefix is absent from the count tables.  Computed once and
# reused by both the statistics pass and the log-sum pass below.
trigram_probabilities = []
for i in range(2, len(encoded_words)):
    first, second, third = encoded_words[i - 2], encoded_words[i - 1], encoded_words[i]
    try:
        # The "\n" suffix on the last token matches how the lookup keys are
        # expected to be stored in grams_3 / grams_2.
        probability = float(grams_3[(first, second, third + "\n")]) / grams_2[
            (first, second + "\n")]
    except KeyError:
        probability = None
    trigram_probabilities.append(probability)

known_probabilities = [p for p in trigram_probabilities if p is not None]
known_words = len(known_probabilities)
unknown_words = len(trigram_probabilities) - known_words

# An unknown trigram gets one tenth of the smallest known probability.
# Float literals keep this a true division on Python 2 even when no known
# probability is below 1.
min_probability = min([1.0] + known_probabilities) / 10.0

# Probability mass handed to unknown trigrams, taken evenly from the known
# ones.  With no known trigram there is nothing to take it from.
if known_words > 0:
    probability_to_substract = unknown_words * min_probability / known_words
else:
    probability_to_substract = 0.0

sum_log = 0
for probability in trigram_probabilities:
    if probability is None:
        sum_log += math.log(min_probability, 2)
        continue
    discounted = probability - probability_to_substract
    if discounted <= 0:
        # The discount exceeded the probability itself; fall back to the
        # unknown-trigram probability rather than fail on log of a
        # non-positive number.
        discounted = min_probability
    sum_log += math.log(discounted, 2)

sum_log_politics = sum_log

In [88]:
# Sum of log2 conditional trigram probabilities over the "sport" corpus.
#
# Reads:  sport (token list), grams_3 and grams_2 (n-gram count dicts).
# Writes: unknown_words, known_words, min_probability,
#         probability_to_substract, sum_log and sum_log_sport.

# Encode every token once instead of on each dictionary lookup.
encoded_words = [word.encode("utf-8") for word in sport]

# p(w_i | w_{i-2}, w_{i-1}) for each position i >= 2, or None when the trigram
# or its bigram prefix is absent from the count tables.  Computed once and
# reused by both the statistics pass and the log-sum pass below.
trigram_probabilities = []
for i in range(2, len(encoded_words)):
    first, second, third = encoded_words[i - 2], encoded_words[i - 1], encoded_words[i]
    try:
        # The "\n" suffix on the last token matches how the lookup keys are
        # expected to be stored in grams_3 / grams_2.
        probability = float(grams_3[(first, second, third + "\n")]) / grams_2[
            (first, second + "\n")]
    except KeyError:
        probability = None
    trigram_probabilities.append(probability)

known_probabilities = [p for p in trigram_probabilities if p is not None]
known_words = len(known_probabilities)
unknown_words = len(trigram_probabilities) - known_words

# An unknown trigram gets one tenth of the smallest known probability.
# Float literals keep this a true division on Python 2 even when no known
# probability is below 1.
min_probability = min([1.0] + known_probabilities) / 10.0

# Probability mass handed to unknown trigrams, taken evenly from the known
# ones.  With no known trigram there is nothing to take it from.
if known_words > 0:
    probability_to_substract = unknown_words * min_probability / known_words
else:
    probability_to_substract = 0.0

sum_log = 0
for probability in trigram_probabilities:
    if probability is None:
        sum_log += math.log(min_probability, 2)
        continue
    discounted = probability - probability_to_substract
    if discounted <= 0:
        # The discount exceeded the probability itself; fall back to the
        # unknown-trigram probability rather than fail on log of a
        # non-positive number.
        discounted = min_probability
    sum_log += math.log(discounted, 2)

sum_log_sport = sum_log

In [97]:
# Report the perplexity of each corpus: 2 raised to the negated log2-sum
# divided by the corpus length in tokens.
perplexity_politics = 2 ** (-sum_log_politics / len(politics))
print("Perplexity for politics news = ", perplexity_politics)
perplexity_sport = 2 ** (-sum_log_sport / len(sport))
print("Perplexity for sport news = ", perplexity_sport)

('Perplexity for politics news = ', 102299.81971244352)
('Perplexity for sport news = ', 167476.07101836757)


# Вывод: Корпус "Политика" лучше соответствует построенной марковской модели языка 2-го порядка, чем корпус "Спорт".