## Это код для подсчета различных статистик, связанных с идентификацией ключевых для различных учебных потоков слов.

In [None]:
import os

# Directory with the plain-text theses of the study track being analysed.
txtsDir = '../../txt_cmc_diploms/3/'

# Every directory entry is treated as a document, except the entry named
# 'broken_encoding', which is left out.
txts = []
for entry in os.listdir(txtsDir):
    if entry == 'broken_encoding':
        continue
    txts.append(entry)

print(txts)

### Частота встречаемости слова в корпусе работ, соответствующих учебному потоку.

In [None]:
from __future__ import unicode_literals
from pyrutok import Token, Sentence, Tokenizer, GraphemTag
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from collections import Counter

morphAnalyzer = MorphAnalyzer()

# Load the stop-word list once. It used to be requested again for every
# token and searched as a list; a frozenset gives O(1) membership tests.
russianStopwords = frozenset(stopwords.words('russian'))

# Lemma -> number of occurrences over all documents of the track.
corpusWords = Counter()

for txtFilename in txts:
    # NOTE(review): no encoding is passed, so the platform default applies --
    # confirm it matches the encoding of the text files.
    with open(txtsDir + txtFilename, 'r') as txtFile:
        content = txtFile.read()

    # The file is already closed here; tokenizing only needs the string.
    for sentence in Tokenizer(content):
        for token in sentence:
            # Only tokens tagged as Cyrillic are counted.
            if not GraphemTag.contains(token.get_graphem_tag(), GraphemTag.CYRILLIC):
                continue
            # Normal form of the first parse returned by the analyzer.
            word = morphAnalyzer.parse(token.get_escaped_data())[0].normal_form
            if word not in russianStopwords:
                corpusWords[word] += 1

print(corpusWords.most_common(200))

In [None]:
# Total number of counted word occurrences in the corpus.
wordCount = sum(corpusWords.values())

# For each of the 200 most frequent words print "occurrences/total".
for _, occurrences in corpusWords.most_common(200):
    print(occurrences, '/', wordCount, sep='')

### Факт использования слова в корпусе работ, соответствующих учебному потоку.

In [None]:
from __future__ import unicode_literals
from pyrutok import Token, Sentence, Tokenizer, GraphemTag
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from collections import Counter

morphAnalyzer = MorphAnalyzer()

# Load the stop-word list once instead of once per token; a frozenset makes
# each membership test O(1).
russianStopwords = frozenset(stopwords.words('russian'))

# Holds one entry per (document, distinct lemma) pair, so counting the list
# gives the number of documents each lemma occurs in.
corpusWords = []

for txtFilename in txts:
    # NOTE(review): no encoding is passed, so the platform default applies --
    # confirm it matches the encoding of the text files.
    with open(txtsDir + txtFilename, 'r') as txtFile:
        content = txtFile.read()

    # Distinct lemmas of the current document.
    words = set()
    for sentence in Tokenizer(content):
        for token in sentence:
            # Only tokens tagged as Cyrillic are considered.
            if not GraphemTag.contains(token.get_graphem_tag(), GraphemTag.CYRILLIC):
                continue
            # Normal form of the first parse returned by the analyzer.
            word = morphAnalyzer.parse(token.get_escaped_data())[0].normal_form
            if word not in russianStopwords:
                words.add(word)

    # Extend in place: rebuilding the list with "+" copied the whole
    # accumulated list for every document.
    corpusWords.extend(words)

print(Counter(corpusWords).most_common(200))

In [None]:
# For each of the 200 most widespread words print
# "documents containing the word/total documents".
# The denominator is the number of documents that were actually processed
# (one per entry of txts) rather than a hard-coded 124, so the output stays
# correct when the directory contents change.
documentCount = len(txts)
for _, documentsWithWord in Counter(corpusWords).most_common(200):
    print(documentsWithWord, '/', documentCount, sep='')

### Term frequency-inverse document frequency (TF-IDF) слова в корпусе работ, соответствующих учебному потоку.

<ins>Примечание</ins>: *это способ оценить важность термина для какого-либо документа относительно всех остальных документов. Принцип такой — если слово встречается в каком-либо документе часто, при этом встречаясь редко во всех остальных документах — это слово имеет большую значимость для того самого документа.*

In [None]:
from __future__ import unicode_literals
from pyrutok import Token, Sentence, Tokenizer, GraphemTag
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from collections import Counter
import math

# Create the analyzer here as the sibling cells do, so this cell does not
# depend on an instance left behind by an earlier cell.
morphAnalyzer = MorphAnalyzer()

# Load the stop-word list once instead of once per token; a frozenset makes
# each membership test O(1).
russianStopwords = frozenset(stopwords.words('russian'))

# One Counter per document: lemma -> term frequency (later scaled to TF-IDF).
corpusTf = []
# All distinct lemmas seen in the corpus.
corpusWords = set()
# Lemma -> number of documents containing it, gathered in the same pass so
# that the IDF step does not rescan every document for every lemma.
documentFrequency = Counter()

for txtFilename in txts:
    # NOTE(review): no encoding is passed, so the platform default applies --
    # confirm it matches the encoding of the text files.
    with open(txtsDir + txtFilename, 'r') as txtFile:
        contentString = txtFile.read()

    # Lemmas of the current document, in order of appearance.
    contentList = []
    for sentence in Tokenizer(contentString):
        for token in sentence:
            # Only tokens tagged as Cyrillic are considered.
            if not GraphemTag.contains(token.get_graphem_tag(), GraphemTag.CYRILLIC):
                continue
            # Normal form of the first parse returned by the analyzer.
            word = morphAnalyzer.parse(token.get_escaped_data())[0].normal_form
            if word not in russianStopwords:
                contentList.append(word)

    # Term frequency: occurrences of the lemma divided by the number of
    # lemmas kept for this document.
    contentTf = Counter(contentList)
    lemmaTotal = float(len(contentList))
    for word in contentTf:
        contentTf[word] = contentTf[word] / lemmaTotal
        corpusWords.add(word)
        documentFrequency[word] += 1

    corpusTf.append(contentTf)

# Inverse document frequency: log10(documents / documents containing the lemma).
documentTotal = len(corpusTf)
corpusIdf = {
    word: math.log10(documentTotal / documentFrequency[word])
    for word in corpusWords
}

# Scale every term frequency by its IDF in place; afterwards corpusTf holds
# TF-IDF weights.
for document in corpusTf:
    for word in document:
        document[word] *= corpusIdf[word]

In [None]:
from pprint import pprint
        
# Pretty-print the per-document weights (one Counter per document).
pprint(corpusTf)

### Это код для извлечения набора наиболее репрезентативных для выпускных работ различных учебных потоков слов.

Метод взят из статьи *The automated acquisition of topic signatures for text summarization*.

In [None]:
import os

# Directory of the track treated as relevant, and the directories of the
# tracks it is compared against.
relevantTxtsDir = '../../txt_cmc_diploms/2/'
nonRelevantTxtsDirs = ['../../txt_cmc_diploms/1/', '../../txt_cmc_diploms/3/']

# Paths of the relevant documents; the 'broken_encoding' entry is skipped.
relevantTxts = []
for filename in os.listdir(relevantTxtsDir):
    if filename != 'broken_encoding':
        relevantTxts.append(relevantTxtsDir + filename)

# Paths of the non-relevant documents, gathered from every other directory
# with the same 'broken_encoding' exclusion.
nonRelevantTxts = []
for nonRelevantTxtsDir in nonRelevantTxtsDirs:
    for filename in os.listdir(nonRelevantTxtsDir):
        if filename != 'broken_encoding':
            nonRelevantTxts.append(nonRelevantTxtsDir + filename)

print(relevantTxts)
print(nonRelevantTxts)

In [None]:
from __future__ import unicode_literals
from pyrutok import Token, Sentence, Tokenizer, GraphemTag
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from collections import Counter
from pymorphy2 import MorphAnalyzer
    
morphAnalyzer = MorphAnalyzer()

# Load the stop-word list once instead of once per token; a frozenset makes
# each membership test O(1).
russianStopwords = frozenset(stopwords.words('russian'))


def _countLemmas(txtFilenames):
    """Count non-stop-word Cyrillic lemmas over the given text files.

    Args:
        txtFilenames: iterable of paths to the text files to read.

    Returns:
        Counter mapping each lemma to its number of occurrences.
    """
    lemmaFreq = Counter()
    for txtFilename in txtFilenames:
        # NOTE(review): no encoding is passed, so the platform default
        # applies -- confirm it matches the encoding of the text files.
        with open(txtFilename, 'r') as txtFile:
            contentString = txtFile.read()
        for sentence in Tokenizer(contentString):
            for token in sentence:
                # Only tokens tagged as Cyrillic are counted.
                if not GraphemTag.contains(token.get_graphem_tag(), GraphemTag.CYRILLIC):
                    continue
                # Normal form of the first parse returned by the analyzer.
                word = morphAnalyzer.parse(token.get_escaped_data())[0].normal_form
                if word not in russianStopwords:
                    lemmaFreq[word] += 1
    return lemmaFreq


# Word frequencies in the theses of the relevant study track.
relevantFreq = _countLemmas(relevantTxts)

# Word frequencies in the theses of the two other study tracks.
nonRelevantFreq = _countLemmas(nonRelevantTxts)

# Frequency of all *other* words in the relevant track. This is the total
# minus the word's own count, which equals the former sum over every other
# word without the quadratic cost.
relevantTotal = sum(relevantFreq.values())
notRelevantFreq = Counter({
    word: relevantTotal - freq for word, freq in relevantFreq.items()
})

# Frequency of all *other* words in the two other tracks, computed the same way.
nonRelevantTotal = sum(nonRelevantFreq.values())
notNonRelevantFreq = Counter({
    word: nonRelevantTotal - freq for word, freq in nonRelevantFreq.items()
})

In [None]:
from __future__ import unicode_literals
from pyrutok import Token, Sentence, Tokenizer, GraphemTag
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from collections import Counter
from pymorphy2 import MorphAnalyzer
    
morphAnalyzer = MorphAnalyzer()

# Load the stop-word list once instead of once per token; a frozenset makes
# each membership test O(1).
russianStopwords = frozenset(stopwords.words('russian'))


def _txtsContainingWords(txtFilenames, vocabulary):
    """Record, for each vocabulary word, the files it occurs in.

    Args:
        txtFilenames: iterable of paths to the text files to read.
        vocabulary: container of lemmas to track; only these are recorded.

    Returns:
        dict mapping every vocabulary word to a Counter whose keys are the
        paths of the files containing the word (each value is 1), so its
        length is the number of such files.
    """
    txtsWithWord = {requiredWord: Counter() for requiredWord in vocabulary}
    for txtFilename in txtFilenames:
        # NOTE(review): no encoding is passed, so the platform default
        # applies -- confirm it matches the encoding of the text files.
        with open(txtFilename, 'r') as txtFile:
            contentString = txtFile.read()
        for sentence in Tokenizer(contentString):
            for token in sentence:
                # Only tokens tagged as Cyrillic are considered.
                if not GraphemTag.contains(token.get_graphem_tag(), GraphemTag.CYRILLIC):
                    continue
                # Normal form of the first parse returned by the analyzer.
                word = morphAnalyzer.parse(token.get_escaped_data())[0].normal_form
                if word not in russianStopwords and word in vocabulary:
                    txtsWithWord[word][txtFilename] = 1
    return txtsWithWord


# Number of relevant documents containing each considered word.
relevantTxtsWithWord = _txtsContainingWords(relevantTxts, relevantFreq)

# Number of non-relevant documents containing each considered word.
nonRelevantTxtsWithWord = _txtsContainingWords(nonRelevantTxts, relevantFreq)

In [None]:
# Share of each word among all counted words of the relevant track.
relevantFreqWordsCount = sum(relevantFreq.values())
relevantFreqNormalized = Counter({
    lemma: occurrences / relevantFreqWordsCount
    for lemma, occurrences in relevantFreq.items()
})

# Share of each word among all counted words of the other tracks.
nonRelevantFreqWordsCount = sum(nonRelevantFreq.values())
nonRelevantFreqNormalized = Counter({
    lemma: occurrences / nonRelevantFreqWordsCount
    for lemma, occurrences in nonRelevantFreq.items()
})

# Share of "every other word" in the relevant track; the denominator is the
# same total as for relevantFreqNormalized.
notRelevantFreqWordsCount = sum(relevantFreq.values())
notRelevantFreqNormalized = Counter({
    lemma: notRelevantFreq[lemma] / notRelevantFreqWordsCount
    for lemma in relevantFreq
})

# Share of "every other word" in the other tracks; the denominator is the
# same total as for nonRelevantFreqNormalized.
notNonRelevantFreqWordsCount = sum(nonRelevantFreq.values())
notNonRelevantFreqNormalized = Counter({
    lemma: notNonRelevantFreq[lemma] / notNonRelevantFreqWordsCount
    for lemma in nonRelevantFreq
})

In [None]:
import math
import scipy.special
from decimal import Decimal
    
def binomialDistribution(k, n, x):
    """Return C(n, k) * x**k * (1 - x)**(n - k) as a Decimal.

    Args:
        k: number of successes; passed to scipy.special.binom and used as
            the exponent of x.
        n: number of trials; n - k is the exponent of (1 - x).
        x: success probability. A Decimal, or any value Decimal() accepts
            (int, float, str); it is converted so that a plain float no
            longer raises TypeError in the Decimal arithmetic.

    Returns:
        The value of the binomial term as a decimal.Decimal.
    """
    x = Decimal(x)
    # scipy returns a floating-point coefficient; convert it before mixing
    # it with Decimal operands.
    coefficient = Decimal(float(scipy.special.binom(n, k)))
    return coefficient * pow(x, Decimal(k)) * pow(Decimal(1) - x, Decimal(n - k))

# Term weights: -2 * log10 of the likelihood ratio of the two hypotheses
# (the original author's note says the computation follows the paper).
termsWeightEstimation = Counter()

# Share of relevant documents among all documents. It does not depend on the
# word, so it is computed once; the guard keeps an empty corpus (no words to
# weigh) from dividing by zero.
totalTxts = len(relevantTxts) + len(nonRelevantTxts)
p = Decimal(len(relevantTxts) / totalTxts) if totalTxts else Decimal(0)

for requiredWord in relevantFreq:
    o11 = relevantFreqNormalized[requiredWord]
    # A word absent from the non-relevant statistics falls back to 1.
    o12 = nonRelevantFreqNormalized.get(requiredWord, 1)
    o21 = notRelevantFreqNormalized[requiredWord]
    o22 = notNonRelevantFreqNormalized.get(requiredWord, 1)

    # Document counts for this word; the shared denominator carries a +1.
    relevantDocs = len(relevantTxtsWithWord[requiredWord])
    nonRelevantDocs = len(nonRelevantTxtsWithWord[requiredWord])
    smoothedDocs = relevantDocs + nonRelevantDocs + 1
    p1 = Decimal(relevantDocs / smoothedDocs)
    p2 = Decimal((nonRelevantDocs + 1) / smoothedDocs)

    # Likelihood of the first hypothesis (same probability p in both terms).
    likelihoodH1 = binomialDistribution(o11, o11+o12, p) * binomialDistribution(o21, o21+o22, p)

    # Likelihood of the second hypothesis (separate probabilities p1 and p2).
    likelihoodH2 = binomialDistribution(o11, o11+o12, p1) * binomialDistribution(o21, o21+o22, p2)

    # Dividing by a zero Decimal raises and would abort the whole loop, so
    # report the offending word and move on to the next one.
    if likelihoodH2 == 0:
        print(requiredWord, o11, o12, o21, o22, p, p1, p2)
        continue

    termsWeightEstimation[requiredWord] = -2 * (likelihoodH1 / likelihoodH2).log10()

In [None]:
from pprint import pprint

# Pretty-print the 30 words with the largest weights as (word, weight) pairs.
pprint(termsWeightEstimation.most_common(30))