## PYTHON-PROJECT-2

---

Project 2 poses the question: how can the importance of a term (word) in a text be calculated? In other words, how strong is the "contrast" of a word in a document (how much does it stand out from the other words)?

To answer it, a basic text-analysis metric, **tf-idf**, is introduced.

**df** - document frequency

**idf** - inverse document frequency, the reciprocal of df: $idf = \frac{1}{df} = \frac{N}{d}$

$$ df = \frac{d}{N} $$

where

**N:** total number of documents in the corpus (e.g. chapters in book);

**d:** number of documents where the term t appears (e.g. in how many chapters the term t appears).

---

**tf** - term frequency

$$ tf = \frac{f}{M}$$

where

**f:** the raw count of a term in a document (e.g. how many times term t appears in chapter);

**M:** the raw count of all terms in a document (e.g. how many words in chapter).

---

In the implementation below, instead of the raw ``tf * idf`` value,
the following formula is used: $$ \log(1 + tf) \cdot \log(idf) $$

---

Task: Write code that prints the three words with the highest **tf-idf** value in the given chapter (`target_chapter`),
in descending tf-idf order, separated by spaces.

---

In [14]:
import math
import heapq

def read_data():
    """Read the pre-processed text and return it as a list of lines.

    The file 'war_peace_processed.txt' is opened in text mode, relative to the
    current working directory, and its content is split on newline
    characters. A file that ends with a newline therefore yields a final
    empty string in the result.

    Returns:
        list[str]: one entry per line of the file.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if read() raises.
    with open('war_peace_processed.txt', 'rt') as source:
        return source.read().split('\n')

def create_dict(input_data):
    """Count how many times each item occurs in *input_data*.

    Args:
        input_data: iterable of hashable items (here: words).

    Returns:
        dict: item -> number of occurrences, keys in first-seen order.
    """
    counts = {}
    for token in input_data:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def create_chapters_dicts(data):
    """Split the token list into chapters and count the words of each one.

    A '[new chapter]' entry separates two chapters and belongs to neither of
    them. A marker in the very first position opens chapter 0 instead of
    closing an empty chapter; adjacent markers elsewhere (or a trailing
    marker) produce an empty chapter.

    Args:
        data: list of tokens (words and '[new chapter]' markers).

    Returns:
        tuple: (chapters_dicts, chapters_lengths) where chapters_dicts[i] maps
        each word of chapter i to its number of occurrences and
        chapters_lengths[i] is the total number of words in chapter i.
    """
    marker = '[new chapter]'
    # A leading marker only announces the first chapter, so start after it.
    start = 1 if len(data) > 0 and data[0] == marker else 0
    # Real positions of the markers; the end of the data closes the last chapter.
    chapter_ends = [i for i in range(start, len(data)) if data[i] == marker]
    chapter_ends.append(len(data))

    chapters_dicts = []
    chapters_lengths = []
    for end in chapter_ends:
        chapter = data[start:end]
        chapters_dicts.append(create_dict(chapter))
        chapters_lengths.append(len(chapter))
        # Resume after the marker so it is never counted as a word.
        start = end + 1
    return chapters_dicts, chapters_lengths

def get_term_frequency(chapters_dicts, chapters_lengths, target_word, target_chapter):
    """Return the term frequency tf = f / M of a word in one chapter.

    Args:
        chapters_dicts: sequence of per-chapter {word: count} mappings.
        chapters_lengths: sequence holding the total word count of each chapter.
        target_word: the word to look up.
        target_chapter: index of the chapter in both sequences.

    Returns:
        float: occurrences of target_word in the chapter divided by the
        chapter's length, or 0.0 when the chapter has no words.

    Raises:
        IndexError: if target_chapter is out of range.
    """
    occurrences = chapters_dicts[target_chapter].get(target_word, 0)
    chapter_length = chapters_lengths[target_chapter]
    if chapter_length == 0:
        # An empty chapter contains no terms at all; avoid dividing by zero.
        return 0.0
    return occurrences / chapter_length

def get_document_frequency(chapters_dict, target_word):
    """Return the document frequency df = d / N of a word.

    Args:
        chapters_dict: sequence of per-chapter {word: count} mappings.
        target_word: the word to look up.

    Returns:
        float: number of chapters containing target_word divided by the total
        number of chapters.
    """
    containing_chapters = sum(
        1 for chapter_counts in chapters_dict if target_word in chapter_counts
    )
    return containing_chapters / len(chapters_dict)

def get_tf_idf(chapters_dicts, chapters_lengths, target_word, target_chapter):
    """Return the score log(1 + tf) * log(idf) of a word in one chapter.

    tf comes from get_term_frequency, and idf is the reciprocal of the value
    returned by get_document_frequency.

    Args:
        chapters_dicts: sequence of per-chapter {word: count} mappings.
        chapters_lengths: sequence holding the total word count of each chapter.
        target_word: the word to score.
        target_chapter: index of the chapter the score is computed for.

    Returns:
        float: the score; 0.0 for a word that occurs in no chapter.
    """
    df = get_document_frequency(chapters_dicts, target_word)
    tf = get_term_frequency(chapters_dicts, chapters_lengths, target_word, target_chapter)
    if df == 0:
        # A word found in no chapter has tf == 0, so log(1 + tf) == 0 and the
        # product is 0; return it without evaluating 1 / df.
        return 0.0
    idf = 1. / df
    return math.log(1 + tf) * math.log(idf)

def get_tf_idf_for_all(data, target_chapter, number):
    """Return the highest-scoring words of one chapter.

    Args:
        data: list of tokens with '[new chapter]' markers, in the form
            accepted by create_chapters_dicts.
        target_chapter: index of the chapter in the lists returned by
            create_chapters_dicts.
        number: maximum number of words to return.

    Returns:
        list[tuple]: up to `number` (word, score) pairs taken from the target
        chapter, ordered from the highest score to the lowest.

    Raises:
        IndexError: if target_chapter does not exist.
    """
    # The per-chapter dictionaries are all that is needed to address a
    # chapter; no separate boundary lookup is done, so the last chapter is
    # handled exactly like any other.
    chapters_dicts, chapters_lengths = create_chapters_dicts(data)
    # Score every distinct word of the target chapter.
    tf_idf_values = {
        word: get_tf_idf(chapters_dicts, chapters_lengths, word, target_chapter)
        for word in chapters_dicts[target_chapter]
    }
    return heapq.nlargest(number, tf_idf_values.items(), key=lambda item: item[1])

def print_words_with_highest_tf_idf(data, target_chapter_idx, number=3):
    """Print the top-scoring words of a chapter on one line.

    The words come from get_tf_idf_for_all, in the order it returns them, and
    are printed separated by single spaces.

    Args:
        data: token list passed through to get_tf_idf_for_all.
        target_chapter_idx: index of the chapter to analyse.
        number: how many words to print (default 3).
    """
    top_pairs = get_tf_idf_for_all(data, target_chapter_idx, number)
    # Each pair is (word, score); only the word is printed.
    print(*(word for word, _score in top_pairs))

In [15]:
# Chapter to analyse; the value is used as a list index into the per-chapter data.
target_chapter_idx = 4
# Load the token list from 'war_peace_processed.txt'.
data = read_data()
# Print the three highest-scoring words of that chapter, best first.
print_words_with_highest_tf_idf(data, target_chapter_idx, number=3)

павловна анна тетушку
