# Задание

Помощник специалиста по найму персонала

* Если вы не знаете, что такое метрика TF-IDF - почитайте https://en.wikipedia.org/wiki/Tf%E2%80%93idf
* Напишите Python-скрипт, который:
    * прочитает файл http://www.lib.ru/POEEAST/GOMER/gomer01.txt (не обязательно из интернета, можно скачать руками и прочитать локально как файл)
    * выкинет все, что не является стихотворными строками (он может работать не идеально, что-то не выкидывать или выкидывать что-то лишнее, но чем выше точность, тем лучше)
    * откинет все, что не буквы и не проблемы - кроме дефисов, справа и слева от которых стоят буквы (т.е. '-' в слове 'когда-либо' удалять не нужно)
    * преобразует все слова в нижний регистр
    * посчитает метрики tf-df для всех встреченных слов, считая, что каждая песнь - это отдельный документ
    * сохранит посчитанные метрики в csv-файл с двумя столбцами - word и tfidf (можно использовать библиотеку Pandas, но не обязательно)


In [1]:
import pandas as pd
import math

with open('gomer01.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
print(text[:500])

Гомер. Илиада



----------------------------------------------------------------------------
   (пер. с древнегреческого Н. Гнедича)
----------------------------------------------------------------------------

                                Л.И. Зайцев

ДРЕВНЕГРЕЧЕСКИЙ ГЕРОИЧЕСКИЙ ЭПОС И "ИЛИАДА" ГОМЕРА


     Как  мы  узнали  в  результате  многолетних раскопок, начатых в 1870 г.
Генрихом  Шлиманом  и  законченных  перед второй мировой войной американским
археологом  Блегеном, примерно пять 


In [2]:
# Делим тексты по слову ПЕСНЬ
songs = text.split("ПЕСНЬ ")
# Пропускаем первый текст, поскольку он является введением
songs = songs[1:]
# В 6-ой, 9-ой и 10-ой поэме дополнительно удаляем пункт ПРИМЕЧАНИЕ
chapters_with_additional_info = [5, 8, 9]
for chapter in chapters_with_additional_info:
    songs[chapter] = songs[chapter].split('ПРИМЕЧАНИЯ')[0]

In [3]:
# Функция для предобработки текста
def transform_text(text):
    good_symbols = ' -'
    # заменим перевод строки на пробел (так можно отделить порядковый номер главы от текста)
    text = text.replace('\n', ' ')
    # оставляем только буквы, пробелы и -
    text = ''.join([symb for symb in text if symb.isalpha() or symb in good_symbols])
    # удаляем слова, которые равные '-'
    # таким образом оставим - внутри слов
    tokens = [token for token in text.split() if token != '-' * len(token)]
    # есть слова, которые начинаются или заканчиваются на '-'
    # не ясно почему, в них удалим первый символ или последний символ соответственно
    tokens = [token[1:] if token.startswith('-') else token for token in tokens]
    tokens = [token[:-1] if token.endswith('-') else token for token in tokens]
    text = ' '.join(tokens)
    # приводим текст к нижнему регистру
    text = text.lower()
    # отделяем порядковый номер главы
    tokens = text.split()[1:]
    
    return ' '.join(tokens)

In [4]:
transformed_songs = [transform_text(song) for song in songs]

In [5]:
transformed_songs[8][:100]

'посольство так охраняли трояне свой стан но ахеян волнует ужас свыше ниспосланный бегства дрожащего '

### 1. Sklearn

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(transformed_songs)
tf_idf = tf_idf.toarray()
df = pd.DataFrame(tf_idf)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22125,22126,22127,22128,22129,22130,22131,22132,22133,22134
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009003,0.010336,0.0
1,0.0,0.02783,0.0,0.0,0.0,0.010482,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006758,0.007758,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.015317,0.0,0.0,0.017308,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.010613,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006842,0.0,0.0


In [7]:
df.columns = vectorizer.get_feature_names()
df = df.T
df.rename({i: 'Песнь {}'.format(i + 1) for i in range(len(df.columns))}, axis=1, inplace=True)
df.head()

Unnamed: 0,Песнь 1,Песнь 2,Песнь 3,Песнь 4,Песнь 5,Песнь 6,Песнь 7,Песнь 8,Песнь 9,Песнь 10,...,Песнь 15,Песнь 16,Песнь 17,Песнь 18,Песнь 19,Песнь 20,Песнь 21,Песнь 22,Песнь 23,Песнь 24
cнова,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011081
абантов,0.0,0.02783,0.0,0.015317,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
абарбареей,0.0,0.0,0.0,0.0,0.0,0.017013,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
абаса,0.0,0.0,0.0,0.0,0.010613,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
абида,0.0,0.0,0.0,0.017308,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


 В слове 'снова' первая буква английская, поэтому оно в начале таблицы

In [8]:
df.to_csv('gomer_tf_idf.csv', encoding='utf-8')

### 2. Custom

In [9]:
import collections

def tf(text):
    tokens = text.split()
    tf_dict = dict(collections.Counter(tokens))
    for key in tf_dict.keys():
        tf_dict[key] = tf_dict[key] / len(tokens)
    return tf_dict
    
def idf(documents, words):
    df_dict = {
        word: 0
        for word in words
    }
    for document in documents:
        tokens = set(document.split())
        for token in tokens:
            df_dict[token] = df_dict[token] + 1
    for word in words:
        df_dict[word] = math.log(len(documents) / df_dict[word])
    
    return df_dict
        
def get_all_words_from_text(texts):
    words = set()
    for text in texts:
        for token in text.split():
            words.add(token)
            
    return sorted(words)

def tf_idf(documents):
    words = get_all_words_from_text(documents)
    idf_dict = idf(documents, words)
    tf_idf_table = [
        [0 for _ in range(len(documents))]
        for word in words
    ]

    df = pd.DataFrame(tf_idf_table)
    df.index = words
    df.rename({i: 'Песнь {}'.format(i + 1) for i in range(len(df.columns))}, axis=1, inplace=True)
    for i, document in enumerate(documents):
        tf_idf_dict = tf(document)
        for word in tf_idf_dict.keys():
            tf_idf_dict[word] = tf_idf_dict[word] * idf_dict[word]
        
        df[df.columns[i]] = df.index.to_series().map(tf_idf_dict)
            
    df.fillna(0, inplace=True)
    
    return df

In [10]:
df = tf_idf(transformed_songs)
df.head()

Unnamed: 0,Песнь 1,Песнь 2,Песнь 3,Песнь 4,Песнь 5,Песнь 6,Песнь 7,Песнь 8,Песнь 9,Песнь 10,...,Песнь 15,Песнь 16,Песнь 17,Песнь 18,Песнь 19,Песнь 20,Песнь 21,Песнь 22,Песнь 23,Песнь 24
cнова,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000503
а,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
абантов,0.0,0.001121,0.0,0.000602,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
абарбареей,0.0,0.0,0.0,0.0,0.0,0.000785,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
абаса,0.0,0.0,0.0,0.0,0.000457,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.to_csv('gomer_tf_idf.csv', encoding='utf-8')

 Итоговые значения отличаются, поскольку в sklearnе TF-IDF работает немного по-другому