In [4]:
import re

import pandas as pd
import pymorphy2

text_df = pd.read_csv("content_description.csv", sep='\t')
text_df.head()

# break the test into words
corpus = []
# reg expression for searching words
regular_expr = r'\w+'
reg_expr_compiled = re.compile(regular_expr)

# form a dataset from individual words
for raw_text in text_df.description.values:
    # convert to lowercase
    raw_text_lower = raw_text.lower()
    # break text into words
    text_by_words = reg_expr_compiled.findall(raw_text_lower) 
    corpus.append(text_by_words)
    
# text normalization
normalized_corpus = []
morph = pymorphy2.MorphAnalyzer()
# normalize each word in the text
for token_list in corpus:
    normalized_token_list = []
    for word in token_list:
        parsed_token = morph.parse(word)
        normal_form = parsed_token[0].normal_form
        normalized_token_list.append(normal_form)
    normalized_corpus.append(normalized_token_list)

#  convert to DataFrame
doc_count = len(normalized_corpus)
doc_ids = []
tokens = []
# we form two list-columns of the dataframe
for doc_id in range(doc_count):
    for token in normalized_corpus[doc_id]:
        doc_ids.append(doc_id)
        tokens.append(token)

tokens_df = pd.DataFrame({
    'doc_id': doc_ids,
    'word': tokens
})

tokens_df = tokens_df.assign(dummy = 1)
# aggregate statistics
word_count_df = tokens_df.groupby(['doc_id','word'])['dummy'].count().reset_index()

word_count_df[word_count_df.doc_id==0].sort_values(by='dummy', ascending=False).head(10)


Unnamed: 0,doc_id,word,dummy
6,0,в,10
29,0,и,7
37,0,который,5
111,0,шерлок,4
77,0,с,4
84,0,сериал,4
75,0,риколетти,3
31,0,история,2
21,0,для,2
49,0,о,2


# Homework

**Easy Level Task** For each word, calculate the percentage of documents that contain that word. Form dataframe doc_frequency. The percentage of documents is calculated by the formula

$$
d = \frac{m}{n} \times 100
$$
where $m$ is the number of documents in which this word occurs, and $n$ is the total number of documents. Round the percentage to the nearest integer.

See what words are in the top - these are prepositions and particles. Such words are present in all documents, which means that they can be removed from the text so that the remaining words are more "meaningful". This will make text analysis easier.


In [48]:
docs_amount = len(word_count_df['doc_id'].unique())
words = word_count_df.groupby('word').agg({'doc_id':'count'})
words

Unnamed: 0_level_0,doc_id
word,Unnamed: 1_level_1
1895,1
1980,1
1983,1
2,1
kingsman,1
...,...
юношеский,1
являться,1
япония,1
японский,1


**Intermediate level task**

Use regular expressions to extract all **firstname+lastname** pairs from text.

* informal description of the regular expression: a pair of words following each other, each of which begins with a capital letter
* only `doc_id=3` needs to be parsed
* the text is taken from the source dataframe `text_df`
* capital letter in Russian text corresponds to character class `r'[A-Z]*'`


In [70]:
text_df = pd.read_csv("content_description.csv", sep='\t')

raw_text = text_df.description.values[3]

# write the required regular expression to the variable reg_expr 
reg_expr = r'\w+\s+([А-Я]\w+\s[А-Я]\w+)'

# add the compiled expression to the reg_expr_compiled object
reg_expr_compiled = re.compile(reg_expr)

# apply expression to text
for g in reg_expr_compiled.findall(raw_text):
    print(g)


Стивена Фрая
Джоном Дженксом
Теда Уоллеса


**Difficult level task**. The `genre_dict` dictionary contains words that are specific to a particular genre. Using the dictionary as well as the `word_count_df` table, create a table with two columns `doc_id | genre` with movie genres.

For this you need
* turn the `genre_dict` dictionary into a `word format DataFrame | genre`, save to vars `genres_df`
* connect the received dataframe using the `merge` function with the previously received dataframe `word_count_df`, which contains the distribution of words across documents. Use the join method `inner`
* for each document, select the genre of the document - this is a combination of genres of words of individual words. One content can have multiple genres


In [7]:
import itertools
import pandas as pd

genre_dict = {
    'комедия': ['сатирический', 'авантюрный', 'забавный'],
    'мелодрама': ['выбор', 'позор'],
    'сказка': ['приключения', 'милый', 'семейный'],
    'детектив': ['тайна', 'разгадать', 'загадочный'], 
    'триллер': ['ужас', 'зловещий', 'нерв']
}
gg = [[(i, j) for j in genre_dict[i]] for i in genre_dict]
hh = list(itertools.chain(*gg))
genres_df = pd.DataFrame(hh, columns=['genre', 'word'])


genres_df

Unnamed: 0,genre,word
0,комедия,сатирический
1,комедия,авантюрный
2,комедия,забавный
3,мелодрама,выбор
4,мелодрама,позор
5,сказка,приключения
6,сказка,милый
7,сказка,семейный
8,детектив,тайна
9,детектив,разгадать


In [8]:
merged_df = genres_df.merge(word_count_df, how = 'inner', left_on='word', right_on='word')
merged_df

Unnamed: 0,genre,word,doc_id,dummy
0,комедия,сатирический,3,1
1,комедия,авантюрный,4,1
2,комедия,забавный,4,1
3,мелодрама,выбор,2,1
4,мелодрама,позор,2,1
5,сказка,милый,1,1
6,детектив,тайна,0,1
7,детектив,разгадать,0,1
8,детектив,загадочный,0,1
9,триллер,ужас,0,1


In [9]:
merged_df.groupby('doc_id')['genre'].unique().reset_index()

Unnamed: 0,doc_id,genre
0,0,"[детектив, триллер]"
1,1,[сказка]
2,2,[мелодрама]
3,3,[комедия]
4,4,[комедия]
5,6,[триллер]
