# Homework 2

In [1]:
import pandas as pd

import re
import numpy as np
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize # сегментация русскоязычного текста на токены и предложения
import pymorphy2

In [2]:
news = pd.read_csv('articles.csv')
print(news.shape)
news.head(2)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...


In [3]:
news.iloc[2]['title']

'Форвард «Авангарда» Томаш Заборский прокомментировал игру своей команды в матче чемпионата КХЛ против «Атланта»n(4:3)n.nn«Мы провели плохой матч в Нижнем Новгороде против «Торпедо» и настраивались, что с первых же минут включимся в работу, — сказал Заборский. — У нас получилось забросить быстрый гол и задать хороший темп поединку. Мы можем играть еще лучше, но, с другой стороны, пять очков на выезде из девяти — это лучше, чем ничего».'

In [4]:
users = pd.read_csv('users_articles.csv')
users.sample(3)

Unnamed: 0,uid,articles
2094,u108186,"[3440, 842, 1216, 2014, 1121, 324035]"
3920,u108505,"[1616, 2338, 2329, 2452, 1518, 322176]"
5827,u107070,"[321904, 323516, 323935, 321874, 323083, 470857]"


In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adurz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
stopword_ru = stopwords.words('russian')
print(len(stopword_ru))

151


In [7]:
with open('stopwords.txt', encoding='utf-8') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]

In [8]:
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [9]:
# очистка текста

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub('-\s\r\n\|-\s\r\n|\r\n', '', str(text))
    
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    text = re.sub('n', ' ', text)
    
    return text

In [10]:
cache = {}
morph = pymorphy2.MorphAnalyzer()

In [11]:
# Лемматизация

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист лемматизированых токенов
    '''
    
    # [0]
    if not isinstance(text, str):
        text = str(text)
        
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]
    
    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w) > 1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords = [i for i in words_lem if not i in stopword_ru]
    
    return words_lem_without_stopwords

In [12]:
news['title'].iloc[:2].apply(lambda x: clean_text(x))

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
Name: title, dtype: object

In [13]:
%%time

from tqdm import tqdm
tqdm.pandas()

news['title'] = news['title'].progress_apply(clean_text)

100%|██████████████████████████████████████████████████████████████████████████| 27000/27000 [00:26<00:00, 1034.18it/s]

Wall time: 26.2 s





In [14]:
news['title'].iloc[:2]

0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
Name: title, dtype: object

In [15]:
news['title'].iloc[:2].apply(lambda x: lemmatization(x))

0    [заместитель, председатель, правительство, рф,...
1    [матч, финал, кубок, россия, футбол, приостано...
Name: title, dtype: object

In [16]:
%%time

news['title'] = news['title'].progress_apply(lemmatization)

100%|███████████████████████████████████████████████████████████████████████████| 27000/27000 [02:55<00:00, 153.65it/s]

Wall time: 2min 55s





In [17]:
# формируем список наших текстов
texts = list(news['title'].values)

In [18]:
# Создаем корпус из списка с текстами
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [19]:
len(common_dictionary)

135645

In [20]:
# Количество тем

N_topic = 20

In [24]:
# %%time
from gensim.models import LdaModel

# # Обучаем модель на корпусе
# lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary
#                , passes=10
#               )

In [25]:
# from gensim.test.utils import datapath

# # Сохраняем модель на диск
# temp_file = datapath('model.lda')
# lda.save(temp_file)

In [26]:
# загружаем обученную модель с диска
lda = LdaModel.load(temp_file)

In [27]:
# Создаем новый корпус документов, которые раньше не видели

other_texts = list(news['title'].iloc[:3])
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[1]
print(other_texts[1])
pfs = lda[unseen_doc]

['матч', 'финал', 'кубок', 'россия', 'футбол', 'приостановить', 'судья', 'изз', 'взрыв', 'пиротехнический', 'снаряд', 'передавать', 'корреспондент', 'газета', 'ru', 'болельщик', 'выбросить', 'поле', 'петарда', 'судья', 'увести', 'команда', 'поле', 'подтрибунный', 'помещение', 'динамовец', 'уйти', 'торпедовец', 'остаться', 'кромка', 'поле', 'матч', 'остановить', 'пять', 'минута', 'газета', 'ru', 'вести', 'онлайнтрансляция', 'матч']


In [28]:
x = lda.show_topics(num_topics=N_topic, num_words=7, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

In [29]:
# Печатаем только слова
for topic, words in topics_words:
    print(f'topic_{topic}: ' + ' '.join(words))

topic_0: газета всё исследование ru система новый научный
topic_1: украина украинский киев восток экипаж остров пётр
topic_2: всё день очень жизнь хороший большой температура
topic_3: иран треть кремль автобус железный новак корь
topic_4: медведев законодательство медицина лодка юрист грузия сценарий
topic_5: земля обнаружить область день место погибнуть выяснить
topic_6: ребёнок женщина мужчина сотрудник семья дом журнал
topic_7: игра команда место определение россия золото таиланд
topic_8: край энергия ступень краснодарский задать практический тесный
topic_9: наука москва проект программа мероприятие новый площадь
topic_10: район взрыв турция удар произойти армия поверхность
topic_11: россия российский глава вопрос сторона государство путин
topic_12: британский великобритания франция обращение французский миссия американский
topic_13: газ продукция спрос фестиваль европа германия участник
topic_14: сша военный российский американский россия новый китай
topic_15: северный южный корея 

In [30]:
# векторное представление новости

def get_lda_vector(lda, text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))
    
    output_vector = []
    for i in range(N_topic):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [31]:
get_lda_vector(lda, news['title'].iloc[0])

array([0.        , 0.        , 0.04545455, 0.01010877, 0.        ,
       0.01579277, 0.        , 0.09375294, 0.        , 0.        ,
       0.        , 0.11321323, 0.        , 0.        , 0.        ,
       0.        , 0.4015663 , 0.        , 0.        , 0.31384969])

In [32]:
%%time
topic_matrix = pd.DataFrame([get_lda_vector(lda, text) for text in news['title'].values])

Wall time: 31.3 s


In [33]:
topic_matrix.columns = [f'topic_{i}' for i in range(N_topic)]

In [35]:
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id'] + [f'topic_{i}' for i in range(N_topic)]]
topic_matrix.sample(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
2546,7441,0.026569,0.0,0.504659,0.0,0.018131,0.047345,0.0,0.196894,0.0,...,0.02102,0.152618,0.0,0.0,0.023794,0.0,0.0,0.0,0.0,0.0
25719,495340,0.369012,0.0,0.101668,0.0,0.0,0.220534,0.029204,0.0,0.0,...,0.0,0.063928,0.0,0.027229,0.0,0.0,0.0,0.17401,0.0,0.0
14000,477562,0.0,0.152639,0.0,0.0,0.096496,0.077251,0.025421,0.163655,0.0,...,0.027865,0.0,0.0,0.1179,0.0,0.025005,0.0,0.025764,0.0,0.0
8054,294440,0.189017,0.039236,0.02034,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.084189,0.099068,0.0,0.294871,0.0,0.0,0.0
5330,324093,0.0,0.067633,0.0,0.0,0.0,0.0,0.307944,0.0,0.0,...,0.11467,0.0,0.033246,0.0,0.0,0.0,0.171848,0.0,0.0,0.222906


## Векторное представление пользователей

In [53]:
users.shape

(8000, 2)

In [36]:
users.sample(3)

Unnamed: 0,uid,articles
5369,u101144,"[6684, 7739, 7088, 6312, 6077, 495267]"
2888,u107040,"[322436, 322130, 322212, 322372, 322483, 471634]"
7030,u107337,"[322149, 324651, 324314, 323550, 322359, 474821]"


In [37]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[[f'topic_{i}' for i in range(N_topic)]].values))

In [39]:
doc_dict[47]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04822944, 0.        , 0.04763711, 0.        , 0.0449315 ,
       0.        , 0.41901779, 0.        , 0.        , 0.02385179,
       0.        , 0.40836   , 0.        , 0.        , 0.        ])

In [69]:
def get_user_embedding(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
#     print(user_vector)
    user_vector = np.mean(user_vector, axis=0)
    return user_vector

In [70]:
user_articles_list = users['articles'].iloc[33]

get_user_embedding(user_articles_list, doc_dict)

array([0.15226346, 0.00257361, 0.01002406, 0.        , 0.00218105,
       0.02691051, 0.13687664, 0.        , 0.        , 0.02624031,
       0.00286789, 0.25497293, 0.01493789, 0.01063793, 0.14004493,
       0.        , 0.12734335, 0.04528635, 0.0021146 , 0.03049206])

In [71]:
%%time
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid'] + [f'topic_{i}' for i in range(N_topic)]]

Wall time: 460 ms


In [72]:
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.17846,0.044048,0.030954,0.0,0.0,0.049634,0.118016,0.019113,0.002872,...,0.046803,0.041865,0.028232,0.054681,0.014194,0.002827,0.131852,0.054415,0.0,0.025899
1,u108690,0.181981,0.043422,0.092483,0.001918,0.002519,0.041169,0.127751,0.0,0.0,...,0.02165,0.22667,0.014644,0.004154,0.042968,0.004813,0.079434,0.037582,0.0,0.024498
2,u108339,0.11541,0.0,0.071666,0.0,0.0,0.137762,0.183367,0.0,0.002144,...,0.039647,0.103544,0.013548,0.015642,0.048216,0.0,0.121698,0.026042,0.0,0.052754


In [74]:
target = pd.read_csv('users_churn.csv')
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0
