In [99]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [133]:
from gensim.corpora.dictionary import Dictionary
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
import itertools
import ast

In [49]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    text = re.sub("n", ' ', text)

    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    #print(tokens)
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
                #print(temp_cach)
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
                #print(w,' : ',temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    #print(words_lem_without_stopwords)
    return words_lem_without_stopwords

def get_lda_vector(text):
    '''
    векторное представление новости
    '''
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(N_topic):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() 
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [35]:
news = pd.read_csv('articles.csv')
users = pd.read_csv('users_arrticles.csv')
target = pd.read_csv('users_churn.csv')

In [4]:
stopword_ru = stopwords.words('russian')

with open('stopwords.txt') as f:
    add_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += add_stopwords
len(stopword_ru)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antonromaskin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Обработка текста

In [16]:
%%time
# очистка текста
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 22.5 s, sys: 172 ms, total: 22.7 s
Wall time: 22.8 s


In [17]:
%%time
# лемматизация текста
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 3min 9s, sys: 374 ms, total: 3min 10s
Wall time: 3min 10s


In [18]:
#с формируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# создаём коллекцию текстов
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [19]:
# будем разбивать текст на 10 тем
N_topic = 10

In [20]:
# импортируем модель
from gensim.models import LdaModel

CPU times: user 44.1 s, sys: 3.69 s, total: 47.8 s
Wall time: 23.1 s


In [None]:
# обучаем модель на коллекции текстов
lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary)#, passes=10)

In [21]:
# сохраняем модель на диск
from gensim.test.utils import datapath
temp_file = datapath("model.lda")
lda.save(temp_file)

In [None]:
# загрузка ммодели с диска
lda = LdaModel.load(temp_file)

In [22]:
# создание новой коллекции из ранее не встречавшихся документов
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

In [23]:
# просмотр списка тем из полученной модели

x=lda.show_topics(num_topics=N_topic, num_words=15,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
 
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: газета ru рынок новый санкция сотрудник всё первый глава снижение решение доход экономика власть газпром
topic_1: фонд вода остров снимок рейтинг озеро следователь флот планета британский сосед бежать океан польский польша
topic_2: россия российский сша украина американский правительство рост сторона глава космический журнал эксперт нефть новый вопрос
topic_3: рубль млн банк млрд руб составить клетка цена тыс размер валюта сумма статья стоимость тело
topic_4: фестиваль гражданин конкурс дональд достигать хороший тенденция туроператор устойчивый игра россиянин тур величина египетский диск
topic_5: ракета марс египет превысить пуск соцсеть журнал звёздный понятие запуск es модификация спешить перо каир
topic_6: ребёнок газ всё день мужчина исследователь жизнь выяснить семья случай врач обнаружить найти произойти изз
topic_7: самолёт военный корабль турист район операция армия технология фонд помощь обнаружить система данные область экипаж
topic_8: учёный проект исследование земл

In [25]:
# создаём матрицу тем
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(N_topic)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(N_topic)]]

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,6,0.771234,0.0,0.0,0.0,0.129235,0.0,0.092844,0.0,0.0,0.0
1,4896,0.638109,0.0,0.0,0.0,0.340806,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.070815,0.0,0.0,0.670467,0.0,0.0,0.0,0.113932,0.128576
3,4898,0.0,0.0,0.0,0.0,0.157088,0.0,0.038497,0.0,0.0,0.795411
4,4899,0.0,0.0,0.264416,0.0,0.0,0.0,0.0,0.307397,0.0,0.407499


In [None]:
topic_matrix.head(5)

In [26]:
# словарь распределения текстов-тем
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(N_topic)]].values))

#### MEDIAN

In [122]:
# def get_user_embedding(user_articles_list):
    user_articles_list = ast.literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector1 = np.median(user_vector, 0)
    return user_vector1 

In [123]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(N_topic)]]

In [124]:
X = pd.merge(user_embeddings, target, 'left')

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,churn
0,u105138,0.0,0.015514,0.182512,0.044483,0.0,0.0,0.074867,0.0,0.176407,0.117138,0
1,u108690,0.179448,0.0,0.198227,0.083767,0.0,0.0,0.259687,0.021152,0.032242,0.053066,1
2,u108339,0.218373,0.006106,0.08047,0.108476,0.0,0.0,0.23662,0.115035,0.05276,0.02064,1


In [125]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(N_topic)]], 
                                                    X['churn'], random_state=0, test_size=0.25)

In [126]:
# обучим модель
logreg = LogisticRegression(C=1.0)
logreg.fit(X_train, y_train)

LogisticRegression()

In [127]:
# получаем прогноз для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

array([0.0393335 , 0.10496361, 0.92822041, 0.1481409 , 0.13181818,
       0.18433829, 0.26201603, 0.10949956, 0.07534441, 0.02121908])

In [130]:
precision, recall, thresholds = precision_recall_curve(y_test.values[:1000], preds[:1000])

fscore = (2 * precision * recall) / (precision + recall)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.288260, F-Score=0.723, Precision=0.658, Recall=0.803


In [134]:
roc_auc = roc_auc_score(y_test, preds)

In [147]:
data = {'Type': ['median'], 'F-Score': [fscore[ix]], 'Precision': [precision[ix]], 'Recall': [recall[ix]], 'Roc_auc_score': [roc_auc]}
median_embedding = pd.DataFrame(data)

In [146]:
median_embedding

Unnamed: 0,Type,F-Score,Precision,Recall,Roc_auc_score
0,median_embedding,0.723247,0.657718,0.803279,0.959923


#### MAX

In [149]:
def get_user_embedding(user_articles_list):
    user_articles_list = ast.literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector1 = np.max(user_vector, 0)
    return user_vector1

user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(N_topic)]]

X = pd.merge(user_embeddings, target, 'left')

# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(N_topic)]], 
                                                    X['churn'], random_state=0, test_size=0.25)

# обучим модель
logreg = LogisticRegression(C=1.0)
logreg.fit(X_train, y_train)

# получаем прогноз для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test.values[:1000], preds[:1000])

fscore = (2 * precision * recall) / (precision + recall)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

roc_auc = roc_auc_score(y_test, preds)

data = {'Type': ['max'], 'F-Score': [fscore[ix]], 'Precision': [precision[ix]], 'Recall': [recall[ix]], 'Roc_auc_score': [roc_auc]}
max_embedding = pd.DataFrame(data)

Best Threshold=0.421572, F-Score=0.647, Precision=0.682, Recall=0.615


#### MEAN

In [152]:
def get_user_embedding(user_articles_list):
    user_articles_list = ast.literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector1 = np.mean(user_vector, 0)
    return user_vector1

user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(N_topic)]]

X = pd.merge(user_embeddings, target, 'left')

# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(N_topic)]], 
                                                    X['churn'], random_state=0, test_size=0.25)

# обучим модель
logreg = LogisticRegression(C=1.0)
logreg.fit(X_train, y_train)

# получаем прогноз для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test.values[:1000], preds[:1000])

fscore = (2 * precision * recall) / (precision + recall)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

roc_auc = roc_auc_score(y_test, preds)

data = {'Type': ['mean'], 'F-Score': [fscore[ix]], 'Precision': [precision[ix]], 'Recall': [recall[ix]], 'Roc_auc_score': [roc_auc]}
mean_embedding = pd.DataFrame(data)

Best Threshold=0.351581, F-Score=0.697, Precision=0.706, Recall=0.689


### Сравниваем результаты


In [157]:
pd.concat([median_embedding, mean_embedding, max_embedding])

Unnamed: 0,Type,F-Score,Precision,Recall,Roc_auc_score
0,median,0.723247,0.657718,0.803279,0.959923
0,mean,0.697095,0.705882,0.688525,0.958244
0,max,0.646552,0.681818,0.614754,0.935817


#### TF-IDF

In [158]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)

In [None]:
news['title'][1]