In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
from scipy.sparse import lil_matrix
from scipy import interp
from itertools import cycle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# 0) Приведём данные к нормальному формату и удалим повторяющиеся строки

In [2]:
# The corpus is Cyrillic text. Pin the codec instead of relying on the platform default:
# the output file is read back with pd.read_csv, whose default encoding is UTF-8.
fin = open("parse_date_morph.txt", "r", encoding="utf-8")
fout = open("data_morph.txt", "w", encoding="utf-8")

In [3]:
# Each raw line is treated as `user,<tokens>,label`: only the first and the last comma are
# CSV separators. Every comma strictly between them is replaced by '%' so that the file
# parses as three CSV columns.
text2 = []
for s in fin:
    first_comma = s.find(",")
    last_comma = s.rfind(",")
    new_s = s
    if first_comma != last_comma:
        inner = s[first_comma + 1:last_comma].replace(",", "%")
        new_s = s[:first_comma + 1] + inner + s[last_comma:]
    text2.append(new_s)
print(len(text2))

# Drop duplicate lines but keep first-seen order. A plain set() would reorder the rows
# differently in every kernel (string hashing is randomised), which changes the unshuffled
# cross-validation folds and makes results irreproducible.
seen_lines = set()
unique_lines = []
for line in text2:
    if line not in seen_lines:
        seen_lines.add(line)
        unique_lines.append(line)
text2 = unique_lines
print(len(text2))
print(text2[0])

57586
53984
unicorn_thunder74,[на%на%PREP] [подъезде%подъезд%NOUN] [нашего%наш%ADJF] [дома%дом%NOUN] [миша%миша%NOUN] [аж%аж%CONJ] [сигаретой%сигарета%NOUN] [подавился%подавиться%VERB] ,P



In [4]:
# Header row first, then the normalised lines exactly as they were read (no separator added).
fout.write("".join(["user,sentence,y\n"] + text2))

9071774

In [5]:
# Release both handles; closing the output also flushes it before it is read back below.
for handle in (fin, fout):
    handle.close()

# 1) Загрузим данные

In [6]:
# Load the normalised three-column file (user, sentence, y) and preview the first rows.
data = pd.read_csv(filepath_or_buffer="data_morph.txt")
data.head(n=5)

Unnamed: 0,user,sentence,y
0,unicorn_thunder74,[на%на%PREP] [подъезде%подъезд%NOUN] [нашего%н...,P
1,__gusenova_,[какая%какой%ADJF] [красота%красота%NOUN],P
2,diamonte_showroom_armenia,[сколько%сколько%CONJ],P
3,____rimmulya____,[тина%тина%NOUN],P
4,thekatetrump,[это%это%PRCL] [был%быть%VERB] [круто%круто%AD...,P


# 2) Преобразуем их в признаки

In [7]:
def get_words_and_bigrams(sentences):
    """Collect the vocabularies of all five feature families found in ``sentences``.

    Each sentence is a whitespace-separated run of ``[form%lemma%pos]`` tokens. A token
    whose ``pos`` is the string ``"None"`` is skipped and also breaks the bigram chain.

    Feature families (keys are tuples):
      * words_form:        ``(form,)``
      * words_lemma:       ``(lemma, pos)``
      * bigrams_form:      ``(prev_form, pos)`` and ``(prev_pos, form)``
      * bigrams_lemma:     ``((prev_lemma, prev_pos), pos)`` and ``(prev_pos, (lemma, pos))``
      * bigrams_all_lemma: ``((prev_lemma, prev_pos), (lemma, pos))``

    Features are numbered in order of first appearance, so the column layout is the same
    on every run (iterating a ``set`` of strings is not stable across interpreter starts).

    Returns
    -------
    tuple
        Five feature lists followed by five ``feature -> index`` dicts, in the family
        order listed above; ``list[i]`` is the feature whose dict value is ``i``.

    Raises
    ------
    ValueError
        If a token does not split into exactly three ``%``-separated parts.
    """
    list_words_form, dict_words_form = [], {}
    list_words_lemma, dict_words_lemma = [], {}
    list_bigrams_form, dict_bigrams_form = [], {}
    list_bigrams_lemma, dict_bigrams_lemma = [], {}
    list_bigrams_all_lemma, dict_bigrams_all_lemma = [], {}

    def register(features, index, feature):
        # Assign the next free column to a feature the first time it is seen.
        if feature not in index:
            index[feature] = len(features)
            features.append(feature)

    for sentence in sentences:
        previous = None  # (form, (lemma, pos)) of the preceding usable token, if any
        for token in sentence.strip().split():
            # token[1:-1] strips the surrounding square brackets
            form, lemma, pos = token[1:-1].split('%')
            if pos == "None":
                previous = None
                continue
            tagged_lemma = (lemma, pos)
            register(list_words_form, dict_words_form, (form,))
            register(list_words_lemma, dict_words_lemma, tagged_lemma)
            if previous is not None:
                prev_form, prev_tagged = previous
                prev_pos = prev_tagged[1]
                register(list_bigrams_form, dict_bigrams_form, (prev_form, pos))
                register(list_bigrams_lemma, dict_bigrams_lemma, (prev_tagged, pos))
                register(list_bigrams_form, dict_bigrams_form, (prev_pos, form))
                register(list_bigrams_lemma, dict_bigrams_lemma, (prev_pos, tagged_lemma))
                register(list_bigrams_all_lemma, dict_bigrams_all_lemma, (prev_tagged, tagged_lemma))
            previous = (form, tagged_lemma)

    return (list_words_form, list_words_lemma, list_bigrams_form, list_bigrams_lemma,
            list_bigrams_all_lemma, dict_words_form, dict_words_lemma, dict_bigrams_form,
            dict_bigrams_lemma, dict_bigrams_all_lemma)
    
def make_table(users, sentences, y):
    """Build count matrices and per-feature statistics for all five feature families.

    Parameters
    ----------
    users : sequence
        Author of each sentence, aligned position-by-position with ``sentences``.
    sentences : sequence of str
        Whitespace-separated ``[form%lemma%pos]`` tokens; tokens whose ``pos`` is the
        string ``"None"`` are skipped and break the bigram chain.
    y : sequence
        One label per sentence; ``1`` is counted as positive, any other value as negative.

    Returns
    -------
    tuple of 20 items, in this order
        * five ``lil_matrix`` objects of shape (n_sentences, n_features) with occurrence counts
          (words_form, words_lemma, bigrams_form, bigrams_lemma, bigrams_all_lemma);
        * the five feature lists and the five ``feature -> column`` dicts returned by
          ``get_words_and_bigrams``;
        * five integer arrays of shape (n_features, 3) whose columns are: occurrences in
          positive sentences, occurrences in other sentences, number of distinct users.
    """
    # Work on plain lists so that row i of every matrix, users[i] and y[i] all refer to the
    # same position. Indexing a pandas Series with [i] is label-based and would silently
    # pick a different row (or raise) once the index is not 0..n-1.
    users, sentences, y = list(users), list(sentences), list(y)

    vocabularies = get_words_and_bigrams(sentences)
    feature_lists, feature_dicts = vocabularies[:5], vocabularies[5:]
    WORDS_FORM, WORDS_LEMMA, BIGRAMS_FORM, BIGRAMS_LEMMA, BIGRAMS_ALL_LEMMA = range(5)

    n_rows = len(sentences)
    matrices = [lil_matrix((n_rows, len(features))) for features in feature_lists]
    # Column 0: hits in positive sentences, column 1: hits in all other sentences.
    label_counts = [np.zeros((len(features), 2), dtype=int) for features in feature_lists]
    user_sets = [[set() for _ in features] for features in feature_lists]

    def record(family, feature, row, label_col, user):
        # Register one occurrence of `feature` in sentence `row`.
        col = feature_dicts[family][feature]
        matrices[family][row, col] += 1
        label_counts[family][col, label_col] += 1
        user_sets[family][col].add(user)

    for row, sentence in enumerate(sentences):
        label_col = 0 if y[row] == 1 else 1
        user = users[row]
        previous = None  # (form, (lemma, pos)) of the preceding usable token, if any
        for token in sentence.strip().split():
            # token[1:-1] strips the surrounding square brackets
            form, lemma, pos = token[1:-1].split('%')
            if pos == "None":
                previous = None
                continue
            tagged_lemma = (lemma, pos)
            record(WORDS_FORM, (form,), row, label_col, user)
            record(WORDS_LEMMA, tagged_lemma, row, label_col, user)
            if previous is not None:
                prev_form, prev_tagged = previous
                prev_pos = prev_tagged[1]
                record(BIGRAMS_FORM, (prev_form, pos), row, label_col, user)
                record(BIGRAMS_FORM, (prev_pos, form), row, label_col, user)
                record(BIGRAMS_LEMMA, (prev_tagged, pos), row, label_col, user)
                record(BIGRAMS_LEMMA, (prev_pos, tagged_lemma), row, label_col, user)
                record(BIGRAMS_ALL_LEMMA, (prev_tagged, tagged_lemma), row, label_col, user)
            previous = (form, tagged_lemma)

    # Append the distinct-user count as a third column; reshape keeps the result
    # two-dimensional even when a vocabulary is empty.
    stats = [
        np.hstack([counts, np.array([len(s) for s in sets], dtype=int).reshape(-1, 1)])
        for counts, sets in zip(label_counts, user_sets)
    ]
    return tuple(matrices) + tuple(feature_lists) + tuple(feature_dicts) + tuple(stats)


In [8]:
# Encode the label column: 'P' -> 1, every other value -> -1.
y = np.where(data['y'] == 'P', 1, -1)

In [9]:
# Unpack the 20-item result of make_table, grouped by kind.
(
    # sparse count matrices
    matrix_words_form, matrix_words_lemma, matrix_bigrams_form,
    matrix_bigrams_lemma, matrix_bigrams_all_lemma,
    # feature lists (column -> feature)
    list_words_form, list_words_lemma, list_bigrams_form,
    list_bigrams_lemma, list_bigrams_all_lemma,
    # feature dicts (feature -> column)
    dict_words_form, dict_words_lemma, dict_bigrams_form,
    dict_bigrams_lemma, dict_bigrams_all_lemma,
    # per-feature statistics
    stat_words_form, stat_words_lemma, stat_bigrams_form,
    stat_bigrams_lemma, stat_bigrams_all_lemma,
) = make_table(data['user'], data['sentence'], y)

# 3) Теперь будем запускать алгоритм логистической регрессии, чтобы восстанавливать y по матрицам (5 разных матриц).
## Будем максимизировать метрику ROC-AUC на кросс-валидации, чтобы подобрать оптимальный коэффициент C.

In [10]:
def cross_val_roc_auc(classifier, X, y, n_splits=6):
    """Return the area under the fold-averaged ROC curve of ``classifier``.

    The classifier is refitted on each stratified fold. Every fold's ROC curve is
    resampled onto a common grid of 100 false-positive rates, the curves are averaged
    point-wise, the end points are pinned to (0, 0) and (1, 1), and the area under the
    averaged curve is returned.

    Parameters
    ----------
    classifier : estimator
        Must provide ``fit`` and ``predict_proba``; column 1 of ``predict_proba`` is used
        as the score passed to ``roc_curve``.
    X : indexable matrix
        Feature matrix; must support indexing with an array of row indices.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.
    n_splits : int, default 6
        Number of stratified folds.

    Returns
    -------
    float
        AUC of the mean ROC curve.
    """
    cv = StratifiedKFold(n_splits=n_splits)
    mean_fpr = np.linspace(0, 1, 100)
    tpr_sum = np.zeros_like(mean_fpr)

    for train, test in cv.split(X, y):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
        # np.interp is the function that the deprecated scipy.interp alias pointed to.
        tpr_sum += np.interp(mean_fpr, fpr, tpr)

    mean_tpr = tpr_sum / cv.get_n_splits(X, y)
    # Pin the curve's end points so it always runs from (0, 0) to (1, 1).
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    return auc(mean_fpr, mean_tpr)

Подбираем коэффициент $C$, максимизируя $ROC-AUC$ (метрика качества алгоритма классификации).

In [31]:
# Sweep the inverse regularisation strength C for the word-form features.
for C in [0.1, 0.18, 0.19, 0.2, 0.21, 0.22, 0.25]:
    # The L1 penalty needs a solver that supports it. Name liblinear explicitly (the
    # solver shown in the recorded model reprs) instead of relying on the library default.
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear', n_jobs=2, class_weight='balanced')
    print(C, cross_val_roc_auc(model, matrix_words_form, y))

0.1 0.745978700258
0.18 0.747780064086
0.19 0.74779615291
0.2 0.747747463532
0.21 0.747651378613
0.22 0.74760294144
0.25 0.746684907701


Лучший результат для форм слова - $C=0.19$, $ROC-AUC=0.74780$

In [32]:
# Sweep the inverse regularisation strength C for the word-lemma features.
for C in [0.1, 0.13, 0.14, 0.15, 0.16, 0.17, 0.2]:
    # The L1 penalty needs a solver that supports it. Name liblinear explicitly (the
    # solver shown in the recorded model reprs) instead of relying on the library default.
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear', n_jobs=2, class_weight='balanced')
    print(C, cross_val_roc_auc(model, matrix_words_lemma, y))

0.1 0.756653231266
0.13 0.758059518402
0.14 0.758095474946
0.15 0.758127546829
0.16 0.75829254606
0.17 0.758155463469
0.2 0.757828705935


Лучший результат для лемм слова - $C=0.16$, $ROC-AUC=0.75829$, то есть леммы лучше, чем формы слова

In [33]:
# Sweep the inverse regularisation strength C for the form-bigram features.
for C in [0.1, 0.13, 0.14, 0.15, 0.16, 0.2]:
    # The L1 penalty needs a solver that supports it. Name liblinear explicitly (the
    # solver shown in the recorded model reprs) instead of relying on the library default.
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear', n_jobs=2, class_weight='balanced')
    print(C, cross_val_roc_auc(model, matrix_bigrams_form, y))

0.1 0.679542815189
0.13 0.681131205203
0.14 0.681279228305
0.15 0.68083900189
0.16 0.680458307122
0.2 0.680590639391


Лучший результат для форм биграмм: $C=0.14$, $ROC-AUC=0.68128$, то есть биграммы гораздо хуже, чем слова

In [35]:
# Sweep the inverse regularisation strength C for the lemma-bigram features.
for C in [0.08, 0.09, 0.1, 0.11]:
    # The L1 penalty needs a solver that supports it. Name liblinear explicitly (the
    # solver shown in the recorded model reprs) instead of relying on the library default.
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear', n_jobs=2, class_weight='balanced')
    print(C, cross_val_roc_auc(model, matrix_bigrams_lemma, y))

0.08 0.684871938054
0.09 0.685754548699
0.1 0.686717709852
0.11 0.686467087105


Лучший результат для лемм биграмм: $C=0.1$, $ROC-AUC=0.68672$, то есть для биграмм тоже леммы лучше, чем формы

In [22]:
# Sweep the inverse regularisation strength C for the lemma+lemma bigram features.
for C in [0.2, 0.3, 0.4]:
    # The L1 penalty needs a solver that supports it. Name liblinear explicitly (the
    # solver shown in the recorded model reprs) instead of relying on the library default.
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear', n_jobs=2, class_weight='balanced')
    print(C, cross_val_roc_auc(model, matrix_bigrams_all_lemma, y))

0.2 0.64924700943
0.3 0.653093413473
0.4 0.650396232896


Лучший результат для биграмм вида «лемма + лемма»: $C=0.3$, $ROC-AUC=0.65309$

# 4) Запустим алгоритм на данных

In [11]:
# Final word-form model with the C chosen by the sweep. The solver is named explicitly
# because the L1 penalty is not supported by every solver.
model_words_form = LogisticRegression(penalty='l1', C=0.19, solver='liblinear', n_jobs=2, class_weight='balanced')
model_words_form.fit(matrix_words_form, y)

LogisticRegression(C=0.19, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=2, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [12]:
# Final word-lemma model with the C chosen by the sweep. The solver is named explicitly
# because the L1 penalty is not supported by every solver.
model_words_lemma = LogisticRegression(penalty='l1', C=0.16, solver='liblinear', n_jobs=2, class_weight='balanced')
model_words_lemma.fit(matrix_words_lemma, y)

LogisticRegression(C=0.16, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=2, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [13]:
# Final form-bigram model with the C chosen by the sweep. The solver is named explicitly
# because the L1 penalty is not supported by every solver.
model_bigrams_form = LogisticRegression(penalty='l1', C=0.14, solver='liblinear', n_jobs=2, class_weight='balanced')
model_bigrams_form.fit(matrix_bigrams_form, y)

LogisticRegression(C=0.14, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=2, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [18]:
# Final lemma-bigram model with the C chosen by the sweep. The solver is named explicitly
# because the L1 penalty is not supported by every solver.
model_bigrams_lemma = LogisticRegression(penalty='l1', C=0.10, solver='liblinear', n_jobs=2, class_weight='balanced')
model_bigrams_lemma.fit(matrix_bigrams_lemma, y)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=2, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [19]:
# Final lemma+lemma bigram model with the C chosen by the sweep. The solver is named
# explicitly because the L1 penalty is not supported by every solver.
model_bigrams_all_lemma = LogisticRegression(penalty='l1', C=0.30, solver='liblinear', n_jobs=2, class_weight='balanced')
model_bigrams_all_lemma.fit(matrix_bigrams_all_lemma, y)

LogisticRegression(C=0.3, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=2, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

# 5) Посчитаем веса признаков в каждом случае

Вместе с весом выводим статистику по каждому признаку. Последние три числа — сколько раз признак встречается в положительных предложениях, сколько раз в отрицательных и сколько разных пользователей его употребляют.

In [20]:
def _weights_table(model, features, stats):
    """Return a DataFrame of (weight, description) rows sorted by descending weight.

    ``description`` is the text form of ``(feature, positive_count, negative_count,
    user_count)``. The counts are converted to plain ``int`` first so the text does not
    depend on how the installed NumPy version prints its scalar types.
    """
    rows = [
        (weight, str((feature,) + tuple(int(value) for value in stat_row)))
        for weight, feature, stat_row in zip(model.coef_[0], features, stats)
    ]
    return pd.DataFrame(sorted(rows, reverse=True))


weights_words_form = _weights_table(model_words_form, list_words_form, stat_words_form)

weights_words_lemma = _weights_table(model_words_lemma, list_words_lemma, stat_words_lemma)

weights_bigrams_form = _weights_table(model_bigrams_form, list_bigrams_form, stat_bigrams_form)

weights_bigrams_lemma = _weights_table(model_bigrams_lemma, list_bigrams_lemma, stat_bigrams_lemma)

weights_bigrams_all_lemma = _weights_table(model_bigrams_all_lemma, list_bigrams_all_lemma, stat_bigrams_all_lemma)


# 6) Посмотрим на топы и антитопы получившихся данных

In [21]:
# Five largest weights for word forms.
weights_words_form.head(5)

Unnamed: 0,0,1
0,2.93021,"(('классная',), 249, 0, 228)"
1,2.689702,"(('красотка',), 644, 2, 541)"
2,2.321647,"(('молодец',), 333, 2, 303)"
3,2.309099,"(('красота',), 441, 3, 379)"
4,2.300137,"(('крутая',), 165, 1, 151)"


In [22]:
# Five smallest weights for word forms.
weights_words_form.tail(5)

Unnamed: 0,0,1
50657,-2.343322,"(('жалко',), 22, 30, 46)"
50658,-2.478386,"(('ужас',), 19, 24, 41)"
50659,-2.554006,"(('скучаю',), 73, 58, 86)"
50660,-2.751562,"(('скучать',), 20, 22, 28)"
50661,-2.851356,"(('заболела',), 6, 12, 17)"


In [23]:
# Five largest weights for word lemmas.
weights_words_lemma.head(5)

Unnamed: 0,0,1
0,2.18011,"(('красотка', 'NOUN'), 760, 6, 619)"
1,2.080134,"(('супер', 'ADJF'), 767, 5, 604)"
2,2.064284,"(('классный', 'ADJF'), 646, 6, 539)"
3,1.820746,"(('отличный', 'ADJF'), 289, 3, 207)"
4,1.814252,"(('клевый', 'ADJF'), 98, 0, 79)"


In [24]:
# Five smallest weights for word lemmas.
weights_words_lemma.tail(5)

Unnamed: 0,0,1
26873,-2.012344,"(('лёша', 'NOUN'), 13, 6, 13)"
26874,-2.120322,"(('жаль', 'PRED'), 43, 43, 77)"
26875,-2.185488,"(('ужас', 'NOUN'), 33, 26, 52)"
26876,-2.292022,"(('скучать', 'VERB'), 137, 85, 129)"
26877,-2.31569,"(('жалко', 'ADVB'), 22, 30, 46)"


In [25]:
# Five largest weights for form bigrams.
weights_bigrams_form.head(5)

Unnamed: 0,0,1
0,1.962376,"(('ADVB', 'рождения'), 540, 3, 439)"
1,1.920046,"(('NOUN', 'супер'), 161, 0, 143)"
2,1.82355,"(('супер', 'NOUN'), 178, 0, 142)"
3,1.799118,"(('спасибо', 'PREP'), 444, 5, 215)"
4,1.793088,"(('ADJF', 'девушка'), 149, 0, 125)"


In [26]:
# Five smallest weights for form bigrams.
weights_bigrams_form.tail(5)

Unnamed: 0,0,1
172884,-1.390189,"(('бедная', 'NOUN'), 4, 5, 8)"
172885,-1.417914,"(('NOUN', 'плакать'), 3, 6, 3)"
172886,-1.555725,"(('скучаю', 'PREP'), 26, 23, 37)"
172887,-1.784234,"(('NOUN', 'жалко'), 2, 10, 11)"
172888,-2.231088,"(('жаль', 'CONJ'), 25, 27, 51)"


In [27]:
# Five largest weights for lemma bigrams.
weights_bigrams_lemma.head(5)

Unnamed: 0,0,1
0,1.64095,"((('спасибо', 'INTJ'), 'PREP'), 444, 5, 215)"
1,1.585625,"((('отличный', 'ADJF'), 'NOUN'), 236, 3, 176)"
2,1.584459,"(('NOUN', ('супер', 'ADJF')), 161, 0, 143)"
3,1.513818,"((('супер', 'ADJF'), 'NOUN'), 178, 0, 142)"
4,1.458241,"((('спасибо', 'INTJ'), 'NOUN'), 350, 5, 135)"


In [28]:
# Five smallest weights for lemma bigrams.
weights_bigrams_lemma.tail(5)

Unnamed: 0,0,1
111656,-1.23904,"((('где', 'ADVB'), 'ADJF'), 34, 11, 39)"
111657,-1.344383,"((('скучать', 'VERB'), 'PREP'), 48, 35, 53)"
111658,-1.479731,"((('бедный', 'ADJF'), 'NOUN'), 18, 12, 26)"
111659,-1.771428,"((('жаль', 'PRED'), 'CONJ'), 25, 27, 51)"
111660,-1.819844,"(('NOUN', ('жалко', 'ADVB')), 2, 10, 11)"


In [30]:
# Five largest weights for lemma+lemma bigrams.
weights_bigrams_all_lemma.head(5)

Unnamed: 0,0,1
0,2.738311,"((('для', 'PREP'), ('вы', 'NPRO')), 101, 0, 33)"
1,2.385554,"((('спасибо', 'INTJ'), ('за', 'PREP')), 425, 5..."
2,2.22628,"((('хотеть', 'VERB'), ('чтоб', 'CONJ')), 16, 0..."
3,2.179971,"((('очень', 'ADVB'), ('идти', 'VERB')), 103, 0..."
4,2.149074,"((('как', 'CONJ'), ('красиво', 'ADVB')), 87, 0..."


In [31]:
# Five smallest weights for lemma+lemma bigrams.
weights_bigrams_all_lemma.tail(5)

Unnamed: 0,0,1
186988,-2.550515,"((('платье', 'NOUN'), ('но', 'CONJ')), 0, 5, 5)"
186989,-2.602826,"((('жаль', 'PRED'), ('что', 'CONJ')), 24, 27, 50)"
186990,-2.875362,"((('сегодня', 'ADVB'), ('бы', 'PRCL')), 0, 2, 1)"
186991,-3.023424,"((('признаваться', 'VERB'), ('у', 'PREP')), 0,..."
186992,-3.40504,"((('ангел', 'NOUN'), ('с', 'PREP')), 1, 1, 2)"


# 7) Запишем результаты

In [32]:
# Write every ranked weight table to its own comma-separated file, without the row index.
for weights_frame, target_path in [
    (weights_words_form, "words_form_data.txt"),
    (weights_words_lemma, "words_lemma_data.txt"),
    (weights_bigrams_form, "bigrams_form_data.txt"),
    (weights_bigrams_lemma, "bigrams_lemma_data.txt"),
    (weights_bigrams_all_lemma, "bigrams_lemma_all_data.txt"),
]:
    weights_frame.to_csv(target_path, sep=',', index=False)