# NLP

Classify Russian texts into several categories. It is best if the body of the texts is really large. To pre-process texts: normalization, lemmatization, etc. Compare embeddings. Try several classification methods.

## Import libs and load data

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import pymorphy2
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to /home/akimg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/akimg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from string import punctuation

def lemmatize(input_text):
    tokens = nltk.word_tokenize(input_text)
    normed_tokens = [morph.parse(s)[0].normal_form for s in tokens]
    
    # we also exclude stop words - all sorts of prepositions, conjunctions, etc.
    normed_tokens = [word for word in normed_tokens if word not in nltk.corpus.stopwords.words("russian")]
    normed_tokens = [word for word in normed_tokens if word not in nltk.corpus.stopwords.words("english")]
    
    # and punctuation marks
    normed_tokens = [word for word in normed_tokens if word not in punctuation]
    return ' '.join(normed_tokens)

In [3]:
import os
import pandas as pd

# prepare an empty data frame
df = pd.DataFrame(columns=['text', 'class'])

# these are folders in which files with texts
dir0 = "data/Bulgakov/"
dir1 = "data/Dostoevsky/"

In [4]:
# consider all our texts in a data frame indicating the class
for filename in os.listdir(dir0):
    with open(os.path.join(dir0, filename), encoding='utf8', errors='ignore') as file:
        contents = lemmatize(file.read())
    df = df.append(pd.Series({'text': contents, 'class': 0}), ignore_index=True)

In [5]:
# and for the second folder too
for filename in os.listdir(dir1):
    with open(os.path.join(dir1, filename), encoding='utf8', errors='ignore') as file:
        contents = lemmatize(file.read())
    df = df.append(pd.Series({'text': contents, 'class': 1}), ignore_index=True)

In [6]:
df

Unnamed: 0,text,class
0,принимать внимание ежедневно дарья пётр закупа...,0
1,царство небесный настоящий личность барский по...,0
2,выучить « главрыба » угол мохов « б » подбегат...,0
3,весь предмет помещаться маленький мраморный ст...,0
4,зимой пойти бить сапог бить кирпич ребро получ...,0
5,шарик начать учиться цвета лишь исполниться че...,0
6,далёкий идти пузатый двубокать дрянь неизвестн...,0
7,разрисовать райский цвета тарелка чёрный широк...,0
8,глаз менее день заливаться благодарный слеза а...,0
9,учиться читать совершенно мясо пахнуть верста ...,0


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.4, stratify=df['class'])

## Bag-of-Words-embedding

Of course, mathematical methods are not able to work with clear text. It is time to get embeddings!

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

bof_vect = CountVectorizer()
bof_vect.fit(np.hstack([X_train, X_test]))
bof_train = bof_vect.transform(X_train)
bof_test = bof_vect.transform(X_test)

In [9]:
bof_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
bof_train.toarray().shape

(20, 1051)

In [20]:
r_mean_bof = np.sum(bof_train[y_train.equals(0)], axis=0)
r_mean_bof.shape

(1, 1051)

In [21]:
l_mean_bof = np.sum(bof_train[y_train.equals(1)], axis=0)

In [22]:
from scipy.spatial.distance import cosine
bof_r = np.apply_along_axis(cosine, 1, bof_test.toarray(), v=r_mean_bof)
bof_l = np.apply_along_axis(cosine, 1, bof_test.toarray(), v=l_mean_bof)

bof_results = pd.DataFrame([
    bof_r,
    bof_l,
    np.maximum(bof_r, bof_l) == bof_l,
    y_test], index=["ruki", "leningrad", "predict", "class"]).T.astype(np.float)
bof_results

Unnamed: 0,ruki,leningrad,predict,class
0,0.95286,0.95286,1.0,0.0
1,0.963845,0.963845,1.0,1.0
2,0.879239,0.879239,1.0,1.0
3,0.965497,0.965497,1.0,1.0
4,0.942896,0.942896,1.0,0.0
5,0.93258,0.93258,1.0,1.0
6,0.958655,0.958655,1.0,0.0
7,0.979328,0.979328,1.0,1.0
8,1.0,1.0,1.0,0.0
9,0.947777,0.947777,1.0,1.0


In [23]:
print(X_test)

4     зимой пойти бить сапог бить кирпич ребро получ...
28    известно иван фёдор епанчин — человек образова...
32    лицо молодой человек приятный тонкий сухой бес...
26    генерал епанчин жить собственный свой дом неск...
11    брат пёс отведать изолировать проволока чистый...
31    чашка кофей выпиваться барышня ещё ранний ровн...
9     учиться читать совершенно мясо пахнуть верста ...
20    немой довольно широкий толстый плащ рукав огро...
2     выучить « главрыба » угол мохов « б » подбегат...
23    лёт генерал епанчин ещё говориться самый сок п...
29    везти карта играть чрезвычайно большой намерен...
6     далёкий идти пузатый двубокать дрянь неизвестн...
5     шарик начать учиться цвета лишь исполниться че...
8     глаз менее день заливаться благодарный слеза а...
Name: text, dtype: object


In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(bof_results['predict'], bof_results['class'])

0.5

In [25]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier().fit(bof_train.toarray(), y_train.tolist()).score(bof_test.toarray(), y_test.tolist())



0.5

## TF-IDF-embedding

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(np.hstack([X_train, X_test]))
tfidf_train = tfidf_vect.transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

In [28]:
tfidf_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
tfidf_train.toarray().shape

(20, 1051)

In [30]:
r_mean_tfidf = np.sum(tfidf_train[y_train.equals(0)], axis=0)
r_mean_tfidf.shape

(1, 1051)

In [31]:
l_mean_tfidf = np.sum(tfidf_train[y_train.equals(1)], axis=0)

In [32]:
tfidf_r = np.apply_along_axis(cosine, 1, tfidf_test.toarray(), v=r_mean_tfidf)
tfidf_l = np.apply_along_axis(cosine, 1, tfidf_test.toarray(), v=l_mean_tfidf)

tfidf_results = pd.DataFrame([
    tfidf_r,
    tfidf_l,
    np.maximum(tfidf_r, tfidf_l) == tfidf_l,
    y_test], index=["ruki", "leningrad", "predict", "class"]).T.astype(np.float)
tfidf_results

Unnamed: 0,ruki,leningrad,predict,class
0,0.989062,0.989062,1.0,0.0
1,0.975114,0.975114,1.0,1.0
2,0.959948,0.959948,1.0,1.0
3,0.976519,0.976519,1.0,1.0
4,0.972807,0.972807,1.0,0.0
5,0.946998,0.946998,1.0,1.0
6,0.978618,0.978618,1.0,0.0
7,0.98027,0.98027,1.0,1.0
8,1.0,1.0,1.0,0.0
9,0.980662,0.980662,1.0,1.0


In [33]:
accuracy_score(tfidf_results['predict'], tfidf_results['class'])

0.5

In [34]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier().fit(tfidf_train.toarray(), y_train.tolist()).score(tfidf_test.toarray(), y_test.tolist())



0.6428571428571429

## Word2vec-embedding

Since w2v is not a sklearn classifier, it will output data of a slightly different type at the output, and this will need to be taken into account in future work.

In [35]:
from gensim.models import Word2Vec

X_train_w2v = X_train.apply(str.split)
X_test_w2v = X_test.apply(str.split)
w2v_vect = Word2Vec(np.hstack([X_train_w2v, X_test_w2v]), size=40, min_count=5)

In [36]:
X_train_w2v

30    [правда, характер, весьма, часто, слушаться, п...
21    [особенно, приметный, это, лицо, мёртвый, блед...
14    [далёкий, идти, пузатый, двубокать, дрянь, неи...
16    [это, дело, любитель, –, весь, равно, калоша, ...
10    [полдень, угостить, колпак, кипяток, стемнеть,...
24    [весь, девица, епанчин, барышня, здоровый, цве...
3     [весь, предмет, помещаться, маленький, мраморн...
25    [кроме, чай, кофей, сыр, мёд, масло, особый, о...
19    [ведать, судья, происходить, душа, иван, фёдор...
15    [тело, изломанный, битый, надругаться, человек...
22    [ещё, петербургский, уезд, какой-то, фабрика, ...
7     [разрисовать, райский, цвета, тарелка, чёрный,...
12    [играть, гармоника, пахнуть, сосиска, буква, б...
0     [принимать, внимание, ежедневно, дарья, пётр, ...
13    [играть, гармоника, пахнуть, сосиска, буква, б...
1     [царство, небесный, настоящий, личность, барск...
17    [хотя, ещё, накануне, предчувствовать, именно,...
27    [вагон, третье, класс, рассвет, очутиться,

You can do various interesting things with Word-Tu-Century. For example, with the following command, we can display the words that turned out to be closest in value to the given word in the training set.

In [37]:
w2v_vect.most_similar(positive="пёс")

  """Entry point for launching an IPython kernel.


[('широкий', 0.30680638551712036),
 ('–', 0.20665977895259857),
 ('половина', 0.20610211789608002),
 ('друг', 0.20218273997306824),
 ('сыр', 0.1472751349210739),
 ('весь', 0.1371825486421585),
 ('свой', 0.12968699634075165),
 ('первое', 0.12856897711753845),
 ('случай', 0.09170875698328018),
 ('правда', 0.07748904079198837)]

In [38]:
w2v_vect.most_similar(negative=["пёс"])

  """Entry point for launching an IPython kernel.


[('самый', 0.3904285132884979),
 ('слово', 0.21938388049602509),
 ('это', 0.15510502457618713),
 ('лицо', 0.1491074562072754),
 ('хвост', 0.13058458268642426),
 ('оба', 0.12605684995651245),
 ('кроме', 0.11542735993862152),
 ('—', 0.1143377274274826),
 ('епанчин', 0.09248550236225128),
 ('«', 0.07219742983579636)]

Преобразуем тексты песен в вектора - возьмем сумму векторов всех слов, которые входят в песню

In [39]:
def text2vec(text):
    # We average the word vectors
    vecs = []
    for word in text:
        try:
            vecs.append(w2v_vect[word])
        except KeyError:
            pass
    return np.sum(vecs, axis=0) / len(vecs)

w2v_train = X_train_w2v.apply(text2vec)
w2v_test = X_test_w2v.apply(text2vec)
w2v_train

  


30    [0.0020051587, -0.0006808024, 0.0023217082, -0...
21    [0.0036189307, 0.0026274037, 0.0017450843, -0....
14    [0.00062815216, 0.005071218, 0.0028173197, -0....
16    [0.0027219453, 0.00015778514, 0.0013914456, 0....
10    [0.0043437425, 0.0029992529, 0.0031090537, 0.0...
24    [-8.7817316e-05, -0.00048067002, 0.005580824, ...
3     [0.0052567385, 0.00019174663, 0.0018956842, 0....
25    [0.0062761335, 0.0032825968, 0.002430827, 0.00...
19    [-0.00049060123, -0.00036905013, -0.0015041415...
15    [0.006450089, -0.0022294924, 1.9380124e-05, -0...
22    [0.0025990412, 0.0011180022, 0.0010021958, 0.0...
7     [0.0006040912, -0.0011425489, 0.0032773723, -0...
12    [0.0056627505, 0.0015902696, 0.0035911298, -0....
0     [0.0052800537, 0.006135592, 0.004443416, -0.00...
13    [0.0056627505, 0.0015902696, 0.0035911298, -0....
1     [0.005046911, 0.0034308673, 0.0050926856, 0.00...
17    [0.0021758946, 0.0028172277, 0.0040840674, 0.0...
27    [0.0027608175, -0.0027356942, 0.0031221774

In [40]:
w2v_train.shape

(20,)

In [41]:
w2v_train[0]

array([ 5.2800537e-03,  6.1355918e-03,  4.4434159e-03, -2.3759922e-03,
       -2.8852476e-03,  1.0198847e-03, -2.5366426e-03, -3.2801889e-03,
       -3.7500265e-03, -3.7477454e-03,  3.3762045e-03, -2.3114053e-03,
       -3.6823575e-04,  7.1259780e-04, -3.7706990e-03, -1.6813326e-03,
        1.1189969e-03,  2.8899021e-03,  5.2827261e-03, -1.0071065e-03,
        2.5704191e-03,  4.6943624e-05, -1.1923875e-03,  2.1157840e-03,
        1.7035749e-03, -6.2530213e-03, -1.7998546e-03, -4.3732221e-03,
        7.8334320e-05, -2.2844190e-03, -2.2270882e-03, -4.1759899e-04,
        1.4266969e-03,  3.1164780e-03, -6.9710556e-03,  3.7443030e-03,
       -4.6109534e-03, -2.2606216e-03, -3.0501306e-03,  3.4615404e-03],
      dtype=float32)

In [42]:
w2v_train = np.dstack(w2v_train)[0]
w2v_train.shape

(40, 20)

In [43]:
w2v_test = np.dstack(w2v_test)[0]

In [44]:
r_mean_w2v = np.sum(w2v_train[:, y_train == 0], axis=1)
r_mean_w2v.shape

(40,)

In [45]:
l_mean_w2v = np.sum(w2v_train[:, y_train == 1], axis=1)

In [47]:
r_mean_w2v

array([ 0.04165722,  0.01779496,  0.02922862, -0.00677199, -0.01690252,
        0.00222621, -0.00319195, -0.04988823,  0.00041088, -0.01605525,
       -0.00628722, -0.00379488, -0.01941233, -0.01479384, -0.00252264,
       -0.03452507, -0.00341374,  0.00763343,  0.02131335, -0.02114879,
       -0.01616263,  0.00354776, -0.00084967, -0.00097331,  0.01461923,
       -0.0175407 , -0.00566027, -0.01091546, -0.00244835, -0.01284909,
        0.00571721,  0.00596794, -0.0139887 , -0.00713897, -0.01410007,
        0.00848586, -0.03263284,  0.00109359,  0.00618847,  0.02020013],
      dtype=float32)

In [48]:
l_mean_w2v

array([ 0.03639153,  0.00811981,  0.02334082,  0.00578581, -0.01950352,
       -0.0032452 ,  0.02410904, -0.01117448,  0.01246825, -0.00154648,
       -0.01920532,  0.00265025,  0.00625496, -0.00479666, -0.00172858,
       -0.01270933,  0.0049363 ,  0.01996669,  0.0239169 , -0.01430546,
       -0.0011006 ,  0.00490209, -0.00260028, -0.00401581,  0.01536079,
        0.01576239,  0.01426886, -0.0356225 , -0.02328332, -0.00462142,
        0.01079347, -0.03714938, -0.00253065, -0.01166139,  0.00425046,
       -0.01020377, -0.02850923,  0.00339461,  0.00663321, -0.00440668],
      dtype=float32)

In [49]:
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cosine


w2v_r = np.apply_along_axis(cosine, 0, w2v_test, v=r_mean_w2v)
w2v_l = np.apply_along_axis(cosine, 0, w2v_test, v=l_mean_w2v)

w2v_results = pd.DataFrame([
    w2v_r,
    w2v_l,
    np.maximum(w2v_r, w2v_l) == w2v_l,
    y_test], index=["r", "l", "predict", "class"]).T.astype(np.float)
w2v_results

Unnamed: 0,r,l,predict,class
0,0.724402,0.629545,0.0,0.0
1,0.828942,0.425812,0.0,1.0
2,0.785144,0.468837,0.0,1.0
3,0.786409,0.687019,0.0,1.0
4,0.328945,0.66129,1.0,0.0
5,0.712283,0.551921,0.0,1.0
6,0.591634,0.795179,1.0,0.0
7,0.787078,0.98796,1.0,1.0
8,0.650896,0.951145,1.0,0.0
9,0.790382,0.366508,0.0,1.0


In [50]:
accuracy_score(w2v_results['predict'], w2v_results['class'])

0.21428571428571427

In [51]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier().fit(w2v_train.T, y_train.tolist()).score(w2v_test.T, y_test.tolist())



0.8571428571428571