# Text classification using CNN

Берем отзывы за лето (из архива с материалами или предыдущего занятия)

Учим conv сеть для классификации
Рассмотреть 2-а варианта сеточек:
Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)
Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/
Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше

**rusvectores works with ukranian vocab nowadays. So let's try to work with the same 5-class feedback data through the classification models, given in the lesson**

In [1]:
from sklearn.metrics import *
from sklearn.model_selection import train_test_split

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_excel("Otzyvy.xls")

In [4]:
df.head(2)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20659 entries, 0 to 20658
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Rating   20659 non-null  int64 
 1   Content  20656 non-null  object
 2   Date     20659 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.3+ KB


In [6]:
# checking classes
df['Rating'].value_counts()

Rating
5    14586
1     2276
4     2138
3      911
2      748
Name: count, dtype: int64

### Предобработка

In [7]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [8]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)


In [9]:
df['processed'] = df['Content'].apply(preprocess_text)

In [10]:
df.head(10)

Unnamed: 0,Rating,Content,Date,processed
0,5,It just works!,2017-08-14,it just works
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложениеиз минус хотеть боль...
2,5,Отлично все,2017-08-14,отлично
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,зависать 1 работа антивирус ранее пользоваться...
4,5,"Очень удобно, работает быстро.",2017-08-14,удобно работать быстро
5,5,Всё удобно норм 👍👍👍,2017-08-14,удобно норма 👍👍👍
6,5,Очень удобное приложение.,2017-08-14,удобный приложение
7,5,Все устраивает,2017-08-14,устраивать
8,5,У меня работает все четко. В отличии от банком...,2017-08-14,работать чётко отличие банкомат вечно зависать...
9,5,Очень все хорошо👍,2017-08-14,хорошо👍


In [11]:
# the longest tweet length
max(len(df['processed'].iloc[x].split()) for x in range(len(df['processed'])))

132

In [12]:
# check percentiles higher than 99 to find an optimum number of tokens per tweet
np.percentile([len(df['processed'].iloc[x].split()) for x in range(len(df['processed']))], 99.4)

40.0

In [13]:
max_words = 500
max_len = 40
num_classes = 6

# Training
epochs = 15
batch_size = 512
print_batch_n = 100

In [14]:
df_tweets = df['processed']
df_labels = df['Rating']

In [15]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(" ".join(df_tweets))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Armik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Отфильтруем данные

и соберём в корпус N наиболее частых токенов

In [16]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [17]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [18]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'отличный',
 'телефон',
 'супер']

In [19]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [20]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [21]:
x_train, x_test, y_train, y_test = train_test_split(df_tweets, df_labels)

In [22]:
x_train_seq = np.asarray([text_to_sequence(text, max_len) for text in x_train], dtype=np.int32)
x_test_seq = np.asarray([text_to_sequence(text, max_len) for text in x_test], dtype=np.int32)

In [23]:
x_train_seq.shape

(15494, 40)

In [24]:
max_len

40

In [25]:
x_train_seq[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0, 338, 357, 163, 111,   3, 412,  27,
        35])

# Keras model

In [26]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, Conv2D
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.callbacks import TensorBoard 
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [27]:
y_train.value_counts()

Rating
5    10942
1     1720
4     1595
3      688
2      549
Name: count, dtype: int64

In [28]:
y_test.value_counts()

Rating
5    3644
1     556
4     543
3     223
2     199
Name: count, dtype: int64

In [29]:
y_train_categ = keras.utils.to_categorical(y_train-1, num_classes)
y_test_categ = keras.utils.to_categorical(y_test-1, num_classes)

In [30]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [31]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [32]:
x_train_seq

array([[  0,   0,   0, ...,   0,   4,   1],
       [  0,   0,   0, ..., 412,  27,  35],
       [  0,   0,   0, ...,   0,   5,   3],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   8, 303,   2],
       [  0,   0,   0, ...,  63,  55,  77]])

In [33]:
y_train_categ

array([[0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [34]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train_seq, y_train_categ,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15


In [35]:
score = model.evaluate(x_test_seq, y_test_categ, batch_size=batch_size, verbose=1)
print('\n')
print(score)



[0.6555774211883545, 0.7699903249740601]


In [36]:
results = model.predict(x_test_seq, batch_size=batch_size, verbose=1)



In [37]:
test_classes = [np.argmax(x)+1 for x in results]

In [38]:
df_test_classes = pd.DataFrame({'true': y_test.values, 'pred': test_classes}, index=y_test.index, columns=['true', 'pred'])

In [39]:
def extracting_classes(results):
    classes = [np.argmax(x) for x in results]
    return classes

In [40]:
df_test_classes['accuracy_tag'] = 0
df_test_classes.loc[df_test_classes['true']==df_test_classes['pred'], ['accuracy_tag']] = 1

In [41]:
def get_accuracy(df, class_name):
    accuracy_tags = df.loc[(df['true']==class_name), ['accuracy_tag']].value_counts()
    false_tags = df.loc[(df['true']==class_name), ['pred']].value_counts()
    accuracy_percent = accuracy_tags[1] / (accuracy_tags[1]+accuracy_tags[0])
    return false_tags, accuracy_percent

In [42]:
df_test_classes['pred'].value_counts().index

Index([5, 1, 4, 3, 2], dtype='int64', name='pred')

In [43]:
for class_name in df_test_classes['pred'].value_counts().index:
    false_tags, g_a = get_accuracy(df_test_classes, class_name)
    print(f'Accuracy for {class_name} class is {g_a} %')
    if g_a <= 0.4:
        print(f'Number of predicted class tags for {class_name} class: \n {false_tags}')

Accuracy for 5 class is 0.9401756311745335 %
Accuracy for 1 class is 0.7913669064748201 %
Accuracy for 4 class is 0.16574585635359115 %
Number of predicted class tags for 4 class: 
 pred
5       334
1        96
4        90
3        23
Name: count, dtype: int64
Accuracy for 3 class is 0.08968609865470852 %
Number of predicted class tags for 3 class: 
 pred
1       107
4        56
5        40
3        20
Name: count, dtype: int64
Accuracy for 2 class is 0.005025125628140704 %
Number of predicted class tags for 2 class: 
 pred
1       124
5        33
4        22
3        19
2         1
Name: count, dtype: int64


**Result:**
*not bad, if we take into consideration the distance of true tags from the falsely predicted class tags  
For class 4 for the most frequent predicted class by far is 5, for 2 is 1.   
The worst prediction quality is for class with the tag 3.   
The classes originally are very much unbalanced so that might be the reason*

**TFidf Vectorizer with Logistic regression**

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score 

In [45]:
vect = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', lowercase=False)

In [46]:
x_train

13372                                   удобный приложение
17481    samsung galaxy note 4 плохо работать pay писат...
7183                                      отлично работать
14546                       отличный приложениемень удобно
2209     раскрасить мера заходить приложение банк нафиг...
                               ...                        
20534    последний грузиться программаа таква целое нор...
20588    бесполезный приложениедаженя открываетсясвобод...
5170                                                   отл
5523                   отличный предложение удобно вовсема
14297    обновление приложение выдать ошибкучтый устано...
Name: processed, Length: 15494, dtype: object

In [47]:
train_ft = vect.fit_transform(x_train)
test_ft = vect.transform(x_test)

In [48]:
lgr = LogisticRegression()

In [49]:
lgr.fit(train_ft, y_train.to_numpy())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
y_pred = lgr.predict(test_ft)

In [51]:
accuracy_score(y_test.to_numpy(), y_pred)

0.7686350435624395

In [52]:
tfidf_test_classes = pd.DataFrame({'true': y_test.values, 'pred': y_pred}, index=y_test.index, columns=['true', 'pred'])

In [53]:
tfidf_test_classes['accuracy_tag'] = 0
tfidf_test_classes.loc[tfidf_test_classes['true']==tfidf_test_classes['pred'], ['accuracy_tag']] = 1

In [54]:
tfidf_test_classes.head()

Unnamed: 0,true,pred,accuracy_tag
5575,5,5,1
8892,1,5,0
3074,5,5,1
11922,5,5,1
4082,5,5,1


In [55]:
for class_name in tfidf_test_classes['true'].value_counts().index:
    false_tags, g_a = get_accuracy(tfidf_test_classes, class_name)
    print(f'Accuracy for {class_name} class is {g_a} %')
    if g_a <= 0.4:
        print(f'Number of predicted class tags for {class_name} class: \n {false_tags}')

Accuracy for 5 class is 0.960482985729967 %
Accuracy for 1 class is 0.7068345323741008 %
Accuracy for 4 class is 0.12154696132596685 %
Number of predicted class tags for 4 class: 
 pred
5       393
1        69
4        66
3        15
Name: count, dtype: int64
Accuracy for 3 class is 0.04035874439461883 %
Number of predicted class tags for 3 class: 
 pred
5       88
1       82
4       44
3        9
Name: count, dtype: int64
Accuracy for 2 class is 0.010050251256281407 %
Number of predicted class tags for 2 class: 
 pred
1       116
5        58
4        13
3        10
2         2
Name: count, dtype: int64


In [56]:
from gensim.models import Word2Vec

In [57]:
modelW2V = Word2Vec(sentences=x_train.apply(str.split), vector_size=100, window=5, min_count=5, workers=8)

In [58]:
modelW2V.wv.key_to_index

{'приложение': 0,
 'удобно': 1,
 'работать': 2,
 'удобный': 3,
 'отлично': 4,
 'нравиться': 5,
 'отличный': 6,
 'хороший': 7,
 'телефон': 8,
 'супер': 9,
 'быстро': 10,
 'обновление': 11,
 'мочь': 12,
 'пароль': 13,
 'антивирус': 14,
 'пользоваться': 15,
 'банк': 16,
 'устраивать': 17,
 'сбербанк': 18,
 'вход': 19,
 'раз': 20,
 'карта': 21,
 'прошивка': 22,
 'проблема': 23,
 'рута': 24,
 'перевод': 25,
 'сделать': 26,
 'программа': 27,
 'разработчик': 28,
 'счёт': 29,
 'писать': 30,
 'норма': 31,
 'деньга': 32,
 'приходиться': 33,
 'вводить': 34,
 'ошибка': 35,
 'нормально': 36,
 'постоянно': 37,
 'платёж': 38,
 'довольный': 39,
 'около': 40,
 'исправить': 41,
 'понятно': 42,
 'смс': 43,
 'шаблон': 44,
 'код': 45,
 'свой': 46,
 'вылетать': 47,
 'зайти': 48,
 'функция': 49,
 'стать': 50,
 'последний': 51,
 'право': 52,
 'мобильный': 53,
 'возможность': 54,
 'иня': 55,
 'приходить': 56,
 'делать': 57,
 'заходить': 58,
 'установить': 59,
 'класс': 60,
 'проверка': 61,
 'работа': 62,
 'опл

In [59]:
vect_idf = TfidfVectorizer()
vect_idf.fit_transform(x_train)
tfidf = dict(zip(vect_idf.get_feature_names_out(), vect_idf.idf_))

In [60]:
tfidf

{'00': 9.955125489661903,
 '000': 9.955125489661903,
 '000р': 9.955125489661903,
 '005': 9.955125489661903,
 '01': 9.955125489661903,
 '01072017ич': 9.955125489661903,
 '05': 9.549660381553739,
 '07': 9.955125489661903,
 '07082017года': 9.955125489661903,
 '0писать': 9.955125489661903,
 '0тлично': 9.955125489661903,
 '0хотеть': 9.955125489661903,
 '10': 6.736249664793703,
 '100': 7.214285465736702,
 '1000': 9.261978309101957,
 '10000': 9.955125489661903,
 '100500': 9.955125489661903,
 '100817': 9.955125489661903,
 '100иня': 9.955125489661903,
 '100ич': 9.955125489661903,
 '100раз': 9.955125489661903,
 '100шт': 9.955125489661903,
 '101': 9.955125489661903,
 '1010': 9.038834757787749,
 '1015': 9.955125489661903,
 '1015м': 9.955125489661903,
 '1015ь': 9.955125489661903,
 '108ич': 9.955125489661903,
 '10го': 9.955125489661903,
 '10гоести': 9.955125489661903,
 '10копейка': 9.955125489661903,
 '10раз': 9.955125489661903,
 '11062017': 9.955125489661903,
 '11111': 9.955125489661903,
 '11тр': 9

In [61]:
rt = vect_idf.vocabulary_.items()

In [62]:
tfidf['антивирус']

4.5345904903896175

In [63]:
vect_idf.idf_[vect_idf.vocabulary_['антивирус']]

4.5345904903896175

In [64]:
len(tfidf)

10943

In [65]:
from collections import defaultdict

In [66]:
max_idf = max(vect_idf.idf_)

word2weight = defaultdict(
    lambda: max_idf,
    [(w, vect_idf.idf_[i]) for w, i in vect_idf.vocabulary_.items()])

In [67]:
def get_vect_mean(txt):
    vector_w2v = np.zeros(100)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in word2weight:
            vector_w2v += word2weight[wrd]*1
            n_w2v += 1
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

def get_vect_idf(txt):
    vector_w2v = np.zeros(100)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in word2weight:
            iddf_ = tfidf.get(wrd, 1.)
            vector_w2v += word2weight[wrd]*iddf_
            n_w2v += iddf_
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

In [68]:
from tqdm.notebook import tqdm

In [69]:
arr_vect = []
for txt in tqdm(x_train):
    arr_vect.append(get_vect_mean(txt))
    
arr_vect_test = []
for txt in tqdm(x_test):
    arr_vect_test.append(get_vect_mean(txt))
    
train_w2v = np.asarray(arr_vect)    
test_w2v = np.asarray(arr_vect_test)

  0%|          | 0/15494 [00:00<?, ?it/s]

  0%|          | 0/5165 [00:00<?, ?it/s]

In [70]:
len(train_w2v)

15494

In [71]:
lgr_w2v = LogisticRegression()

In [72]:
lgr_w2v.fit(train_w2v, y_train.to_numpy())

In [73]:
y_pred = lgr_w2v.predict(test_w2v)

In [74]:
accuracy_score(y_test.to_numpy(), y_pred)

0.6964181994191675

In [75]:
arr_vect = []
for txt in tqdm(x_train):
    arr_vect.append(get_vect_idf(txt))
    
arr_vect_test = []
for txt in tqdm(x_test):
    arr_vect_test.append(get_vect_idf(txt))
    
train_w2v = np.asarray(arr_vect)    
test_w2v = np.asarray(arr_vect_test)

  0%|          | 0/15494 [00:00<?, ?it/s]

  0%|          | 0/5165 [00:00<?, ?it/s]

In [76]:
lgr_w2v = LogisticRegression()
lgr_w2v.fit(train_w2v, y_train.to_numpy())
y_pred = lgr_w2v.predict(test_w2v)

In [77]:
accuracy_score(y_test.to_numpy(), y_pred)

0.6985479186834462