# Введение в обработку естественного языка

## Урок 2. Создание признакового пространства

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

## Данные Twitter

In [2]:
# Gzipped pickle with the tweets table; the path is relative to the notebook's working directory.
TWITTER_PROCESSED_PATH = "../data/tweets.pkl.gz"

In [3]:
# Load the tweets DataFrame from the pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
twitter_df = pd.read_pickle(TWITTER_PROCESSED_PATH)
# Preview the last three rows.
twitter_df.tail(3)

Unnamed: 0,id,label,tweet,text,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...,hillary campaigned today in ohio omg amp used ...,"[hillary, campaigned, today, in, ohio, omg, am...","[hillary, campaigned, today, ohio, omg, amp, u...","[hillari, campaign, today, ohio, omg, amp, use...","[hillary, campaigned, today, ohio, omg, amp, u..."
49157,49158,,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead...","[happi, work, confer, right, mindset, lead, cu...","[happy, work, conference, right, mindset, lead..."
49158,49159,,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download shoegaze newmusi...,"[my, song, so, glad, free, download, shoegaze,...","[song, glad, free, download, shoegaze, newmusi...","[song, glad, free, download, shoegaz, newmus, ...","[song, glad, free, download, shoegaze, newmusi..."


### Задание 1. Создать BoW с помощью CountVectorizer

Применим векторайзер к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
-	Игнорируем слова, документная частота которых (доля документов корпуса, содержащих слово) строго превышает порог 0.9, с помощью max_df.
-	Ограничим количество слов, попадающих в мешок, с помощью max_features = 1000.
-	Исключим стоп-слова с помощью stop_words='english'. 
-	Отобразим Bag-of-Words модель как DataFrame. Названия столбцов (columns) необходимо извлечь с помощью CountVectorizer.get_feature_names() (в scikit-learn ≥ 1.0 — get_feature_names_out()).

In [4]:
def _identity(tokens):
    """Return the input unchanged (the documents are already lists of tokens)."""
    return tokens


# The documents are pre-tokenized lists, so the vectorizer's own preprocessing
# and tokenization steps are replaced with pass-throughs. A named function is
# used instead of lambdas so that the fitted vectorizer stays picklable.
count_vectorizer = CountVectorizer(
    max_features=1000,      # keep only the 1000 most frequent terms
    ngram_range=(1, 1),
    analyzer='word',
    binary=False,
    preprocessor=_identity,
    tokenizer=_identity,
    token_pattern=None,     # unused with a custom tokenizer; None avoids the warning
    max_df=0.9,             # drop terms that occur in more than 90% of the documents
    # stop_words='english'  # already removed at the preprocessing stage
)

In [5]:
# Fit the bag-of-words model on the stemmed token lists.
documents_stemmed = twitter_df['tweet_stemmed']
bow_stemmed = count_vectorizer.fit_transform(documents_stemmed)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that do not have it yet.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    stemmed_vocabulary = count_vectorizer.get_feature_names_out()
else:
    stemmed_vocabulary = count_vectorizer.get_feature_names()

# Dense view of the term counts, one column per vocabulary term.
words_stemmed = pd.DataFrame(bow_stemmed.toarray(), columns=stemmed_vocabulary)
words_stemmed.head(5)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yet,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Refit the same vectorizer on the lemmatized token lists
# (this replaces the vocabulary learned from the stemmed documents).
documents_lemmatized = twitter_df['tweet_lemmatized']
bow_lemmatized = count_vectorizer.fit_transform(documents_lemmatized)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that do not have it yet.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    lemmatized_vocabulary = count_vectorizer.get_feature_names_out()
else:
    lemmatized_vocabulary = count_vectorizer.get_feature_names()

# Dense view of the term counts, one column per vocabulary term.
words_lemmatized = pd.DataFrame(bow_lemmatized.toarray(), columns=lemmatized_vocabulary)
words_lemmatized.head(5)

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,yes,yesterday,yet,yo,yoga,york,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Задание 2. Создать BoW с помощью TfidfVectorizer

In [7]:
# Same as task 1, but for variety first rebuild plain-text documents from the
# token lists. The source columns are read from twitter_df (not from the
# variables being reassigned) so the cell is idempotent: re-running it must not
# join the characters of strings that were already joined.
documents_stemmed = twitter_df['tweet_stemmed'].apply(' '.join)
documents_lemmatized = twitter_df['tweet_lemmatized'].apply(' '.join)

In [8]:
# TF-IDF vectorizer for plain-text documents, configured like the
# bag-of-words model from task 1 plus English stop-word filtering.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
    max_features=1000,
)

In [9]:
# Fit the TF-IDF model on the stemmed documents.
tfidf_stemmed = tfidf_vectorizer.fit_transform(documents_stemmed)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that do not have it yet.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    stemmed_vocabulary = tfidf_vectorizer.get_feature_names_out()
else:
    stemmed_vocabulary = tfidf_vectorizer.get_feature_names()

# Dense view of the TF-IDF weights, one column per vocabulary term.
words_stemmed = pd.DataFrame(tfidf_stemmed.toarray(), columns=stemmed_vocabulary)
words_stemmed.head(5)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Refit the same TF-IDF vectorizer on the lemmatized documents
# (this replaces the vocabulary learned from the stemmed documents).
tfidf_lemmatized = tfidf_vectorizer.fit_transform(documents_lemmatized)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that do not have it yet.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    lemmatized_vocabulary = tfidf_vectorizer.get_feature_names_out()
else:
    lemmatized_vocabulary = tfidf_vectorizer.get_feature_names()

# Dense view of the TF-IDF weights, one column per vocabulary term.
words_lemmatized = pd.DataFrame(tfidf_lemmatized.toarray(), columns=lemmatized_vocabulary)
words_lemmatized.head(5)

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Задание 3.

3. Проверьте ваши векторайзеры на корпусе, который использовали на вебинаре. Составьте таблицу «метод векторизации — полученный скор» (в методах векторизации поизменяйте параметры, чтобы добиться лучшего скора). Обратите внимание, как падает/растёт скор при уменьшении количества фичей и изменении параметров. Также попробуйте применить к векторайзерам PCA для сокращения размерности, посмотрите на качество и сделайте выводы.

**Примечание:** не совсем понятно, о каком корпусе идет речь, но раз продолжаем работать с базой твитов, то в качестве корпуса используем train-часть подготовленной в результате практической работы №1 таблицы twitter_df.

В качестве PCA используем TruncatedSVD, так как чистый PCA не работает с разреженными (sparse) матрицами.

In [11]:
# Only rows with a non-missing label are used for training and validation.
has_label = twitter_df.label.notna()
documents = documents_lemmatized[has_label]
labels = twitter_df.label[has_label].astype(int)

# Stratified, shuffled split with a fixed seed.
train_x, valid_x, train_y, valid_y = train_test_split(
    documents,
    labels,
    stratify=labels,
    shuffle=True,
    random_state=100,
)

In [12]:
def get_performance(y_true, y_pred):
    """Score predictions against the true labels.

    Returns a tuple ``(accuracy, balanced accuracy, F1)``.
    """
    scorers = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [13]:
# Start the results registry with a baseline that predicts zero for every sample.
baseline_predictions = np.zeros(valid_y.size)
model_performance = {
    'Baseline: all zeros': get_performance(valid_y, baseline_predictions),
}

In [14]:
# TruncatedSVD uses a randomized solver by default, so it is seeded to keep the
# results table reproducible across runs (same seed as the train/valid split).
SVD_RANDOM_STATE = 100

# Pipelines to compare, keyed by the description shown in the results table.
# Every pipeline is a text vectorizer (optionally followed by a dimensionality
# reduction step) feeding a logistic regression classifier.
model_grid = {
    # TF-IDF with a varying vocabulary size
    'TF-IDF': make_pipeline(
        TfidfVectorizer(), LogisticRegression()
    ),
    'TF-IDF (100 features)': make_pipeline(
        TfidfVectorizer(max_features=100), LogisticRegression()
    ),
    'TF-IDF (1000 features)': make_pipeline(
        TfidfVectorizer(max_features=1000), LogisticRegression()
    ),
    'TF-IDF (5000 features)': make_pipeline(
        TfidfVectorizer(max_features=5000), LogisticRegression()
    ),
    'TF-IDF (10000 features)': make_pipeline(
        TfidfVectorizer(max_features=10000), LogisticRegression()
    ),
    # Raw term counts with a varying vocabulary size
    'Count Vectorizer': make_pipeline(
        CountVectorizer(), LogisticRegression()
    ),
    'Count Vectorizer (1000 features)': make_pipeline(
        CountVectorizer(max_features=1000), LogisticRegression()
    ),
    'Count Vectorizer (5000 features)': make_pipeline(
        CountVectorizer(max_features=5000), LogisticRegression()
    ),
    'Count Vectorizer (10000 features)': make_pipeline(
        CountVectorizer(max_features=10000), LogisticRegression()
    ),
    'Count Vectorizer (15000 features)': make_pipeline(
        CountVectorizer(max_features=15000), LogisticRegression()
    ),
    # Term counts reduced with TruncatedSVD (works on sparse input)
    'Count Vectorizer + TruncatedSVD(150)': make_pipeline(
        CountVectorizer(),
        TruncatedSVD(150, random_state=SVD_RANDOM_STATE),
        LogisticRegression()
    ),
    'Count Vectorizer + TruncatedSVD(500)': make_pipeline(
        CountVectorizer(),
        TruncatedSVD(500, random_state=SVD_RANDOM_STATE),
        LogisticRegression()
    ),
    # Other vectorizer variations
    'Count Vectorizer (ngrams: 1,2)': make_pipeline(
        CountVectorizer(ngram_range=(1, 2)), LogisticRegression()
    ),
    'Count Vectorizer, binary': make_pipeline(
        CountVectorizer(binary=True), LogisticRegression()
    ),
}

In [15]:
%%time
for model_desc, model in model_grid.items():
    if model_desc not in model_performance:
        model.fit(train_x, train_y)
        predictions = model.predict(valid_x)
        model_performance[model_desc] = get_performance(valid_y, predictions)

Wall time: 47.6 s


In [16]:
# One row per evaluated model, one column per metric, best F1 first.
metric_names = ['accuracy', 'balanced accuracy', 'F1']
performance_table = pd.DataFrame(model_performance).T.set_axis(metric_names, axis='columns')
performance_table.sort_values('F1', ascending=False)

Unnamed: 0,accuracy,balanced accuracy,F1
"Count Vectorizer, binary",0.961081,0.754127,0.64938
Count Vectorizer,0.960706,0.753925,0.647191
Count Vectorizer (15000 features),0.96033,0.750428,0.641808
Count Vectorizer (10000 features),0.960205,0.75036,0.641084
"Count Vectorizer (ngrams: 1,2)",0.960706,0.744038,0.637413
Count Vectorizer (5000 features),0.958829,0.74138,0.624857
TF-IDF (5000 features),0.952321,0.672787,0.505837
TF-IDF (10000 features),0.950444,0.661066,0.478947
Count Vectorizer + TruncatedSVD(500),0.947816,0.666245,0.476788
Count Vectorizer (1000 features),0.94669,0.659871,0.462121


Выводы:
- для данной задачи CountVectorizer сработал лучше, чем TF-IDF
- увеличение количества признаков идет на пользу CountVectorizer, а в случае TF-IDF - до определенного предела (нужно подбирать)
- интересно, что бинарный CountVectorizer (binary=True) сработал чуть лучше, чем вариант со счетчиком
- применение PCA (точнее, TruncatedSVD) ухудшило метрику
- включение n-грамм (ngram_range=(1, 2)) в данной задаче эффекта не принесло