# Урок 2. Создание признакового пространства

- Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
Исключим стоп-слова с помощью stop_words='english'. 
Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью CountVectorizer.get_feature_names().

- Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
Исключим стоп-слова с помощью stop_words='english'.
Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().

- Проверьте ваши векторайзеры на корпусе который использовали на вебинаре, составьте таблицу метод векторизации и скор который вы получили (в методах векторизации по изменяйте параметры что бы добиться лучшего скора) обратите внимание как падает/растёт скор при уменьшении количества фичей, и изменении параметров, так же попробуйте применить к векторайзерам PCA для сокращения размерности посмотрите на качество сделайте выводы


In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_pickle('df.pkl')
df.head()

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [3]:
corpus_stemmed = df['tweet_stemmed'].apply(lambda x: ' '.join(x)).to_list()
corpus_lemmatized = df['tweet_lemmatized'].apply(lambda x: ' '.join(x)).to_list()

## Задание 1: CountVectorizer

In [4]:
def create_count_vectorizer(**kwargs) -> pd.DataFrame:
    count_vectorizer = CountVectorizer(
        # ngram_range=[1,3],
        max_df=0.9,
        max_features=1000,
        stop_words='english',
        binary=False,
        lowercase=False,
        tokenizer=str.split,
        **kwargs
    )

    return count_vectorizer

In [5]:
count_vect_stemmed = create_count_vectorizer()

df_stemmed = pd.DataFrame(
    count_vect_stemmed.fit_transform(corpus_stemmed).toarray(),
    columns=count_vect_stemmed.get_feature_names()
)
df_stemmed.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
count_vect_lemmatized = create_count_vectorizer()

df_lemmatized = pd.DataFrame(
    count_vect_lemmatized.fit_transform(corpus_lemmatized).toarray(),
    columns=count_vect_lemmatized.get_feature_names()
)
df_lemmatized.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Задание 2: TfidfVectorizer

In [7]:
def create_tfidf_vectorizer(**kwargs) -> pd.DataFrame:
    tfidf_vectorizer = TfidfVectorizer(
        # ngram_range=[1,3],
        max_df=0.9,
        max_features=1000,
        stop_words='english',
        binary=False,
        lowercase=False,
        tokenizer=str.split,
    )
    return tfidf_vectorizer

In [8]:
tfidf_stemmed = create_tfidf_vectorizer()

df_stemmed_tfidf = pd.DataFrame(
    tfidf_stemmed.fit_transform(corpus_stemmed).toarray(),
    columns=tfidf_stemmed.get_feature_names()
)
df_stemmed_tfidf.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
tfidf_lemmatized = create_tfidf_vectorizer()

df_lemmatized_tfidf = pd.DataFrame(
    tfidf_lemmatized.fit_transform(corpus_stemmed).toarray(),
    columns=tfidf_lemmatized.get_feature_names()
)
df_lemmatized_tfidf.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Задание 3: CountVectorizer

In [10]:
labels, texts = [], []
with open('web/les2/corpus', 'r') as data:
    for i, line in enumerate(data):
        content = line.split()
        labels.append(content[0])
        texts.append(" ".join(content[1:]))

df_test = pd.DataFrame()
df_test['text'] = texts
df_test['label'] = labels
df_test.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2


In [11]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_test['text'],
    df_test['label']
)

enc = preprocessing.LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.fit_transform(y_test)

In [12]:
def grid_search_count_vect(param_grid: dict) -> dict:
    results: dict = dict()
    grid: list = list(ParameterGrid(param_grid))
    cnt: int = 0
    for params in grid:
        cnt += 1
        results[cnt] = params
        
        count_vect = CountVectorizer(
            token_pattern=r'\w{1,}',
            stop_words='english',
            tokenizer=str.split,
            **params,
        )
        count_vect.fit(X_train)

        X_train_count =  count_vect.transform(X_train)
        X_test_count =  count_vect.transform(X_test)

        classifier = linear_model.LogisticRegression()
        classifier.fit(X_train_count, y_train)

        y_pred = classifier.predict(X_test_count)

        results[cnt].update({'accuracy': accuracy_score(y_test, y_pred)})

    return results

param_grid = {
    'max_df': [0.8, 0.9, 1],
    'max_features': [500, 1000, 5000, 10000],
}
results = grid_search_count_vect(param_grid)

In [13]:
pd.DataFrame.from_dict(results, orient='index').sort_values(by='accuracy', ascending=False)

Unnamed: 0,max_df,max_features,accuracy
4,0.8,10000,0.8268
8,0.9,10000,0.8268
3,0.8,5000,0.82
7,0.9,5000,0.82
2,0.8,1000,0.8048
6,0.9,1000,0.8048
1,0.8,500,0.7856
5,0.9,500,0.7856
12,1.0,10000,0.54
11,1.0,5000,0.5272


## Задание 3: TfidfVectorizer

In [14]:
def grid_search_tfidf(param_grid: dict) -> dict:
    results: dict = dict()
    grid: list = list(ParameterGrid(param_grid))
    cnt: int = 0
    for params in grid:
        cnt += 1
        results[cnt] = params
        
        count_vect = TfidfVectorizer(
            token_pattern=r'\w{1,}',
            stop_words='english',
            tokenizer=str.split,
            **params,
        )
        count_vect.fit(X_train)

        X_train_count =  count_vect.transform(X_train)
        X_test_count =  count_vect.transform(X_test)

        classifier = linear_model.LogisticRegression()
        classifier.fit(X_train_count, y_train)

        y_pred = classifier.predict(X_test_count)

        results[cnt].update({'accuracy': accuracy_score(y_test, y_pred)})

    return results

param_grid = {
    'max_df': [0.8, 0.9, 1],
    'max_features': [500, 1000, 5000, 10000],
}
results_tfidf = grid_search_tfidf(param_grid)

In [15]:
pd.DataFrame.from_dict(results_tfidf, orient='index').sort_values(by='accuracy', ascending=False)

Unnamed: 0,max_df,max_features,accuracy
4,0.8,10000,0.8388
8,0.9,10000,0.8388
3,0.8,5000,0.8376
7,0.9,5000,0.8376
2,0.8,1000,0.8132
6,0.9,1000,0.8132
1,0.8,500,0.7868
5,0.9,500,0.7868
12,1.0,10000,0.54
11,1.0,5000,0.5276


## Вывод

TfidfVectorizer дает сопоставимую точность (даже лучше) при вдвое меньшем количестве признаков по сравнению с CountVectorizer.