In [1]:
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD

import pandas as pd

In [2]:
with open('prep_tweets_hw1.pkl', "rb+") as f:
    data = pickle.load(f)

In [3]:
def to_pd(bow, model):
    return pd.DataFrame(bow.toarray(), columns = model.get_feature_names())

## Задание 1

* Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). 
* Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
* Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
* Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
* Исключим стоп-слова с помощью stop_words='english'.
* Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью CountVectorizer.get_feature_names().

In [4]:
count_vectorizer = CountVectorizer(ngram_range=(1, 1), 
                                   max_df=0.9,
                                   preprocessor=' '.join,
                                   max_features=1000,
                                   stop_words='english',
                                   analyzer='word', 
                                   binary=False,)

tweet_stemmed_cv = count_vectorizer.fit_transform(data['tweet_stemmed'])
tweet_stemmed_cv_pd = to_pd(tweet_stemmed_cv, count_vectorizer)

tweet_lemmatized_cv = count_vectorizer.fit_transform(data['tweet_lemmatized'])
tweet_lemmatized_cv_pd = to_pd(tweet_lemmatized_cv, count_vectorizer)

In [5]:
tweet_stemmed_cv_pd.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
tweet_lemmatized_cv_pd.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adult,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Задание 2

* Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). 
* Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
* Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
* Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
* Исключим стоп-слова с помощью stop_words='english'.
* Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().

In [7]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), 
                                   max_df=0.9,
                                   preprocessor=' '.join,
                                   max_features=1000,
                                   stop_words='english',
                                   analyzer='word', 
                                   binary=False,)

tweet_stemmed_tfidf = tfidf_vectorizer.fit_transform(data['tweet_stemmed'])
tweet_stemmed_tfidf_pd = to_pd(tweet_stemmed_tfidf, tfidf_vectorizer)

tweet_lemmatized_tfidf = tfidf_vectorizer.fit_transform(data['tweet_lemmatized'])
tweet_lemmatized_tfidf_pd = to_pd(tweet_lemmatized_tfidf, tfidf_vectorizer)

In [8]:
tweet_stemmed_tfidf_pd.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yo,yoga,york,young,youth,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
tweet_lemmatized_tfidf_pd.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adult,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Задание 3

* Создайте мешок слов с помощью sklearn.feature_extraction.text.HashingVectorizer.fit_transform(). 
* Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.

In [10]:
hash_vectorizer = HashingVectorizer(ngram_range=(1, 1), 
                                   preprocessor=' '.join,
                                   n_features=1000,
                                   stop_words='english',
                                   analyzer='word', 
                                   binary=False,)

tweet_stemmed_hv = tfidf_vectorizer.fit_transform(data['tweet_stemmed'])

tweet_lemmatized_hv = tfidf_vectorizer.fit_transform(data['tweet_lemmatized'])

## Задание 4

Проверьте ваши векторайзеры на корпусе который использовали на вебинаре, составьте таблицу метод векторизации и скор который вы получили (в методах векторизации по изменяйте параметры что бы добиться лучшего скора) обратите внимание как падает/растёт скор при уменьшении количества фичей, и изменении параметров, так же попробуйте применить к векторайзерам PCA для сокращения размерности посмотрите на качество сделайте выводы

In [11]:
# Загружаем данные
with open('corpus.txt', 'r') as f:
    corpus = f.read()
    
labels, texts = [], []
for i, line in enumerate(corpus.split('\n')):
    content = line.split()
    labels.append(content[0])
    texts.append(' '.join(content[1:]))

# создаем df
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels
trainDF.head(2)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2


### Webinar

In [12]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# labelEncode целевую переменную
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

accuracy = {'webinar': accuracy_score(valid_y, predictions)}
accuracy['webinar']

0.8596

### My count vectorizer

In [13]:
count_vectorizer = CountVectorizer(ngram_range=(1, 1), 
                                   max_df=0.9,
                                   max_features=1000,
                                   stop_words='english',
                                   analyzer='word', 
                                   binary=False,
                                   token_pattern=r'\w{1,}')

count_vectorizer.fit(trainDF['text'])

xtrain_count =  count_vectorizer.transform(train_x)
xvalid_count =  count_vectorizer.transform(valid_x)

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

accuracy['count_vectorizer'] = accuracy_score(valid_y, predictions)
accuracy['count_vectorizer']

0.8248

### My tfidf vectorizer

In [14]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), 
                                   max_df=0.9,
                                   max_features=1000,
                                   stop_words='english',
                                   analyzer='word', 
                                   binary=False,
                                   token_pattern=r'\w{1,}')

tfidf_vectorizer.fit(trainDF['text'])

xtrain_count =  tfidf_vectorizer.transform(train_x)
xvalid_count =  tfidf_vectorizer.transform(valid_x)

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

accuracy['tfidf_vectorizer'] = accuracy_score(valid_y, predictions)
accuracy['tfidf_vectorizer']

0.8308

### My hash vectorizer

In [15]:
hash_vectorizer = HashingVectorizer(ngram_range=(1, 1), 
                                   n_features=1000,
                                   stop_words='english',
                                   analyzer='word', 
                                   binary=False,
                                   token_pattern=r'\w{1,}')

hash_vectorizer.fit(trainDF['text'])

xtrain_count =  hash_vectorizer.transform(train_x)
xvalid_count =  hash_vectorizer.transform(valid_x)

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

accuracy['hash_vectorizer'] = accuracy_score(valid_y, predictions)
accuracy['hash_vectorizer']

0.7772

### Default tfidf

In [16]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', 
                                   token_pattern=r'\w{1,}')

tfidf_vectorizer.fit(trainDF['text'])

xtrain_count =  tfidf_vectorizer.transform(train_x)
xvalid_count =  tfidf_vectorizer.transform(valid_x)

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

accuracy['default_tfidf'] = accuracy_score(valid_y, predictions)
accuracy['default_tfidf']

0.8644

### Default tfidf + понижение размерности

In [17]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', 
                                   token_pattern=r'\w{1,}')

tfidf_vectorizer.fit(trainDF['text'])

xtrain_count =  tfidf_vectorizer.transform(train_x)
xvalid_count =  tfidf_vectorizer.transform(valid_x)

tsvd = TruncatedSVD(n_components=1000, random_state=1)
xtrain_count = tsvd.fit_transform(xtrain_count)
xvalid_count = tsvd.transform(xvalid_count)

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

accuracy['default_tfidf_tsvd'] = accuracy_score(valid_y, predictions)
accuracy['default_tfidf_tsvd']

0.8592

### Результаты

In [18]:
accuracy

{'webinar': 0.8596,
 'count_vectorizer': 0.8248,
 'tfidf_vectorizer': 0.8308,
 'hash_vectorizer': 0.7772,
 'default_tfidf': 0.8644,
 'default_tfidf_tsvd': 0.8592}

Самый высокий скор у tfidf на базовых настройках, понижение размерности до 1000 компонент ухудшило скор.