In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from gensim.models import Word2Vec
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from termcolor import colored
import warnings
warnings.filterwarnings("ignore")

In [2]:
## Считывание данных
negative = pd.read_csv('processedNegative.csv')
neutral = pd.read_csv('processedNeutral.csv')
positive = pd.read_csv('processedPositive.csv')

In [3]:
## Преобразование
df_1 = pd.DataFrame({'tweets': negative.columns, 'type': 'negative'})
df_2 = pd.DataFrame({'tweets': neutral.columns, 'type': 'neutral'})
df_3 = pd.DataFrame({'tweets': positive.columns, 'type': 'positive'})

In [4]:
## Объединение фреймов
df = pd.concat([df_1, df_2, df_3]).reset_index(drop=True)

In [5]:
## Удаление дубликатов(так как мы имели дело с заголовками, то к повторяющимся добавились цифры в конце)
for i, j in enumerate(df.tweets):
    df.tweets.iloc[i] = df.tweets.iloc[i].lower()
    df.tweets.iloc[i] = re.sub('[^a-zA-Z]', ' ', df.tweets.iloc[i])
    df.tweets.iloc[i] = re.sub(r'\s+', ' ', df.tweets.iloc[i])
    df.tweets.iloc[i] = df.tweets.iloc[i].strip()

In [6]:
## Разбиваем на тренировочную и тестовую группу
X_train, X_test, y_train, y_test = train_test_split(df['tweets'], df['type'], 
                                                    test_size=0.2, random_state=21, stratify=df['type'])
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.index = np.arange(len(y_train))
y_test.index = np.arange(len(y_test))

In [7]:
## Удаляем дублирующиеся твиты
X_train.drop_duplicates(inplace=True, keep='first')
y_train = y_train[y_train.index.isin(X_train.index)]
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [8]:
## Находим индексы твитов, состоящих из 1 слова
one_word = []
for i, row in enumerate(X_train):
    if len(row.split()) <= 1:
        one_word.append(i)

In [9]:
## Удаляем твиты из 1 слова, так как они не несут информации
X_train = X_train[~X_train.index.isin(one_word)]
y_train = y_train[y_train.index.isin(X_train.index)]
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

## Data preparation

In [10]:
preparation_data = pd.DataFrame()

In [11]:
preparation_data_test = pd.DataFrame()

#### Токенизация по словам - разделение предложений на слова компоненты

In [12]:
token = []
for i in range(X_train.shape[0]):
    token.append(' '.join(nltk.word_tokenize(X_train[i])))

In [13]:
preparation_data['token'] = token

In [14]:
token = []
for i in range(X_test.shape[0]):
    token.append(' '.join(nltk.word_tokenize(X_test[i])))

In [15]:
preparation_data_test['token'] = token

#### Лемматизация - приведение слов к канонической форме
#### Стемминг - отрезание "лишнего" от корня слов

In [16]:
lm = WordNetLemmatizer()
st = PorterStemmer()
lemma = []
stemmer = []
for i in range(X_train.shape[0]):
    tweet = X_train[i].split()
    stemmer.append([])
    lemma.append([])
    for j in tweet:
        stemmer[i].append(st.stem(word=j))
        lemma[i].append(lm.lemmatize(word=j))
    stemmer[i] = ' '.join(stemmer[i])
    lemma[i] = ' '.join(lemma[i])

In [17]:
preparation_data['stemmer'] = stemmer
preparation_data['lemma'] = lemma

In [18]:
lm = WordNetLemmatizer()
st = PorterStemmer()
lemma = []
stemmer = []
for i in range(X_test.shape[0]):
    tweet = X_test[i].split()
    stemmer.append([])
    lemma.append([])
    for j in tweet:
        stemmer[i].append(st.stem(word=j))
        lemma[i].append(lm.lemmatize(word=j))
    stemmer[i] = ' '.join(stemmer[i])
    lemma[i] = ' '.join(lemma[i])

In [19]:
preparation_data_test['stemmer'] = stemmer
preparation_data_test['lemma'] = lemma

#### Исправление ошибок

In [20]:
nomiss = [0] * X_train.shape[0]
for i in range(X_train.shape[0]):
    textBlb = TextBlob(X_train[i])
    nomiss[i] = ''.join(textBlb.correct())

In [21]:
preparation_data['nomiss'] = nomiss

In [22]:
nomiss = [0] * X_test.shape[0]
for i in range(X_test.shape[0]):
    textBlb = TextBlob(X_test[i])
    nomiss[i] = ''.join(textBlb.correct())

In [23]:
preparation_data_test['nomiss'] = nomiss

#### Лемматизация и стемминг "безошибочных" датафреймов

In [24]:
lm = WordNetLemmatizer()
st = PorterStemmer()
lemma = []
stemmer = []
for i in range(X_train.shape[0]):
    tweet = preparation_data['nomiss'][i]
    stemmer.append([])
    lemma.append([])
    for j in tweet:
        stemmer[i].append(st.stem(word=j))
        lemma[i].append(lm.lemmatize(word=j))
    stemmer[i] = ''.join(stemmer[i])
    lemma[i] = ''.join(lemma[i])

In [25]:
preparation_data['miss_stemmer'] = stemmer
preparation_data['miss_lemma'] = lemma

In [26]:
lm = WordNetLemmatizer()
st = PorterStemmer()
lemma = []
stemmer = []
for i in range(X_test.shape[0]):
    tweet = preparation_data_test['nomiss'][i]
    stemmer.append([])
    lemma.append([])
    for j in tweet:
        stemmer[i].append(st.stem(word=j))
        lemma[i].append(lm.lemmatize(word=j))
    stemmer[i] = ''.join(stemmer[i])
    lemma[i] = ''.join(lemma[i])

In [27]:
preparation_data_test['miss_stemmer'] = stemmer
preparation_data_test['miss_lemma'] = lemma

## Создаём функции для работы с датафреймами

In [28]:
## блок функций, позволяющий находить 10 похожих пар с помощью k-means и косинусного расстояния
def pair_kmeans(data):
    kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
    result =  kmeans.labels_
    x = [i for i, ltr in enumerate(result) if ltr == 0]
    y = [i for i, ltr in enumerate(result) if ltr == 1]
    ln_x = len(x)
    ln_y = len(y)
    if ln_y < ln_x:
        if ln_y > 1:
            return data[data.index.isin(y)]
        else:
            return data[data.index.isin(x)]
    if ln_x <= ln_y:
        if ln_x > 1:
            return data[data.index.isin(x)]
        else:
            return data[data.index.isin(y)]

        
def drop_reset(data, names, indexes):
    names.drop(indexes, inplace = True, axis = 0)
    names.reset_index(drop=True, inplace=True)
    data.drop(indexes, inplace = True, axis = 0)
    data.reset_index(drop=True, inplace=True)


def find_ten_pairs(data, names):
    i = 0
    while i < 10:
        indexes = [x for x in data.index]
        df = pair_kmeans(data)
        while df.shape[0] > 2:
            indexes = [i for j, i in enumerate(indexes) if j in df.index]
            df.reset_index(inplace=True, drop=True)
            df = pair_kmeans(df)
        indexes = [i for j, i in enumerate(indexes) if j in df.index]
        if cosine_similarity([data.iloc[indexes[0]]], [data.iloc[indexes[1]]]) >= 0.8:
            i += 1
            print(f'{colored(i, "red", attrs=["bold"])} {names.iloc[indexes[0]]}\n  {names.iloc[indexes[1]]}', end='\n\n')
        drop_reset(data, names, indexes)

In [29]:
## Блок функций для создания фреймов с помощью word2vec
def create_dictionary(data):
    all_words = [x.split() for x in data]
    for i in range(len(all_words)):
        all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]
    model = Word2Vec(all_words, min_count=4, vector_size=100, workers=3, window=2, sg=1)
    vocabulary = model.wv
    return model, vocabulary


def data_to_vectors(data, size, model, vocabulary):
    vectors = []
    for i, row in enumerate(data):
        vector = np.zeros(size).reshape((1, size))
        for word in row.split():
            try:
                vector += vocabulary[model.wv.key_to_index[word]].reshape((1, size))
            except KeyError:
                continue
        vectors.append(vector)
    vectors = scale(np.concatenate([x for x in vectors]))
    return pd.DataFrame(vectors)


def create_vectors(data, data_test, size):
    model, vocabulary = create_dictionary(data)
    train = data_to_vectors(data, size, model, vocabulary)
    test = data_to_vectors(data_test, size, model, vocabulary)
    train.drop_duplicates(keep='first', inplace=True)
    y_y_train = y_train[y_train.index.isin(train.index)]
    x_train = X_train[X_train.index.isin(train.index)]
    train.reset_index(drop=True, inplace=True)
    y_y_train.reset_index(drop=True, inplace=True)
    x_train.reset_index(drop=True, inplace=True)
    return train, test, x_train, y_y_train

In [30]:
## Функция создания датафреймов для каждого вида обработки данных
def to_bag(train_data, test_data, vectorizer):
    train = train_data.to_list()
    test = test_data.to_list()
    bag_of_words = vectorizer.fit_transform(train)
    bag_of_words_test = vectorizer.transform(test)
    feature_names = vectorizer.get_feature_names()
    df_bag = pd.DataFrame(bag_of_words.toarray(), columns = feature_names)
    df_bag_test = pd.DataFrame(bag_of_words_test.toarray(), columns = feature_names)
    df_bag.drop_duplicates(keep='first', inplace = True)
    X_train_for_bag = X_train[X_train.index.isin(df_bag.index)]
    y_train_for_bag = y_train[y_train.index.isin(df_bag.index)]
    df_bag.reset_index(drop=True, inplace=True)
    X_train_for_bag.reset_index(drop=True, inplace=True)
    y_train_for_bag.reset_index(drop=True, inplace=True)
    return df_bag, df_bag_test, X_train_for_bag, y_train_for_bag

## Создаём мешки слов со стоп-словами

In [31]:
stop_words = set(stopwords.words("english"))
count_vectorizer = CountVectorizer(stop_words=stop_words)

In [32]:
## Датафреймы для токенов в мешках слов и нахождение 10 пар похожих твитов
df_token_bag, df_token_bag_test, X_train_for_token_bag, y_train_for_token_bag = to_bag(preparation_data['token'], 
                                                                                       preparation_data_test['token'], count_vectorizer)
find_ten_pairs(df_token_bag.copy(), X_train_for_token_bag.copy())

[1m[31m1[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m2[0m i miss him unhappy
  where s justin i miss him unhappy

[1m[31m3[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m4[0m i love them with all my hort unhappy
  love it unhappy

[1m[31m5[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m6[0m have a great thursday looking forward to reading your tweets happy want this
  have a great thursday looking forward to reading your tweets happy want this it s free

[1m[31m7[0m share the love thanks for being top

In [33]:
## Датафреймы для стеммера в мешках слов и нахождение 10 пар похожих твитов
df_stemmer_bag, df_stemmer_bag_test, X_train_for_stemmer_bag, y_train_for_stemmer_bag = to_bag(preparation_data['stemmer'], 
                                                                                               preparation_data_test['stemmer'], count_vectorizer)
find_ten_pairs(df_stemmer_bag.copy(), X_train_for_stemmer_bag.copy())

[1m[31m1[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m2[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m3[0m i miss him unhappy
  where s justin i miss him unhappy

[1m[31m4[0m thanks for being top engaged community members this week happy try this too
  thanks for being top engaged community members this week happy i sent this with

[1m[31m5[0m thanks for the recent follow happy to connect happy have a great thursday want this
  thanks for the recent follow happy to connect happy have a great thursday want this it s free

[1m[31m6[0m have a great thursday looking forward to reading your tweets happy want this
  have a great th

In [34]:
## Датафреймы для леммы в мешках слов и нахождение 10 пар похожих твитов
df_lemma_bag, df_lemma_bag_test, X_train_for_lemma_bag, y_train_for_lemma_bag = to_bag(preparation_data['lemma'], 
                                                                                       preparation_data_test['lemma'], count_vectorizer)
find_ten_pairs(df_lemma_bag.copy(), X_train_for_lemma_bag.copy())

[1m[31m1[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s magical

[1m[31m2[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi ashish we tried to call your number but got no response unhappy please share another suitable time and an alternate cont

[1m[31m3[0m i miss him unhappy
  where s justin i miss him unhappy

[1m[31m4[0m have a great thursday looking forward to reading your tweets happy want this
  have a great thursday looking forward to reading your tweets happy want this it s free

[1m[31m5[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy

[1m[31m6[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it

In [35]:
## Датафреймы для стеммера в мешках слов с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_stemmer_bag, df_miss_stemmer_bag_test, X_train_for_miss_stemmer_bag, y_train_for_miss_stemmer_bag = to_bag(preparation_data['miss_stemmer'], 
                                                                                                                   preparation_data_test['miss_stemmer'], count_vectorizer)
find_ten_pairs(df_miss_stemmer_bag.copy(), X_train_for_miss_stemmer_bag.copy())

[1m[31m1[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m2[0m i miss my boo so much unhappy
  i miss him unhappy

[1m[31m3[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m4[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s magical

[1m[31m5[0m thanks for the recent follow much appreciated happy i sent this with
  thanks for the recent follow much appreciated happy

[1m[31m6[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m

In [36]:
## Датафреймы для леммы в мешках слов с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_lemma_bag, df_miss_lemma_bag_test, X_train_for_miss_lemma_bag, y_train_for_miss_lemma_bag = to_bag(preparation_data['miss_lemma'], 
                                                                                                           preparation_data_test['miss_lemma'], count_vectorizer)
find_ten_pairs(df_miss_lemma_bag.copy(), X_train_for_miss_lemma_bag.copy())

[1m[31m1[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m2[0m i miss my boo so much unhappy
  i miss him unhappy

[1m[31m3[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m4[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s magical

[1m[31m5[0m thanks for the recent follow much appreciated happy i sent this with
  thanks for the recent follow much appreciated happy

[1m[31m6[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m

## Создаём бинарные мешки слов со стоп-словами

In [37]:
count_vectorizer_bool = CountVectorizer(binary=True, stop_words=stop_words)

In [38]:
## Датафреймы для токенов в бинарных мешках слов и нахождение 10 пар похожих твитов
df_token_bag_binary, df_token_bag_binary_test, X_train_for_token_bag_binary, y_train_for_token_bag_binary = to_bag(preparation_data['token'], 
                                                                                                                   preparation_data_test['token'], count_vectorizer_bool)
find_ten_pairs(df_token_bag_binary.copy(), X_train_for_token_bag_binary.copy())

[1m[31m1[0m i miss him unhappy
  where s justin i miss him unhappy

[1m[31m2[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m3[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m4[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy

[1m[31m5[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m6[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s 

In [39]:
## Датафреймы для стеммера в бинарных мешках слов и нахождение 10 пар похожих твитов
df_stemmer_bag_binary, df_stemmer_bag_binary_test, X_train_for_stemmer_bag_binary, y_train_for_stemmer_bag_binary = to_bag(preparation_data['stemmer'], 
                                                                                                                           preparation_data_test['stemmer'], count_vectorizer_bool)
find_ten_pairs(df_stemmer_bag_binary.copy(), X_train_for_stemmer_bag_binary.copy())

[1m[31m1[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m2[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m3[0m i miss him unhappy
  where s justin i miss him unhappy

[1m[31m4[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy

[1m[31m5[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s magical

[1m[31m6[0m thanks for the recent follow much appreciated happy i sent this with
  for the recent follow much appreciated happy want this

[1m[31m7[0m thanks f

In [40]:
## Датафреймы для леммы в бинарных мешках слов и нахождение 10 пар похожих твитов
df_lemma_bag_binary, df_lemma_bag_binary_test, X_train_for_lemma_bag_binary, y_train_for_lemma_bag_binary = to_bag(preparation_data['lemma'], 
                                                                                                                   preparation_data_test['lemma'], count_vectorizer_bool)
find_ten_pairs(df_lemma_bag_binary.copy(), X_train_for_lemma_bag_binary.copy())

[1m[31m1[0m i miss him unhappy
  where s justin i miss him unhappy

[1m[31m2[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m3[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi ashish we tried to call your number but got no response unhappy please share another suitable time and an alternate cont

[1m[31m4[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy

[1m[31m5[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m6[0m rt good evening everyone happy join our twitter party tonight official tagline tanner welcomebackph tmi
  rt almightytanner good evening everyone

In [41]:
## Датафреймы для стеммера в бинарных мешках слов с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_stemmer_bag_binary, df_miss_stemmer_bag_binary_test, X_train_for_miss_stemmer_bag_binary, y_train_for_miss_stemmer_bag_binary = to_bag(preparation_data['miss_stemmer'], 
                                                                                                                                               preparation_data_test['miss_stemmer'], count_vectorizer_bool)
find_ten_pairs(df_miss_stemmer_bag_binary.copy(), X_train_for_miss_stemmer_bag_binary.copy())

[1m[31m1[0m i miss my boo so much unhappy
  i miss him unhappy

[1m[31m2[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m3[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m4[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s magical

[1m[31m5[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy

[1m[31m6[0m thanks for the recent follow much appreciated happy i sent this with
  thanks for the recent follow much appreciated happy

[1m[31m7[0m please unhappy


In [42]:
## Датафреймы для леммы в бинарных мешках слов с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_lemma_bag_binary, df_miss_lemma_bag_binary_test, X_train_for_miss_lemma_bag_binary, y_train_for_miss_lemma_bag_binary = to_bag(preparation_data['miss_lemma'], 
                                                                                                                                       preparation_data_test['miss_lemma'], count_vectorizer_bool)
find_ten_pairs(df_miss_lemma_bag_binary.copy(), X_train_for_miss_lemma_bag_binary.copy())

[1m[31m1[0m i miss my boo so much unhappy
  i miss him unhappy

[1m[31m2[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m3[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m4[0m thanks for the recent follow much appreciated happy want this
  thanks for the recent follow much appreciated happy want this for it s magical

[1m[31m5[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy

[1m[31m6[0m thanks for the recent follow much appreciated happy i sent this with
  thanks for the recent follow much appreciated happy

[1m[31m7[0m please unhappy


## TF-IDF со стоп-словами

In [43]:
## Скоринг по TF-IDF растет пропорционально частоте появления слова в документе, но это компенсируется количеством документов, содержащих это слово.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)

In [44]:
## Датафреймы для токенов TF-IDF и нахождение 10 пар похожих твитов
df_token_tfidf, df_token_tfidf_test, X_train_for_token_tfidf, y_train_for_token_tfidf = to_bag(preparation_data['token'], 
                                                                                               preparation_data_test['token'], tfidf_vectorizer)
find_ten_pairs(df_token_tfidf.copy(), X_train_for_token_tfidf.copy())

[1m[31m1[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m2[0m share the love thanks for being top new followers this week happy
  share the love thanks for being top new followers this week happy want it

[1m[31m3[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m4[0m thanks for the recent follow much appreciated happy want this
  for the recent follow much appreciated happy want this

[1m[31m5[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m6[0m thanks for the recent follow much appreciated happy get it
  thanks for the recent follow much appreciated happy

[1m[31m7[0m thanks for being top engaged community 

In [45]:
## Датафреймы для стеммера TF-IDF и нахождение 10 пар похожих твитов
df_stemmer_tfidf, df_stemmer_tfidf_test, X_train_for_stemmer_tfidf, y_train_for_stemmer_tfidf = to_bag(preparation_data['stemmer'], 
                                                                                                       preparation_data_test['stemmer'], tfidf_vectorizer)
find_ten_pairs(df_stemmer_tfidf.copy(), X_train_for_stemmer_tfidf.copy())

[1m[31m1[0m for being top high value members this week happy
  thanks for being high value member this week

[1m[31m2[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m3[0m thanks for being top engaged community members this week happy try this too
  thanks for being top engaged community members this week happy i sent this with

[1m[31m4[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m5[0m thanks for the recent follow much appreciated happy want this
  for the recent follow much appreciated happy want this

[1m[31m6[0m much appreciated happy want it
  much appreciated happy want this

[1m[31m7[0m thanks for the recent follow much appreciated happy get it
  thanks for the recent follow much appreciated happy

[1m[31m8[0m t

In [46]:
## Датафреймы для леммы TF-IDF и нахождение 10 пар похожих твитов
df_lemma_tfidf, df_lemma_tfidf_test, X_train_for_lemma_tfidf, y_train_for_lemma_tfidf = to_bag(preparation_data['lemma'], 
                                                                                               preparation_data_test['lemma'], tfidf_vectorizer)
find_ten_pairs(df_lemma_tfidf.copy(), X_train_for_lemma_tfidf.copy())

[1m[31m1[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m2[0m thanks for the follow happy
  thanks for the follow

[1m[31m3[0m share the love thanks for being top new followers this week happy
  share the love thanks for being top new followers this week happy want it

[1m[31m4[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m5[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m6[0m thanks for the recent follow happy to connect happy have a great thursday get free
  thanks for the recent follow happy to connect happy have a great thursday want this it s free

[1m[31m7[0m thanks for the recent follow happy to co

In [47]:
## Датафреймы для стеммера TF-IDF с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_stemmer_tfidf, df_miss_stemmer_tfidf_test, X_train_for_miss_stemmer_tfidf, y_train_for_miss_stemmer_tfidf = to_bag(preparation_data['miss_stemmer'], 
                                                                                                                           preparation_data_test['miss_stemmer'], tfidf_vectorizer)
find_ten_pairs(df_miss_stemmer_tfidf.copy(), X_train_for_miss_stemmer_tfidf.copy())

[1m[31m1[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m2[0m share the love thanks for being top new followers this week happy
  share the love thanks for being top new followers this week happy want it

[1m[31m3[0m thanks for the recent follow much appreciated happy get it
  thanks for the recent follow much appreciated happy

[1m[31m4[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m5[0m thanks for the recent follow much appreciated happy want this
  for the recent follow much appreciated happy want this

[1m[31m6[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m7[0m thanks for the recent follow happy to c

In [48]:
## Датафреймы для леммы TF-IDF слов с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_lemma_tfidf, df_miss_lemma_tfidf_test, X_train_for_miss_lemma_tfidf, y_train_for_miss_lemma_tfidf = to_bag(preparation_data['miss_lemma'], 
                                                                                                                   preparation_data_test['miss_lemma'], tfidf_vectorizer)
find_ten_pairs(df_miss_lemma_tfidf.copy(), X_train_for_miss_lemma_tfidf.copy())

[1m[31m1[0m hey thanks for being top new followers this week much appreciated happy want this
  hey thanks for being top new followers this week much appreciated happy

[1m[31m2[0m share the love thanks for being top new followers this week happy
  share the love thanks for being top new followers this week happy want it

[1m[31m3[0m thanks for the recent follow much appreciated happy get it
  thanks for the recent follow much appreciated happy

[1m[31m4[0m thanks for being top engaged community members this week happy want this
  thanks for being top engaged community members this week happy want this it s free

[1m[31m5[0m thanks for the recent follow much appreciated happy want this
  for the recent follow much appreciated happy want this

[1m[31m6[0m thanks for the recent follow happy to connect happy have a great wednesday
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m7[0m thanks for the recent follow happy to c

## Word2Vec

In [49]:
## Датафреймы для токенов word2vec и нахождение 10 пар похожих твитов
df_token_word2vec, df_token_word2vec_test, X_train_for_token_word2vec, y_train_for_token_word2vec = create_vectors(preparation_data['token'], 
                                                                                                                   preparation_data_test['token'], 100)
find_ten_pairs(df_token_word2vec.copy(), X_train_for_token_word2vec.copy())

[1m[31m1[0m feel like i shoyould be telling you to get the fuck out social media byout also feel really mean because unhappy silence love yoyou hope yoyoure okay
  happy birthday sweet sweet girl i hope you have the best day ever love and miss you so much unhappy

[1m[31m2[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi ashish we tried to call your number but got no response unhappy please share another suitable time and an alternate cont

[1m[31m3[0m i need a car unhappy but no car unhappy poor me unhappy
  who would you call when you are roused in midnightreminds me of the time s they were roommates unhappy unhappy unhappy you suck

[1m[31m4[0m thanks so much for following us we d love to know what you think of our flooring range happy
  thank you so much sir happy please watch the film and call me happy

[1m[31m5[0m hey thanks for being top new followers this week much appreciated happ

In [50]:
## Датафреймы для стеммера word2vec и нахождение 10 пар похожих твитов
df_stemmer_word2vec, df_stemmer_word2vec_test, X_train_for_stemmer_word2vec, y_train_for_stemmer_word2vec = create_vectors(preparation_data['stemmer'], 
                                                                                                                           preparation_data_test['stemmer'], 100)
find_ten_pairs(df_stemmer_word2vec.copy(), X_train_for_stemmer_word2vec.copy())

[1m[31m1[0m hey thanks for being top new followers this week much appreciated happy want this
  share the love thanks for being top new followers this week happy want this

[1m[31m2[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont

[1m[31m3[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy want it

[1m[31m4[0m thanks for the recent follow happy to connect happy have a great thursday get free
  thanks for the recent follow happy to connect happy want this it s free

[1m[31m5[0m thanks for the recent follow happy to connect happy have a great thursday want this
  thanks for the recent follow happy to connect happy have a great wednesday want this

[1m[31m6[0m thanks f

In [51]:
## Датафреймы для леммы word2vec и нахождение 10 пар похожих твитов
df_lemma_word2vec, df_lemma_word2vec_test, X_train_for_lemma_word2vec, y_train_for_lemma_word2vec = create_vectors(preparation_data['lemma'], 
                                                                                                                   preparation_data_test['lemma'], 100)
find_ten_pairs(df_lemma_word2vec.copy(), X_train_for_lemma_word2vec.copy())

[1m[31m1[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi ashish we tried to call your number but got no response unhappy please share another suitable time and an alternate cont

[1m[31m2[0m i love when people or message me w long messages about how beautiful they find joon like unhappy yes i agree keep going
  happy birthday sweet sweet girl i hope you have the best day ever love and miss you so much unhappy

[1m[31m3[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number for us to cont
  i hope louis gets all the off time that he wants and comes back fresh and happy happy

[1m[31m4[0m share the love thanks for being top new followers this week happy get it
  share the love thanks for being top new followers this week happy want it

[1m[31m5[0m thanks for the recent follow happy to connect happy have a great thursday want

In [52]:
## Датафреймы для стеммера word2vec с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_stemmer_word2vec, df_miss_stemmer_word2vec_test, X_train_for_miss_stemmer_word2vec, y_train_for_miss_stemmer_word2vec = create_vectors(preparation_data['miss_stemmer'], 
                                                                                                                                               preparation_data_test['miss_stemmer'], 100)
find_ten_pairs(df_miss_stemmer_word2vec.copy(), X_train_for_miss_stemmer_word2vec.copy())

[1m[31m1[0m feel like i shoyould be telling you to get the fuck out social media byout also feel really mean because unhappy silence love yoyou hope yoyoure okay
  happy birthday sweet sweet girl i hope you have the best day ever love and miss you so much unhappy

[1m[31m2[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi ashish we tried to call your number but got no response unhappy please share another suitable time and an alternate cont

[1m[31m3[0m thanks for the recent follow happy to connect happy have a great thursday
  thanks for the recent follow happy to connect happy have a great wednesday

[1m[31m4[0m thanks for the recent follow happy to connect happy have a great thursday get free
  thanks for the recent follow happy to connect happy have a great thursday want this it s free

[1m[31m5[0m thanks for the recent follow happy to connect happy have a great thursday want this
  tha

In [53]:
## Датафреймы для леммы word2vec слов с обработкой ошибок и нахождение 10 пар похожих твитов
df_miss_lemma_word2vec, df_miss_lemma_word2vec_test, X_train_for_miss_lemma_word2vec, y_train_for_miss_lemma_word2vec = create_vectors(preparation_data['miss_lemma'], 
                                                                                                                                       preparation_data_test['miss_lemma'], 100)
find_ten_pairs(df_miss_lemma_word2vec.copy(), X_train_for_miss_lemma_word2vec.copy())

[1m[31m1[0m feel like i shoyould be telling you to get the fuck out social media byout also feel really mean because unhappy silence love yoyou hope yoyoure okay
  happy birthday sweet sweet girl i hope you have the best day ever love and miss you so much unhappy

[1m[31m2[0m hi we tried to call your number but got no response unhappy please share another suitable time and an alternate number cont
  hi ashish we tried to call your number but got no response unhappy please share another suitable time and an alternate cont

[1m[31m3[0m thanks for the recent follow happy to connect happy have a great thursday
  thanks for the recent follow happy to connect happy have a great wednesday

[1m[31m4[0m thanks for the recent follow happy to connect happy have a great thursday get free
  thanks for the recent follow happy to connect happy have a great thursday want this it s free

[1m[31m5[0m thanks for the recent follow happy to connect happy have a great thursday want this
  tha

## Предсказательные модели

In [54]:
## Создаём списки из полученных ранее датафреймов. Первые для мешков + TF-IDF, вторые для word2vec
trains = [df_token_bag, df_stemmer_bag, df_lemma_bag, df_miss_stemmer_bag, df_miss_lemma_bag, df_token_bag_binary, df_stemmer_bag_binary,  df_lemma_bag_binary, 
          df_miss_stemmer_bag_binary, df_miss_lemma_bag_binary, df_token_tfidf, df_stemmer_tfidf, df_lemma_tfidf, df_miss_stemmer_tfidf, df_miss_lemma_tfidf]
tests = [df_token_bag_test, df_stemmer_bag_test, df_lemma_bag_test, df_miss_stemmer_bag_test, df_miss_lemma_bag_test, df_token_bag_binary_test, 
         df_stemmer_bag_binary_test, df_lemma_bag_binary_test, df_miss_stemmer_bag_binary_test, df_miss_lemma_bag_binary_test, df_token_tfidf_test, 
         df_stemmer_tfidf_test, df_lemma_tfidf_test, df_miss_stemmer_tfidf_test, df_miss_lemma_tfidf_test]
y_y = [y_train_for_token_bag, y_train_for_stemmer_bag, y_train_for_lemma_bag, y_train_for_miss_stemmer_bag, y_train_for_miss_lemma_bag, 
       y_train_for_token_bag_binary, y_train_for_stemmer_bag_binary, y_train_for_lemma_bag_binary, y_train_for_miss_stemmer_bag_binary, 
       y_train_for_miss_lemma_bag_binary, y_train_for_token_tfidf, y_train_for_stemmer_tfidf, y_train_for_lemma_tfidf, y_train_for_miss_stemmer_tfidf, 
       y_train_for_miss_lemma_tfidf]
names = ['токены, мешки слов со стоп-словами', 'стеммер, мешки слов со стоп-словами', 'лемма, мешки слов со стоп-словами', 
         'стеммер без ошибок, мешки слов со стоп-словами', 'лемма без ошибок, мешки слов со стоп-словами', 'токены, бинарные мешки слов со стоп-словами', 
         'стеммер, бинарные мешки слов со стоп-словами', 'лемма, бинарные мешки слов со стоп-словами', 'стеммер без ошибок, бинарные мешки слов со стоп-словами', 
         'лемма без ошибок, бинарные мешки слов со стоп-словами', 'токены, tf-idf со стоп-словами', 'стеммер, tf-idf со стоп-словами', 
         'лемма, tf-idf со стоп-словами', 'стеммер без ошибок, tf-idf со стоп-словами', 'лемма без ошибок, tf-idf со стоп-словами']
trains_word2vec = [df_token_word2vec, df_stemmer_word2vec, df_lemma_word2vec, df_miss_stemmer_word2vec, df_miss_lemma_word2vec]
tests_word2vec = [df_token_word2vec_test, df_stemmer_word2vec_test, df_lemma_word2vec_test, df_miss_stemmer_word2vec_test, df_miss_lemma_word2vec_test]
y_y_word2vec = [y_train_for_token_word2vec, y_train_for_stemmer_word2vec, y_train_for_lemma_word2vec, y_train_for_miss_stemmer_word2vec, 
                y_train_for_miss_lemma_word2vec]
names_word2vec = ['токены, word2vec со стоп-словами', 'стеммер, word2vec со стоп-словами', 'лемма, word2vec со стоп-словами', 
                  'стеммер без ошибок, word2vec со стоп-словами', 'лемма без ошибок, word2vec со стоп-словами']


In [55]:
## Функция обучения модели предсказания и подсчёта его точности
def go_model(model_object, trains, tests, y_y, names):
    for i in range(len(trains)):
        model = model_object
        model.fit(trains[i], y_y[i])
        predict = model.predict(tests[i])
        print(f'{colored(round(accuracy_score(y_test, predict), 3), "red", attrs=["bold"])} \t {names[i]}')

#### Обучение мешков слов и TF-IDF

In [56]:
go_model(LogisticRegression(random_state = 21, fit_intercept= True, C=1.25), trains, tests, y_y, names)

[1m[31m0.885[0m 	 токены, мешки слов со стоп-словами
[1m[31m0.883[0m 	 стеммер, мешки слов со стоп-словами
[1m[31m0.874[0m 	 лемма, мешки слов со стоп-словами
[1m[31m0.883[0m 	 стеммер без ошибок, мешки слов со стоп-словами
[1m[31m0.883[0m 	 лемма без ошибок, мешки слов со стоп-словами
[1m[31m0.884[0m 	 токены, бинарные мешки слов со стоп-словами
[1m[31m0.884[0m 	 стеммер, бинарные мешки слов со стоп-словами
[1m[31m0.874[0m 	 лемма, бинарные мешки слов со стоп-словами
[1m[31m0.885[0m 	 стеммер без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.885[0m 	 лемма без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.877[0m 	 токены, tf-idf со стоп-словами
[1m[31m0.884[0m 	 стеммер, tf-idf со стоп-словами
[1m[31m0.877[0m 	 лемма, tf-idf со стоп-словами
[1m[31m0.881[0m 	 стеммер без ошибок, tf-idf со стоп-словами
[1m[31m0.881[0m 	 лемма без ошибок, tf-idf со стоп-словами


In [57]:
go_model(DecisionTreeClassifier(max_depth=35, random_state=42), trains, tests, y_y, names)

[1m[31m0.865[0m 	 токены, мешки слов со стоп-словами
[1m[31m0.866[0m 	 стеммер, мешки слов со стоп-словами
[1m[31m0.863[0m 	 лемма, мешки слов со стоп-словами
[1m[31m0.871[0m 	 стеммер без ошибок, мешки слов со стоп-словами
[1m[31m0.871[0m 	 лемма без ошибок, мешки слов со стоп-словами
[1m[31m0.866[0m 	 токены, бинарные мешки слов со стоп-словами
[1m[31m0.866[0m 	 стеммер, бинарные мешки слов со стоп-словами
[1m[31m0.863[0m 	 лемма, бинарные мешки слов со стоп-словами
[1m[31m0.87[0m 	 стеммер без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.87[0m 	 лемма без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.861[0m 	 токены, tf-idf со стоп-словами
[1m[31m0.863[0m 	 стеммер, tf-idf со стоп-словами
[1m[31m0.866[0m 	 лемма, tf-idf со стоп-словами
[1m[31m0.862[0m 	 стеммер без ошибок, tf-idf со стоп-словами
[1m[31m0.862[0m 	 лемма без ошибок, tf-idf со стоп-словами


In [58]:
go_model(RandomForestClassifier(max_depth=40, random_state=42, n_estimators=25, min_samples_split=5), trains, tests, y_y, names)

[1m[31m0.868[0m 	 токены, мешки слов со стоп-словами
[1m[31m0.871[0m 	 стеммер, мешки слов со стоп-словами
[1m[31m0.87[0m 	 лемма, мешки слов со стоп-словами
[1m[31m0.865[0m 	 стеммер без ошибок, мешки слов со стоп-словами
[1m[31m0.865[0m 	 лемма без ошибок, мешки слов со стоп-словами
[1m[31m0.859[0m 	 токены, бинарные мешки слов со стоп-словами
[1m[31m0.868[0m 	 стеммер, бинарные мешки слов со стоп-словами
[1m[31m0.865[0m 	 лемма, бинарные мешки слов со стоп-словами
[1m[31m0.865[0m 	 стеммер без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.865[0m 	 лемма без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.857[0m 	 токены, tf-idf со стоп-словами
[1m[31m0.867[0m 	 стеммер, tf-idf со стоп-словами
[1m[31m0.858[0m 	 лемма, tf-idf со стоп-словами
[1m[31m0.858[0m 	 стеммер без ошибок, tf-idf со стоп-словами
[1m[31m0.858[0m 	 лемма без ошибок, tf-idf со стоп-словами


In [59]:
go_model(knn(n_neighbors=13), trains, tests, y_y, names)

[1m[31m0.834[0m 	 токены, мешки слов со стоп-словами
[1m[31m0.846[0m 	 стеммер, мешки слов со стоп-словами
[1m[31m0.787[0m 	 лемма, мешки слов со стоп-словами
[1m[31m0.805[0m 	 стеммер без ошибок, мешки слов со стоп-словами
[1m[31m0.805[0m 	 лемма без ошибок, мешки слов со стоп-словами
[1m[31m0.815[0m 	 токены, бинарные мешки слов со стоп-словами
[1m[31m0.814[0m 	 стеммер, бинарные мешки слов со стоп-словами
[1m[31m0.788[0m 	 лемма, бинарные мешки слов со стоп-словами
[1m[31m0.822[0m 	 стеммер без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.822[0m 	 лемма без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.761[0m 	 токены, tf-idf со стоп-словами
[1m[31m0.805[0m 	 стеммер, tf-idf со стоп-словами
[1m[31m0.783[0m 	 лемма, tf-idf со стоп-словами
[1m[31m0.754[0m 	 стеммер без ошибок, tf-idf со стоп-словами
[1m[31m0.754[0m 	 лемма без ошибок, tf-idf со стоп-словами


In [60]:
go_model(SVC(kernel='sigmoid', C=1.8, max_iter=500, random_state=0), trains, tests, y_y, names)

[1m[31m0.872[0m 	 токены, мешки слов со стоп-словами
[1m[31m0.871[0m 	 стеммер, мешки слов со стоп-словами
[1m[31m0.865[0m 	 лемма, мешки слов со стоп-словами
[1m[31m0.874[0m 	 стеммер без ошибок, мешки слов со стоп-словами
[1m[31m0.874[0m 	 лемма без ошибок, мешки слов со стоп-словами
[1m[31m0.871[0m 	 токены, бинарные мешки слов со стоп-словами
[1m[31m0.871[0m 	 стеммер, бинарные мешки слов со стоп-словами
[1m[31m0.87[0m 	 лемма, бинарные мешки слов со стоп-словами
[1m[31m0.871[0m 	 стеммер без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.871[0m 	 лемма без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.883[0m 	 токены, tf-idf со стоп-словами
[1m[31m0.876[0m 	 стеммер, tf-idf со стоп-словами
[1m[31m0.875[0m 	 лемма, tf-idf со стоп-словами
[1m[31m0.881[0m 	 стеммер без ошибок, tf-idf со стоп-словами
[1m[31m0.881[0m 	 лемма без ошибок, tf-idf со стоп-словами


In [61]:
go_model(GradientBoostingClassifier(learning_rate=0.5, max_depth=2, random_state=0, tol=0.1), trains, tests, y_y, names)

[1m[31m0.866[0m 	 токены, мешки слов со стоп-словами
[1m[31m0.859[0m 	 стеммер, мешки слов со стоп-словами
[1m[31m0.871[0m 	 лемма, мешки слов со стоп-словами
[1m[31m0.861[0m 	 стеммер без ошибок, мешки слов со стоп-словами
[1m[31m0.861[0m 	 лемма без ошибок, мешки слов со стоп-словами
[1m[31m0.865[0m 	 токены, бинарные мешки слов со стоп-словами
[1m[31m0.862[0m 	 стеммер, бинарные мешки слов со стоп-словами
[1m[31m0.868[0m 	 лемма, бинарные мешки слов со стоп-словами
[1m[31m0.859[0m 	 стеммер без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.859[0m 	 лемма без ошибок, бинарные мешки слов со стоп-словами
[1m[31m0.857[0m 	 токены, tf-idf со стоп-словами
[1m[31m0.861[0m 	 стеммер, tf-idf со стоп-словами
[1m[31m0.859[0m 	 лемма, tf-idf со стоп-словами
[1m[31m0.859[0m 	 стеммер без ошибок, tf-idf со стоп-словами
[1m[31m0.859[0m 	 лемма без ошибок, tf-idf со стоп-словами


#### Обучение word2vec

In [62]:
go_model(LogisticRegression(random_state = 21, C=5, solver='newton-cg'), trains_word2vec, tests_word2vec, y_y_word2vec, names_word2vec)

[1m[31m0.839[0m 	 токены, word2vec со стоп-словами
[1m[31m0.827[0m 	 стеммер, word2vec со стоп-словами
[1m[31m0.825[0m 	 лемма, word2vec со стоп-словами
[1m[31m0.852[0m 	 стеммер без ошибок, word2vec со стоп-словами
[1m[31m0.852[0m 	 лемма без ошибок, word2vec со стоп-словами


In [63]:
go_model(DecisionTreeClassifier(max_depth=7, random_state=42, min_samples_split=3), trains_word2vec, tests_word2vec, y_y_word2vec, names_word2vec)

[1m[31m0.756[0m 	 токены, word2vec со стоп-словами
[1m[31m0.729[0m 	 стеммер, word2vec со стоп-словами
[1m[31m0.707[0m 	 лемма, word2vec со стоп-словами
[1m[31m0.75[0m 	 стеммер без ошибок, word2vec со стоп-словами
[1m[31m0.746[0m 	 лемма без ошибок, word2vec со стоп-словами


In [64]:
go_model(RandomForestClassifier(max_depth=18, random_state=42, n_estimators=24, min_samples_split=5), trains_word2vec, tests_word2vec, y_y_word2vec, names_word2vec)

[1m[31m0.815[0m 	 токены, word2vec со стоп-словами
[1m[31m0.809[0m 	 стеммер, word2vec со стоп-словами
[1m[31m0.805[0m 	 лемма, word2vec со стоп-словами
[1m[31m0.815[0m 	 стеммер без ошибок, word2vec со стоп-словами
[1m[31m0.822[0m 	 лемма без ошибок, word2vec со стоп-словами


In [65]:
go_model(knn(n_neighbors=10, weights='distance', p=5), trains_word2vec, tests_word2vec, y_y_word2vec, names_word2vec)

[1m[31m0.825[0m 	 токены, word2vec со стоп-словами
[1m[31m0.778[0m 	 стеммер, word2vec со стоп-словами
[1m[31m0.805[0m 	 лемма, word2vec со стоп-словами
[1m[31m0.835[0m 	 стеммер без ошибок, word2vec со стоп-словами
[1m[31m0.839[0m 	 лемма без ошибок, word2vec со стоп-словами


In [66]:
go_model(SVC(kernel='rbf', C=4.1, max_iter=510, random_state=0), trains_word2vec, tests_word2vec, y_y_word2vec, names_word2vec)

[1m[31m0.855[0m 	 токены, word2vec со стоп-словами
[1m[31m0.837[0m 	 стеммер, word2vec со стоп-словами
[1m[31m0.83[0m 	 лемма, word2vec со стоп-словами
[1m[31m0.861[0m 	 стеммер без ошибок, word2vec со стоп-словами
[1m[31m0.863[0m 	 лемма без ошибок, word2vec со стоп-словами


In [67]:
go_model(GradientBoostingClassifier(learning_rate=0.5, max_depth=3, random_state=0, tol=0.1), trains_word2vec, tests_word2vec, y_y_word2vec, names_word2vec)

[1m[31m0.837[0m 	 токены, word2vec со стоп-словами
[1m[31m0.835[0m 	 стеммер, word2vec со стоп-словами
[1m[31m0.815[0m 	 лемма, word2vec со стоп-словами
[1m[31m0.837[0m 	 стеммер без ошибок, word2vec со стоп-словами
[1m[31m0.828[0m 	 лемма без ошибок, word2vec со стоп-словами
