### Импорт необходимых библиотек

In [1]:
import pandas as pd # для чтения датасета
import re # для обработки текста
from sklearn.utils import resample # Для апсемплинга
from sklearn.model_selection import train_test_split # для разделения выборок на трейн и тест
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf
from sklearn.preprocessing import PolynomialFeatures # для полиномиальных признаков
from sklearn.decomposition import TruncatedSVD # SVD
from sklearn.metrics import classification_report # Для просмотра результата работы модели
from sklearn.linear_model import LogisticRegression # альтмодель
from sklearn.naive_bayes import GaussianNB # наивный байес

#### Чтение данных

In [3]:
# Load the training data, reading only the tweet text and label columns.
data = pd.read_csv(
    'train.csv',
    usecols=['tweet', 'label'],
)

#### Предобработка данных

In [1]:
# Placeholder words substituted for punctuation that may carry sentiment.
# The surrounding spaces keep each placeholder a separate token.
_PLACEHOLDER_BY_SYMBOL = {
    "!": " VOSKZNAK ",
    ".": " POINT ",
    "?": " QUESTION ",
    "\n": " ",
    ")": " HAPPYSMILE ",
    "(": " DADSMILE ",
}
# One pass over the text: a run of digits or any single mapped symbol.
_SYMBOL_OR_NUMBER_RE = re.compile(r"\d+|[!.?\n()]")
# Everything except ASCII letters and spaces is dropped after substitution.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z ]")
_SPACE_RUN_RE = re.compile(r" +")


def _placeholder_for(match):
    """Return the placeholder for a matched symbol, or NUMBER for a digit run."""
    return _PLACEHOLDER_BY_SYMBOL.get(match.group(), " NUMBER ")


def clean_text(text):
    """Normalise a tweet into lowercase words plus uppercase placeholder tokens.

    Steps:
      1. lowercase the text;
      2. replace '!', '.', '?', '(' and ')' with placeholder words, each run
         of digits with NUMBER, and each newline with a space;
      3. delete every remaining character that is not an ASCII letter or a
         space (other punctuation, tabs, non-ASCII letters, ...);
      4. collapse runs of spaces and trim spaces from both ends.

    Args:
        text: the raw tweet. Non-string values (for example the NaN that
            pandas produces for an empty cell) are treated as empty text.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    if not isinstance(text, str):
        return ""
    # The placeholders contain only letters and spaces, so a single pass gives
    # the same result as substituting one symbol after another.
    marked = _SYMBOL_OR_NUMBER_RE.sub(_placeholder_for, text.lower())
    letters_only = _NON_LETTER_RE.sub("", marked)
    # Only letters and spaces remain here, so trimming spaces is sufficient.
    return _SPACE_RUN_RE.sub(" ", letters_only).strip(" ")

In [176]:
# Replace each raw tweet with its cleaned form produced by clean_text.
data['tweet'] = data['tweet'].map(clean_text)

#### Деление на трейн и тест и апсемплинг

In [177]:
train, test = train_test_split(data, test_size=0.25)

In [178]:
train_zeros = train[train.label==0]
train_ones = train[train.label==1]

In [179]:
train_ones = resample(train_ones, n_samples=len(train_zeros))

In [180]:
train = pd.concat((train_ones, train_zeros)).sample(frac=1)

In [181]:
X_train, X_test, y_train, y_test = train.tweet, test.tweet, train.label, test.label

#### Обучение модели на отлично

In [182]:
# TF-IDF over 1- to 3-grams with English stop words removed, limited to 800 features.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=800)
# Degree-2 polynomial expansion, applied to the TF-IDF matrix further down.
poly = PolynomialFeatures(2)
# Reduction to 100 components. The seed is fixed so that the projection, and
# the metrics of the model trained on it, are the same on every run.
svd = TruncatedSVD(n_components=100, n_iter=10, random_state=42)

In [183]:
# Apply TF-IDF: fit the vectorizer on the training tweets only, then encode
# both the training and the test tweets with it.
vectorizer.fit(X_train)
X_train, X_test = (vectorizer.transform(texts) for texts in (X_train, X_test))

In [185]:
# Baseline check: Gaussian naive Bayes on the plain TF-IDF features, without
# the polynomial features and SVD. The sparse matrices are converted to dense
# arrays with toarray() before being passed to the model.
model = GaussianNB()
model.fit(X_train.toarray(), y_train)
print(
    classification_report(
        y_test,
        model.predict(X_test.toarray()),
    )
)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      7431
           1       0.64      0.28      0.39       560

    accuracy                           0.94      7991
   macro avg       0.79      0.63      0.68      7991
weighted avg       0.93      0.94      0.93      7991



In [186]:
# Polynomial features: fit the expansion on the training matrix, then expand
# both the training and the test matrices.
poly.fit(X_train)
X_train, X_test = (poly.transform(matrix) for matrix in (X_train, X_test))

In [188]:
# SVD: fit the decomposition on the training matrix, then project both the
# training and the test matrices onto its components.
svd.fit(X_train)
X_train, X_test = (svd.transform(matrix) for matrix in (X_train, X_test))

In [189]:
# Results: Gaussian naive Bayes trained on the SVD-reduced polynomial
# features, evaluated on the held-out test part.
model = GaussianNB()
model.fit(X_train, y_train)
print(
    classification_report(
        y_test,
        model.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      7431
           1       0.23      0.30      0.26       560

    accuracy                           0.88      7991
   macro avg       0.59      0.61      0.60      7991
weighted avg       0.90      0.88      0.89      7991



### Альтернативная модель, показавшая лучшие результаты

In [190]:
# Start again from the tweet texts: X_train and X_test were overwritten with
# transformed matrices by the previous experiment.
X_train, y_train = train['tweet'], train['label']
X_test, y_test = test['tweet'], test['label']

In [191]:
# TF-IDF for the alternative model: 2- and 3-grams only, English stop words
# removed, with document-frequency bounds min_df=2 and max_df=0.8.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(2, 3),
    min_df=2,
    max_df=0.8,
)

In [192]:
# Fit the vectorizer on the training tweets only, then encode both the
# training and the test tweets with it.
vectorizer.fit(X_train)
X_train, X_test = (vectorizer.transform(texts) for texts in (X_train, X_test))

In [193]:
# Logistic regression (at most 200 solver iterations) trained on the TF-IDF
# n-gram features and evaluated on the held-out test part.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
print(
    classification_report(
        y_test,
        model.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      7431
           1       0.84      0.45      0.59       560

    accuracy                           0.96      7991
   macro avg       0.90      0.72      0.78      7991
weighted avg       0.95      0.96      0.95      7991

