In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

### Разбивка данных на трейн и тест

In [2]:
data = pd.read_csv("Spam_SMS.csv")

In [3]:
data

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [5]:
data["target"] = LabelEncoder().fit_transform(data.Class)

In [7]:
X = data["Message"]
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

### Обучение модели
Для того, чтобы обучить базовые модели на текстовых данных можно воспользоваться:
1. TF-IDF
2. Bag-of-words
3. Нейросетевые алгоритмы (w2v) + усреднение

Для лучшей работы:
1. Добавляем токенизацию (обрабатываем посимвольно или по словам)
2. Удаление лишних символов
3. Приведение к нормальной форме

Поверх этого накидываем классические алгоритмы (Бустинг, логрег и т.д,)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [17]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X_train_transformed = vectorizer.transform(X_train).toarray()
X_test_transformed = vectorizer.transform(X_test).toarray()

In [20]:
X_train_transformed.shape

(4180, 8713)

In [21]:
X_test_transformed.shape

(1394, 8713)

In [22]:
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
preds = model.predict_proba(X_test_transformed)[:, 1]

In [23]:
roc_auc_score(y_test, preds)

0.9886126742480622

In [24]:
f1_score(y_test, (preds >= 0.5).astype(int))

0.8389057750759878

In [25]:
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1203
           1       1.00      0.72      0.84       191

    accuracy                           0.96      1394
   macro avg       0.98      0.86      0.91      1394
weighted avg       0.96      0.96      0.96      1394



### Поиграем с токенизацией

In [28]:
pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer="word", ngram_range=(1, 3))),
    ("model", LogisticRegression())
])
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1203
           1       1.00      0.66      0.79       191

    accuracy                           0.95      1394
   macro avg       0.97      0.83      0.88      1394
weighted avg       0.96      0.95      0.95      1394



In [29]:
pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer="word", ngram_range=(1, 1))),
    ("model", LogisticRegression())
])
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1203
           1       0.99      0.80      0.88       191

    accuracy                           0.97      1394
   macro avg       0.98      0.90      0.93      1394
weighted avg       0.97      0.97      0.97      1394



In [31]:
pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer="char", ngram_range=(1, 4))),
    ("model", LogisticRegression())
])
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1203
           1       1.00      0.86      0.93       191

    accuracy                           0.98      1394
   macro avg       0.99      0.93      0.96      1394
weighted avg       0.98      0.98      0.98      1394



### Попробуем удалить избыточную информацию

In [33]:
pipe = Pipeline(
    [
        ("vectorizer", TfidfVectorizer(analyzer="char", ngram_range=(1, 4), max_features=500)),
        ("model", LogisticRegression()),
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1203
           1       0.99      0.87      0.93       191

    accuracy                           0.98      1394
   macro avg       0.98      0.94      0.96      1394
weighted avg       0.98      0.98      0.98      1394



### Попробуем воспользоваться BOW

In [41]:
pipe = Pipeline(
    [
        ("vectorizer", TfidfVectorizer(analyzer="char", ngram_range=(1, 4), max_features=500, use_idf=False)),
        ("model", LogisticRegression()),
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1203
           1       1.00      0.77      0.87       191

    accuracy                           0.97      1394
   macro avg       0.98      0.88      0.93      1394
weighted avg       0.97      0.97      0.97      1394



In [42]:
pipe = Pipeline([
    ("vectorizer", CountVectorizer(analyzer="char", ngram_range=(1, 4), max_features=500)),
    ("model", LogisticRegression())
])
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1203
           1       0.97      0.94      0.95       191

    accuracy                           0.99      1394
   macro avg       0.98      0.97      0.97      1394
weighted avg       0.99      0.99      0.99      1394



In [None]:
### Предобработка

In [91]:
import nltk

nltk.download("wordnet")

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/vvh413/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [105]:
from nltk.corpus import stopwords
len(stopwords.words())

10885

In [96]:
X_train_prep = X_train.apply(lambda x: " ".join([WordNetLemmatizer().lemmatize(w) for w in x.lower().split()]))
X_test_prep = X_test.apply(lambda x: " ".join([WordNetLemmatizer().lemmatize(w) for w in x.lower().split()]))

In [98]:
pipe = Pipeline([
    ("vectorizer", TfidfVectorizer(analyzer="word", ngram_range=(1, 1))),
    ("model", LogisticRegression())
])
pipe.fit(X_train_prep, y_train)
preds = pipe.predict_proba(X_test_prep)[:, 1]
print(classification_report(y_test, (preds >= 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1203
           1       0.99      0.80      0.88       191

    accuracy                           0.97      1394
   macro avg       0.98      0.90      0.93      1394
weighted avg       0.97      0.97      0.97      1394



In [44]:
import gensim.downloader as api
from gensim.models import FastText, Word2Vec

In [45]:
wv = api.load("word2vec-google-news-300")

In [46]:
wv.most_similar("cat")

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]

In [47]:
def cossim(v1, v2):
    return v1.dot(v2) / (np.sqrt(np.sum(v1**2)) * np.sqrt(np.sum(v2**2)))

In [49]:
import numpy as np
cossim(wv["king"], wv["man"]), cossim(wv["queen"], wv["woman"]), cossim(wv["king"], wv["queen"])

(0.22942673, 0.3161814, 0.65109557)

In [50]:
cossim(wv["king"] - wv["man"] + wv["woman"], wv["queen"])

0.73005176

In [None]:
np.mean([
    wv.get_vector(w, norm=True) for w in X_train.iloc[0].lower().split() if w in wv.key_to_index], 
    axis=0
) == wv.get_mean_vector(X_train.iloc[0].lower().split())

In [85]:
X_train_vectorized = np.array(X_train.apply(lambda row: wv.get_mean_vector(row.lower().split())).tolist())
X_test_vectorized = np.array(X_test.apply(lambda row: wv.get_mean_vector(row.lower().split())).tolist())

In [87]:
X_train_vectorized.shape, X_test_vectorized.shape

((4180, 300), (1394, 300))

In [89]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
preds = model.predict(X_test_vectorized)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96      1203
           1       0.88      0.48      0.62       191

    accuracy                           0.92      1394
   macro avg       0.90      0.74      0.79      1394
weighted avg       0.92      0.92      0.91      1394

