In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.pipeline import Pipeline

### Разбивка данных на трейн и тест

In [35]:
data = pd.read_csv('Spam_SMS.csv')

In [36]:
data

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [37]:
data['target'] = LabelEncoder().fit_transform(data.Class)

In [38]:
X = data['Message']
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

### Обучение модели
Для того, чтобы обучить базовые модели на текстовых данных можно воспользоваться:
1. TF-IDF
2. Bag-of-words
3. Нейросетевые алгоритмы (w2v) + усреднение

Для лучшей работы:
1. Добавляем токенизацию (обрабатываем посимвольно или по словам)
2. Удаление лишних символов
3. Приведение к нормальной форме

Поверх этого накидываем классические алгоритмы (Бустинг, логрег и т.д,)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X_train_transformed = vectorizer.transform(X_train).toarray()
X_test_transformed = vectorizer.transform(X_test).toarray()

In [41]:
X_train_transformed.shape

(4180, 8713)

In [42]:
X_test_transformed.shape

(1394, 8713)

In [43]:
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
preds = model.predict_proba(X_test_transformed)[:, 1]

In [44]:
roc_auc_score(y_test, preds)

0.9886126742480622

In [45]:
f1_score(y_test, (preds>=0.5).astype(int))

0.8389057750759878

In [46]:
print(classification_report(y_test, (preds>=0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1203
           1       1.00      0.72      0.84       191

    accuracy                           0.96      1394
   macro avg       0.98      0.86      0.91      1394
weighted avg       0.96      0.96      0.96      1394



### Поиграем с токенизацией

In [52]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1, 3))),
    ('model',  LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds>=0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1203
           1       1.00      0.66      0.79       191

    accuracy                           0.95      1394
   macro avg       0.97      0.83      0.88      1394
weighted avg       0.96      0.95      0.95      1394



In [53]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1, 1))),
    ('model',  LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds>=0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1203
           1       0.99      0.80      0.88       191

    accuracy                           0.97      1394
   macro avg       0.98      0.90      0.93      1394
weighted avg       0.97      0.97      0.97      1394



In [54]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(1, 4))),
    ('model',  LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds>=0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1203
           1       1.00      0.86      0.93       191

    accuracy                           0.98      1394
   macro avg       0.99      0.93      0.96      1394
weighted avg       0.98      0.98      0.98      1394



### Попробуем удалить избыточную информацию

In [58]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(1, 4), max_features=500)),
    ('model',  LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds>=0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1203
           1       0.99      0.87      0.93       191

    accuracy                           0.98      1394
   macro avg       0.98      0.94      0.96      1394
weighted avg       0.98      0.98      0.98      1394



### Попробуем воспользоваться BOW

In [59]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(1, 4), max_features=500, use_idf=False)),
    ('model',  LogisticRegression())
    ]
)
pipe.fit(X_train, y_train)
preds = pipe.predict_proba(X_test)[:, 1]
print(classification_report(y_test, (preds>=0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1203
           1       1.00      0.77      0.87       191

    accuracy                           0.97      1394
   macro avg       0.98      0.88      0.93      1394
weighted avg       0.97      0.97      0.97      1394



In [61]:
### Предобработка

In [73]:
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /Users/slavyan/nltk_data...


In [74]:
X_train

4044    I am literally in bed and have been up for lik...
2586         I will be outside office take all from there
5159                              K k:) sms chat with me.
585     So how's scotland. Hope you are not over showi...
4574    "URGENT! This is the 2nd attempt to contact U!...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191    ree entry in 2 a weekly comp for a chance to w...
5226    "OH FUCK. JUSWOKE UP IN A BED ON A BOATIN THE ...
5390             NOT MUCH NO FIGHTS. IT WAS A GOOD NITE!!
860               Did he just say somebody is named tampa
Name: Message, Length: 4180, dtype: object

In [None]:
X_train_prep = X_train.apply(lambda x: ' '.join([WordNetLemmatizer().lemmatize(w) for w in x.split()]))
X_test_prep = X_test.apply(lambda x: ' '.join([WordNetLemmatizer().lemmatize(w) for w in x.split()]))

4044    I am literally in bed and have been up for lik...
2586         I will be outside office take all from there
5159                               K k:) sm chat with me.
585     So how's scotland. Hope you are not over showi...
4574    "URGENT! This is the 2nd attempt to contact U!...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191    ree entry in 2 a weekly comp for a chance to w...
5226    "OH FUCK. JUSWOKE UP IN A BED ON A BOATIN THE ...
5390             NOT MUCH NO FIGHTS. IT WAS A GOOD NITE!!
860               Did he just say somebody is named tampa
Name: Message, Length: 4180, dtype: object