In [1]:
%pylab

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [97]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

In [4]:
%matplotlib inline

# Загрузка данных

In [5]:
# Read the whole data file and split it into a list of lines.
# NOTE(review): no encoding is passed to open(), so the platform default is used.
# The 'ј10' in the preprocessed sample shown further down looks like a mis-decoded
# '£10' - confirm the file's real encoding and pass encoding=... explicitly.
with open('english_big.txt', 'r') as f:
    raw_data = f.read().splitlines()

In [6]:
# Number of lines read from the data file.
print(len(raw_data))

1324


In [7]:
# Textual class label -> numeric target (spam is the positive class, 1).
labels = dict(spam=1, ham=0)

In [8]:
# Split every raw line into the message text and its numeric target.
sms_messages = []
sms_targets = []
for line_no, sms in enumerate(raw_data, start=1):
    # The label is the last comma-separated field; the message itself may
    # contain commas, so split only once, from the right.
    message, _sep, label = sms.rpartition(',')
    if label not in labels:
        # Fail loudly here instead of silently storing None as a target,
        # which would only surface later as an obscure error.
        raise ValueError('Line {}: unknown label {!r}'.format(line_no, label))
    sms_messages.append(message)
    sms_targets.append(labels[label])

* sms_messages - сырые данные текстов сообщений, требуют дальнейшей обработки
* sms_targets - метки спам или не спам, дальнейшей обработки не требуют

In [9]:
# Share of spam messages in the whole data set (targets are 1 for spam, 0 for ham).
# NOTE(review): after %pylab, `sum` presumably resolves to numpy's sum rather than
# the builtin - confirm if the exact return type matters.
sum(sms_targets) / len(sms_targets)

0.243202416918429

# Препроцессинг

## Удаление стоп-слов и другая очистка + стемминг

In [10]:
# Tokens are runs of word characters, so digits are kept as tokens too.
# A letters-only alternative would be RegexpTokenizer(r'[a-z]+').
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer('english')
# Stored as a set: the stop-word membership test runs once per token, and a
# set lookup is constant-time whereas scanning a list is linear.
sw = set(stopwords.words('english'))

def nltk_preprocess(sentence):
    """Normalize one message into a space-separated string of stems.

    The text is lower-cased and tokenized with the module-level ``tokenizer``.
    Tokens found in ``sw`` or shorter than three characters are dropped; the
    remaining tokens are stemmed with ``stemmer`` and joined with single spaces.
    """
    stems = []
    for token in tokenizer.tokenize(sentence.lower()):
        # Skip stop words and very short tokens before stemming.
        if token in sw or len(token) < 3:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [11]:
# Preprocess every message.
# NOTE: this overwrites sms_messages with its own preprocessed version, so
# re-running the cell applies the preprocessing a second time.
sms_messages = list(map(nltk_preprocess, sms_messages))

In [12]:
# Inspect the first message after preprocessing.
sms_messages[0]

'urgent call 09061749602 landlin complimentari tenerif holiday ј10 000 cash await collect sae box 528 hp20 1yf 150ppm'

# Векторизация

In [13]:
# Turn the preprocessed messages into a document-term count matrix.
# NOTE(review): the vocabulary is learned from all messages, including those
# that later end up in the test split - consider fitting the vectorizer on the
# training data only (e.g. as a Pipeline step).
vectorizer = CountVectorizer()
sms_vectorized = vectorizer.fit_transform(sms_messages)
sms_vectorized.shape

(1324, 2906)

# Классификация

In [15]:
# Hold out 20% of the samples for the final evaluation; the fixed seed makes
# the split reproducible. The count matrix is converted with .toarray() first
# (presumably because GaussianNB in the grid search needs dense input - confirm).
X_train, X_test, y_train, y_test = train_test_split(
    sms_vectorized.toarray(),
    sms_targets,
    test_size=0.2,
    random_state=42,
)

## Поиск наилучших параметров, используя кросс-валидацию

In [92]:
# One-step pipeline: the 'clf' slot is what the grid search swaps out.
pipe = Pipeline(steps=[('clf', BernoulliNB())])

# Two sub-grids: GaussianNB is tried as-is, while BernoulliNB and
# MultinomialNB are each tried with every smoothing value alpha.
params = [
    dict(clf=[GaussianNB()]),
    dict(
        clf=[BernoulliNB(), MultinomialNB()],
        clf__alpha=[0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.9, 1.0],
    ),
]

# 10-fold cross-validation on the training split, ranked by accuracy.
grid_cv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=10,
    return_train_score=False,
)
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'clf': [GaussianNB(priors=None)]}, {'clf': [BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True), MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)], 'clf__alpha': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.9, 1.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

## Алгоритм с наилучшим показателем точности при кросс-валидации

In [96]:
# Report the winning classifier (the pipeline's 'clf' step) and its
# cross-validated accuracy.
print('Алгоритм: {}'.format(grid_cv.best_estimator_.named_steps['clf']))
print('Accuracy при кросс-валидации: {}'.format(grid_cv.best_score_))

Алгоритм: BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
Accuracy при кросс-валидации: 0.9933899905571294


## Точность работы алгоритма на тестовой выборке

### Accuracy

In [90]:
# Score of the refit best pipeline on the held-out test split.
grid_cv.best_estimator_.score(X_test, y_test)

0.99622641509433962

### Precision

In [101]:
# Precision on the held-out test split; grid_cv.predict delegates to the refit
# best estimator.
precision_score(y_true=y_test, y_pred=grid_cv.predict(X_test))

0.9850746268656716

### Recall

In [102]:
# Recall on the held-out test split; grid_cv.predict delegates to the refit
# best estimator.
recall_score(y_true=y_test, y_pred=grid_cv.predict(X_test))

1.0

### F1-score

In [91]:
# F1 score on the held-out test split; grid_cv.predict delegates to the refit
# best estimator.
f1_score(y_true=y_test, y_pred=grid_cv.predict(X_test))

0.99248120300751874