In [66]:
%pylab

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [206]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [94]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

In [3]:
%matplotlib inline

# Загрузка данных

In [11]:
with open('english_big.txt', 'r') as f:
    raw_data = f.read().splitlines()

In [29]:
print(len(raw_data))

1324


In [14]:
labels = {
    'spam': 1,
    'ham': 0
}

In [297]:
sms_messages = []
sms_targets = []
for sms in raw_data:
    splitted = sms.split(',')
    sms_messages.append(','.join(splitted[:-1]))
    sms_targets.append(labels.get(splitted[-1]))

* sms_messages - сырые данные текстов сообщений, требуют дальнейшей обработки
* sms_targets - метки спам или не спам, дельнейшей обработки не требуют

In [212]:
# Доля спам-сообщений об общего числа
sum(sms_targets) / len(sms_targets)

0.243202416918429

# Препроцессинг

## Удаление стоп-слов и другая очистка + стэмминг

In [298]:
#tokenizer = RegexpTokenizer(r'[a-z]+')
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer('english')
sw = stopwords.words('english')

def nltk_preprocess(sentence):
    sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [stemmer.stem(word) for word in tokens if word not in sw and len(word) >= 3]
    return ' '.join(filtered_words)

In [299]:
sms_messages = [nltk_preprocess(sms) for sms in sms_messages]

In [300]:
sms_messages[0]

'urgent call 09061749602 landlin complimentari tenerif holiday ј10 000 cash await collect sae box 528 hp20 1yf 150ppm'

# Векторизация

In [308]:
vectorizer = CountVectorizer()
sms_vectorized = vectorizer.fit_transform(sms_messages)
sms_vectorized.shape

(1324, 2906)

In [301]:
vectorizer = TfidfVectorizer()
sms_vectorized = vectorizer.fit_transform(sms_messages)
sms_vectorized.shape

(1324, 2906)

# Классификация

## Поиск наилучших параметров

In [315]:
X_train, X_test, y_train, y_test = train_test_split(sms_vectorized.toarray(), sms_targets, test_size=0.2, random_state=42)

In [316]:
pipe = Pipeline([
    ('clf', None)
])

params = [
    {
        'clf': [GaussianNB()],
    },
    {
        'clf': [BernoulliNB(), MultinomialNB()],
        'clf__alpha': [0.1, 0.5, 1.0]
    }
]

grid_cv = GridSearchCV(pipe, params, scoring='accuracy', cv=10, return_train_score=False)
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None, steps=[('clf', None)]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'clf': [GaussianNB(priors=None)]}, {'clf': [BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True), MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)], 'clf__alpha': [0.1, 0.5, 1.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [317]:
print(grid_cv.best_estimator_)
print(grid_cv.best_params_)
print(grid_cv.best_score_)

Pipeline(memory=None,
     steps=[('clf', BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True))])
{'clf': BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True), 'clf__alpha': 0.1}
0.991501416431


In [318]:
grid_cv.cv_results_['params']

[{'clf': GaussianNB(priors=None)},
 {'clf': BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True),
  'clf__alpha': 0.1},
 {'clf': BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True),
  'clf__alpha': 0.5},
 {'clf': BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True),
  'clf__alpha': 1.0},
 {'clf': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'clf__alpha': 0.1},
 {'clf': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'clf__alpha': 0.5},
 {'clf': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'clf__alpha': 1.0}]

In [319]:
grid_cv.best_estimator_.score(X_test, y_test)

0.99245283018867925

In [320]:
sum(y_test) / len(y_test)

0.24905660377358491