In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from statistics import mean

In [2]:
dataset = pd.read_csv("./datasets/smsspamcollection/SMSSpamCollection.txt", sep = "\t", header = None)
dataset.columns = ["class", "text"]

In [3]:
dataset[:3]

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
dataset["class"] = np.where(dataset["class"] == "spam", 1, 0)
dataset[:3]

Unnamed: 0,class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [5]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset["text"])

In [6]:
model = LogisticRegression()
scores = cross_val_score(model, X, dataset["class"], cv = 10, scoring = "f1_macro", n_jobs = -1)
mean(scores)

0.96143988312323592

In [7]:
fullModel = LogisticRegression()
fullModel.fit(X, dataset["class"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
testTexts = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
             "FreeMsg: Txt: claim your reward of 3 hours talk time",
             "Have you visited the last lecture on physics?",
             "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
             "Only 99$"]
testX = vectorizer.transform(testTexts)

In [9]:
testResults = fullModel.predict(testX)
" ".join(map(str, testResults))

'1 1 0 0 0'

In [10]:
ngramRange = [(2,2), (3,3), (1,3)]

ngramLogRegF1Results = []
for ngram in ngramRange:
    ngramVectorizer = CountVectorizer(ngram_range=ngram)
    ngramX = ngramVectorizer.fit_transform(dataset["text"])
    ngramModel = LogisticRegression()
    ngramScores = cross_val_score(ngramModel, ngramX, dataset["class"], cv = 10, scoring = "f1_macro", n_jobs = -1)
    ngramLogRegF1Results.append(mean(ngramScores))
    
" ".join(map(str, ngramLogRegF1Results))

'0.899819354902 0.846363819103 0.957238064765'

In [11]:
ngramNBF1Results = []
for ngram in ngramRange:
    ngramVectorizer = CountVectorizer(ngram_range=ngram)
    ngramX = ngramVectorizer.fit_transform(dataset["text"])
    ngramModel = MultinomialNB()
    ngramScores = cross_val_score(ngramModel, ngramX, dataset["class"], cv = 10, scoring = "f1_macro", n_jobs = -1)
    ngramNBF1Results.append(mean(ngramScores))

" ".join(map(str, ngramNBF1Results))

'0.777511294683 0.521878369745 0.934612861215'

В целом, логистическая регрессия показывает более стабильный результат, как на одних 2-граммах, 3-граммах, так и на их комбинациях. Наивный Байес, напротив, очень зависит от того, какие N-граммы используются.
    
Также, на данном датасете логичтическая регрессия с выделением признаков через TfidfVectorixer дает меньший результат, чем с выделением через CountVectorizer.

Общий вывод - лучше использовать 1-3-граммы, нежели другие комбинации (кроме 1-грамм), однако стоит помнить о переобучении. Также стоит проверять работу Tfidf на каждом новом датасете.