In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [2]:
def hamConverter(value):
    """Encode a raw label as a binary target: 0 for "ham", 1 for any other value."""
    if value == "ham":
        return 0
    return 1

In [3]:
def get_best_score(vectorizer, ngrams=(1, 1), verbose=True, c_values=None):
    """Grid-search the LogisticRegression ``C`` by 10-fold cross-validated F1.

    Reads the module-level ``data`` frame: ``data.msg`` is vectorized and
    ``data.is_spam`` is used as the target.

    Parameters
    ----------
    vectorizer : class
        Vectorizer class (not an instance); it is called as
        ``vectorizer(ngram_range=ngrams)``.
    ngrams : tuple
        Passed to the vectorizer as ``ngram_range``.
    verbose : bool
        When true, print one line per candidate: ``C``, mean F1, std of F1.
    c_values : iterable or None
        Candidate values of ``C``. ``None`` selects the built-in grid.

    Returns
    -------
    tuple
        ``(best_mean_f1, best_c, X)`` where ``X`` is the feature matrix built
        from ``data.msg``. If no candidate scores above -1 (e.g. an empty
        grid), the first two items stay at -1.

    NOTE(review): the vectorizer is fitted on the whole corpus before
    cross-validation, so every fold sees vocabulary/statistics derived from
    its held-out messages. Wrap vectorizer and classifier in a Pipeline if
    strict fold isolation is required.
    """
    if c_values is None:
        c_values = [0.01, 0.1, 0.5, 1, 5, 10, 100, 200, 500, 1000, 10000, 15000, 20000, 100000]

    # Keep the class passed by the caller and the fitted instance under separate names.
    fitted_vectorizer = vectorizer(ngram_range=ngrams)
    X = fitted_vectorizer.fit_transform(data.msg)

    best_res = -1
    best_c = -1

    for c in c_values:
        model = LogisticRegression(C=c)
        fold_scores = cross_val_score(model, X, data.is_spam, scoring="f1", cv=10)
        mean_f1 = np.mean(fold_scores)

        if verbose:
            # Single-argument print() behaves the same on Python 2 and 3; the
            # layout reproduces the old `print c, '\t', mean, std` statement.
            print("%s \t%s %s" % (c, mean_f1, np.std(fold_scores)))

        # Strict comparison: on ties the earliest (smallest) C is kept.
        if mean_f1 > best_res:
            best_c = c
            best_res = mean_f1

    return best_res, best_c, X

In [4]:
# Load the tab-separated corpus; the file has no header row, so column names
# are supplied here and the label column is converted to 0/1 while parsing.
# NOTE(review): with default quoting, a message containing an unbalanced
# double quote could swallow following rows -- verify the row count and
# consider quoting=csv.QUOTE_NONE if it is lower than expected.
data = pd.read_csv(
    "SMSSpamCollection.txt",
    sep='\t',
    header=None,
    names=["is_spam", "msg"],
    converters={"is_spam": hamConverter},
)

In [5]:
# Preview the first five rows to check the label encoding and the message text.
data.head(n=5)

Unnamed: 0,is_spam,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Unigram bag-of-words baseline: best mean F1, the C that achieved it, and the feature matrix.
ans1, c1, X = get_best_score(vectorizer=CountVectorizer)

0.01 	0.783702308103 0.0506152668394
0.1 	0.904201830786 0.022523808587
0.5 	0.92667510292 0.0202082258722
1 	0.932640298361 0.0195638214575
5 	0.939999527925 0.0176300090449
10 	0.938478680146 0.018614781767
100 	0.939771313827 0.0184975476903
200 	0.939090285288 0.0187862153269
500 	0.939771313827 0.0184975476903
1000 	0.940431217304 0.0185068633068
10000 	0.939763925051 0.0190714694705
15000 	0.940462582893 0.0201185268017
20000 	0.940462582893 0.0201185268017
100000 	0.943449680811 0.0197759361751


In [7]:
# Single-argument print() is valid on both Python 2 and Python 3.
print("Answer 1: ")
print(ans1)

Answer 1: 
0.943449680811


In [8]:
# Messages to classify with the unigram model.
samples = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

# Fit the vectorizer first and train the classifier on the matrix it produced,
# so the columns used for training and for the new samples are guaranteed to
# come from the same vocabulary (rather than relying on an `X` left over from
# an earlier cell).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.msg)
cls = LogisticRegression(C=c1).fit(X, data.is_spam)

sample = vectorizer.transform(samples)
res = cls.predict(sample)

print("Answer 2: ")
print(" ".join([str(i) for i in res]))
# To inspect class probabilities instead of hard labels:
# print(cls.predict_proba(sample))

Answer 2: 
1 1 0 0 0


In [9]:
# Best (mean F1, C, X) for bigrams only, trigrams only, and 1- to 3-grams,
# with the per-C progress output switched off.
scores1 = [
    get_best_score(CountVectorizer, ngram_setting, False)
    for ngram_setting in [(2, 2), (3, 3), (1, 3)]
]

  'precision', 'predicted', average, warn_for)


In [10]:
# Each entry of scores1 is (best mean F1, best C, X); report the F1 to two decimals.
# Indexing avoids binding `_`, which IPython uses for the last cell output.
print("Answer 3: ")
print(" ".join(["{0:0.2f}".format(result[0]) for result in scores1]))

Answer 3: 
0.88 0.81 0.93


In [11]:
scores2 = []
# cross_val_score fits its own copies of the estimator on each training fold,
# so the estimator is passed unfitted; fitting it beforehand on an `X` left
# over from an earlier cell had no effect on the scores.
cls = MultinomialNB()

# NOTE(review): this uses scoring="f1_macro" while get_best_score uses
# scoring="f1"; confirm the two experiments are meant to use different metrics.
for ngram_setting in [(2, 2), (3, 3), (1, 3)]:
    vectorizer = CountVectorizer(ngram_range=ngram_setting)
    X = vectorizer.fit_transform(data.msg)

    res = cross_val_score(cls, X, data.is_spam, scoring="f1_macro", cv=10)

    scores2.append(np.mean(res))

In [12]:
# Mean cross-validated score per n-gram setting, two decimals each.
# Single-argument print() is valid on both Python 2 and Python 3.
print("Answer 4: ")
print(" ".join(["{0:0.2f}".format(score) for score in scores2]))

Answer 4: 
0.78 0.52 0.93


In [13]:
# Same grid search on unigram TF-IDF features; only the best mean F1 is kept.
ans2, _, _ = get_best_score(vectorizer=TfidfVectorizer)

0.01 	0.0 0.0
0.1 	0.03930371383 0.0129202999727
0.5 	0.761097632769 0.0306869502043
1 	0.852859955417 0.0238364215221
5 	0.917434461744 0.018158063199
10 	0.922353655088 0.0157037826859
100 	0.931374911017 0.0185229390593
200 	0.932865150114 0.0188244642508
500 	0.934365972683 0.0176972537845
1000 	0.934365972683 0.0176972537845
10000 	0.938155468008 0.0167418311994
15000 	0.938155468008 0.0167418311994
20000 	0.937495017347 0.0163457532345
100000 	0.937495017347 0.0163457532345


In [14]:
# Single-argument print() is valid on both Python 2 and Python 3.
print("Answer 5: ")
print(ans2)

Answer 5: 
0.938155468008
