In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated SMS dataset into a DataFrame.
df = pd.read_csv('smsspamcollection.tsv', delimiter='\t')

We have already analyzed and visualized the data in the other notebook, so here we go straight to the implementation, using only the `message` column as the input feature.

In [3]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Features are the raw message texts; targets are the ham/spam labels.
X, y = df['message'], df['label']

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the messages for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; it is fitted in the next cell.
count_vect = CountVectorizer()

In [7]:
# Learn the vocabulary from the training messages only and turn them into a
# document-term count matrix.
X_train_counts = count_vect.fit_transform(X_train)
# Displayed as (number of training messages, vocabulary size).
X_train_counts.shape

(3733, 7082)

Transform the counts to tf-idf

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
# Converts raw term counts into tf-idf weights.
tfidf_transformer = TfidfTransformer()

# Operates on the count matrix (not on the raw text) produced by count_vect.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Same shape as the count matrix: only the cell values are re-weighted.
X_train_tfidf.shape

(3733, 7082)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Single-step alternative to CountVectorizer followed by TfidfTransformer.
vectorizer = TfidfVectorizer()

# Fit on the original raw-text X_train (not on X_train_counts), because this
# vectorizer does the counting itself. This rebinds X_train_tfidf, replacing
# the matrix computed with TfidfTransformer in the previous cell.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

(3733, 7082)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


In [11]:
# Train a linear SVM directly on the tf-idf matrix built above.
# NOTE: this model only understands vectorized input. Raw messages such as
# X_test must first go through the same fitted vectorizer
# (vectorizer.transform) before being passed to svc_model.predict; the
# pipeline defined in the next cell takes care of that automatically.
svc_model = LinearSVC()
svc_model.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [12]:
# Chain vectorization and classification so raw text can be fed in directly.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both steps on the training split only.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [13]:
# The pipeline vectorizes the raw test messages itself before classifying them.
predictions = text_clf.predict(X_test)

In [14]:
from sklearn import metrics

# Per-class precision, recall and F1 on the held-out messages.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [15]:
# Overall fraction of test messages classified correctly.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.989668297988037


In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier with default hyperparameters.
multinb = MultinomialNB()

In [17]:
# Same tf-idf front end as the SVM pipeline, now with multinomial naive Bayes.
# The estimator instantiated in the previous cell (`multinb`) is reused here
# instead of building a second, throwaway MultinomialNB(); the pipeline fits
# it in place, so `multinb` holds the trained classifier afterwards.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', multinb),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [18]:
# Score the naive Bayes pipeline on the held-out messages.
predictions = text_clf.predict(X_test)

# Per-class report first, then the overall accuracy.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1593
        spam       1.00      0.72      0.83       246

   micro avg       0.96      0.96      0.96      1839
   macro avg       0.98      0.86      0.91      1839
weighted avg       0.96      0.96      0.96      1839

0.9619358346927678


In [19]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on tf-idf features.
# The estimator is created once and handed to the pipeline (previously
# `logreg` was created but never used, and the pipeline built its own copy).
# The pipeline fits its final step in place, so `logreg` is the trained
# classifier after `fit`.
logreg = LogisticRegression()

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', logreg),
])

text_clf.fit(X_train, y_train)

# Evaluate on the held-out messages: per-class report, then overall accuracy.
predictions = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1593
        spam       0.99      0.80      0.88       246

   micro avg       0.97      0.97      0.97      1839
   macro avg       0.98      0.90      0.93      1839
weighted avg       0.97      0.97      0.97      1839

0.9717237629146275




In [20]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes baseline on tf-idf features.
# The estimator is created once and handed to the pipeline (previously
# `bernoulli` was created but never used, and the pipeline built its own copy).
# The pipeline fits its final step in place, so `bernoulli` is the trained
# classifier after `fit`.
bernoulli = BernoulliNB()

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', bernoulli),
])

text_clf.fit(X_train, y_train)

# Evaluate on the held-out messages: per-class report, then overall accuracy.
predictions = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1593
        spam       1.00      0.85      0.92       246

   micro avg       0.98      0.98      0.98      1839
   macro avg       0.99      0.92      0.95      1839
weighted avg       0.98      0.98      0.98      1839

0.9798803697661773


In [21]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline on tf-idf features.
# The tree is seeded with the same value as the train/test split so that the
# reported metrics are reproducible: tree fitting is randomized unless
# random_state is fixed.
# NOTE: the output saved under this cell may predate the fixed seed; re-run
# the cell to refresh it.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', DecisionTreeClassifier(random_state=42)),
])

text_clf.fit(X_train, y_train)

# Evaluate on the held-out messages: per-class report, then overall accuracy.
predictions = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98      1593
        spam       0.88      0.84      0.86       246

   micro avg       0.96      0.96      0.96      1839
   macro avg       0.93      0.91      0.92      1839
weighted avg       0.96      0.96      0.96      1839

0.9635671560630777


In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline on tf-idf features.
# The forest is seeded with the same value as the train/test split so that
# the reported metrics are reproducible: bootstrap sampling and feature
# selection are random unless random_state is fixed.
# NOTE: the output saved under this cell may predate the fixed seed; re-run
# the cell to refresh it.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(random_state=42)),
])

text_clf.fit(X_train, y_train)

# Evaluate on the held-out messages: per-class report, then overall accuracy.
predictions = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))


              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1593
        spam       0.99      0.78      0.88       246

   micro avg       0.97      0.97      0.97      1839
   macro avg       0.98      0.89      0.93      1839
weighted avg       0.97      0.97      0.97      1839

0.9706362153344209


