In [5]:
import numpy as np
import pandas as pd


In [6]:
# Load the tab-separated SMS dataset from the working directory.
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [7]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Features are the raw message text; the target is the ham/spam label.
X, y = df['message'], df['label']

In [10]:
# Hold out 33% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

Scikit-learn's CountVectorizer
Text preprocessing, tokenizing and the ability to filter out stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Default-configured vectorizer: builds a vocabulary and turns each document
# into a vector of token counts.
count_vect = CountVectorizer()

In [12]:
# Learn the vocabulary from the training messages and build their
# document-term count matrix in a single pass.
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

In [13]:
# (rows, columns) of the count matrix: one row per training document,
# one column per vocabulary term.
X_train_counts.shape

(3733, 7082)

In [14]:
# The training set has 3,733 documents and 7,082 features (vocabulary terms).

In [15]:
# Transform the raw counts into tf-idf weighted frequencies

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
# Default-configured transformer that re-weights raw term counts as tf-idf values.
tfidf_transformer = TfidfTransformer()

In [21]:
# Fit the idf weights on the training counts and convert those counts to
# tf-idf values in one step.
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

In [19]:
# Confirm the tf-idf matrix keeps the dimensions of the count matrix.
X_train_tfidf.shape

(3733, 7082)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer goes from raw text to tf-idf features in one step; per the
# scikit-learn docs it is equivalent to CountVectorizer followed by
# TfidfTransformer.
vectorizer = TfidfVectorizer()

In [23]:
# Vectorize the raw training messages directly. This rebinds X_train_tfidf,
# which previously held the TfidfTransformer output.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

# Display the resulting (documents, features) dimensions.
X_train_tfidf.shape

(3733, 7082)

# Fit Linear SVC
#LinearSVC handles sparse input better, and scales well to large numbers of samples

In [24]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier on the tf-idf features.
clf = LinearSVC()

# Keep fit() as the final expression so the cell displays the fitted estimator.
clf.fit(X=X_train_tfidf, y=y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

# Build a Pipeline

In [26]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed straight
# to fit() and predict().
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Feed the training data through the pipeline; as the final expression this
# also displays the fitted pipeline.
text_clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

# Test the classifier and display results

In [27]:
# Classify the held-out messages; the pipeline vectorizes the raw text itself.
predictions = text_clf.predict(X=X_test)

In [28]:
from sklearn import metrics

# Report the confusion matrix: rows are the true labels, columns the
# predicted labels.
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
)

[[1586    7]
 [  12  234]]


In [29]:
# Per-class precision, recall, F1 and support for the test predictions.
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions)
)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [30]:
# Overall accuracy: the fraction of test messages classified correctly.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=predictions)
)

0.989668297988037
