In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_table("SMSSpamCollection", header = None)
print("Sample Size : ",df.shape[0] )
df.head()

Sample Size :  5572


Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
X = df[1].values
y = df[0].values

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=0.25,random_state=42)

In [5]:
print("training set size : ",X_train.shape[0])
print("testing set size : ",X_test.shape[0])

training set size :  4179
testing set size :  1393


In [6]:
cv = CountVectorizer()
X_train_vec = cv.fit_transform(X_train)

In [7]:
le = LabelEncoder()
y_train_labels = le.fit_transform(y_train)

In [8]:
clf_lr = LogisticRegression()
clf_lr.fit(X_train_vec,y_train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train_vec,y_train_labels)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [10]:
y_pred_labels_lr = clf_lr.predict(cv.transform(X_test))

In [11]:
y_pred_labels_dt = clf_dt.predict(cv.transform(X_test))

In [12]:
y_test_labels = le.transform(y_test)

In [13]:
print(classification_report(y_test_labels,y_pred_labels_lr))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1206
           1       1.00      0.84      0.91       187

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [14]:
print(confusion_matrix(y_test_labels,y_pred_labels_lr))

[[1206    0]
 [  30  157]]


In [15]:
print(classification_report(y_test_labels,y_pred_labels_dt))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1206
           1       0.92      0.81      0.86       187

    accuracy                           0.97      1393
   macro avg       0.95      0.90      0.92      1393
weighted avg       0.96      0.97      0.96      1393



In [16]:
print(confusion_matrix(y_test_labels,y_pred_labels_dt))

[[1193   13]
 [  35  152]]
