In [18]:
import pandas as pd
import numpy as np

# Loading, Exploring and Cleaning data

In [4]:
# Read the tab-separated SMS corpus into a DataFrame and preview its first rows.
sms_path='SMSSpamCollection.tsv'
df=pd.read_csv(sms_path,sep='\t')
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Summarize each column: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
# Keep one row per distinct SMS text. Reassigning (instead of inplace=True) avoids
# mutating the frame that earlier cells displayed and keeps the step chainable.
df=df.drop_duplicates(subset='SMS')
# Re-check the summary: count and unique should now agree for the SMS column.
df.describe()

Unnamed: 0,Label,SMS
count,5169,5169
unique,2,5169
top,ham,how tall are you princess?
freq,4516,1


In [125]:
# Count the messages in each class; reset_index turns the Label index back into a column.
per_label=df.groupby('Label').count()
per_label.reset_index()

Unnamed: 0,Label,SMS
0,ham,4516
1,spam,653


# Separate Training and Test Data

In [56]:
# Explicit imports instead of `import *`; cross_val_score is imported here because the
# cross-validation cells further down rely on it being in the namespace.
from sklearn.model_selection import train_test_split, cross_val_score

# Features are the raw message texts; targets are the 'ham'/'spam' labels.
x=df['SMS']
y=df['Label']
# Split with train_test_split's default proportions; the fixed seed makes the split reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,random_state=12)
print('Shape of x: {}'.format(x.shape))
print('Shapes of xtrain and ytrain: {},{}'.format(xtrain.shape,ytrain.shape))
print('Shapes of xtest and ytest: {},{}'.format(xtest.shape,ytest.shape))
# The vectorizer is fitted on a plain list of the training messages.
traincorpus=list(xtrain)

# For calculating precision and recall, encode the training labels as 1 (spam) / 0 (ham).
y1=list(ytrain)
a=[1 if label=='spam' else 0 for label in y1]

Shape of x: (5169,)
Shapes of xtrain and ytrain: (3876,),(3876,)
Shapes of xtest and ytest: (1293,),(1293,)


# Build Featurizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary (capped at 5000 terms) from the training messages only and
# vectorize them in the same pass; fit_transform is equivalent to fit followed by transform.
vectorizer=TfidfVectorizer(max_features=5000)
xvtrain=vectorizer.fit_transform(traincorpus)
print('Shape of xvtrain: {}'.format(xvtrain.shape))

Shape of xvtrain: (3876, 5000)


# Train the model using Logistic Regression

In [None]:
"""Here we use Logistic Regression for modelling. 
The model's linear score is passed through a sigmoid function to obtain a probability, and a threshold (default 0.5) is applied. 
Depending on which side of the threshold the probability falls, the model assigns the sample to the relevant class. """

In [10]:
# Explicit imports, grouped at the top of the cell, instead of `import *`.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit logistic regression on the TF-IDF training features.
model=LogisticRegression(random_state=12)
model.fit(xvtrain,ytrain)

# Score the model on the same data it was trained on (an optimistic estimate).
yptrain=model.predict(xvtrain)
print('Number of correctly classified samples: {}'.format(accuracy_score(ytrain,yptrain,normalize=False)))
print('Fraction of correctly classifed samples: {}'.format(accuracy_score(ytrain,yptrain)))
# Rows are the true labels, columns the predicted labels.
pd.DataFrame(confusion_matrix(ytrain,yptrain),index=('True ham','True spam'),columns=('Predicted ham','Predicted spam'))

Number of correctly classified samples: 3779
Fraction of correctly classifed samples: 0.9749742002063984




Unnamed: 0,Predicted ham,Predicted spam
True ham,3375,2
True spam,95,404


### Cross validation score

In [11]:
# Cross-validate the same configuration as the fitted model (random_state=12) with cv=5.
# NOTE(review): xvtrain was produced by a TF-IDF vectorizer fitted on the whole training
# set, so every fold's held-out part already influenced the vocabulary/IDF weights.
# Wrap the vectorizer and classifier in a Pipeline over traincorpus if leakage-free
# scores are needed.
mcv=LogisticRegression(random_state=12)
cross_val_score(mcv,xvtrain,ytrain,cv=5)



array([0.95747423, 0.95231959, 0.95225806, 0.9483871 , 0.95348837])

### Precision and Recall

In [58]:
from sklearn.metrics import precision_score, recall_score

# Encode the training predictions as 1 (spam) / 0 (ham) so they line up with the
# encoded true training labels in `a`.
y2=list(yptrain)
b=[1 if label=='spam' else 0 for label in y2]
print('The precision and recall scores are {},{}'.format(precision_score(a,b),recall_score(a,b)))

The precision and recall scores are 0.9950738916256158,0.8096192384769539


### Evaluation on test data

In [116]:
# Vectorize the held-out messages with the vocabulary learned on the training set.
testcorpus=list(xtest)
xvtest=vectorizer.transform(testcorpus)
print('Shape of xvtest: {}'.format(xvtest.shape))

# Predict with the logistic-regression model and compare against the true test labels.
yptest=model.predict(xvtest)
test_fraction=accuracy_score(ytest, yptest)
test_count=accuracy_score(ytest, yptest, normalize=False)
print("The fraction of correctly classified samples: {}".format(test_fraction))
print("The number of correctly classified samples: {}".format(test_count))

# Rows are the true labels, columns the predicted labels.
test_confusion=confusion_matrix(ytest,yptest)
pd.DataFrame(test_confusion,index=('True ham','True spam'),columns=('Predicted ham','Predicted spam'))

Shape of xvtest: (1293, 5000)
The fraction of correctly classified samples: 0.9621036349574633
The number of correctly classified samples: 1244


Unnamed: 0,Predicted ham,Predicted spam
True ham,1138,1
True spam,48,106


# Train the model using Support Vector Machine

In [None]:
""" Here we use Support Vector Machine (SVM) for modelling.
The model tries to find a hyperplane that maximizes the distance (the margin) between the plane 
and the nearest points of each class (excluding outliers) in the data. """

In [51]:
# Import only the svm submodule instead of star-importing the whole sklearn package.
from sklearn import svm

# Linear-kernel SVM. gamma is omitted because it is a kernel coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels only and has no effect with kernel='linear'.
model1=svm.SVC(kernel='linear', C=1, random_state=12)
model1.fit(xvtrain,ytrain)

# Score the model on the same data it was trained on (an optimistic estimate).
yptrain1=model1.predict(xvtrain)
print('Number of correctly classified samples: {}'.format(accuracy_score(ytrain,yptrain1,normalize=False)))
print('Fraction of correctly classifed samples: {}'.format(accuracy_score(ytrain,yptrain1)))
# Rows are the true labels, columns the predicted labels.
pd.DataFrame(confusion_matrix(ytrain,yptrain1),index=('True ham','True spam'),columns=('Predicted ham','Predicted spam'))

Number of correctly classified samples: 3860
Fraction of correctly classifed samples: 0.9958720330237358


Unnamed: 0,Predicted ham,Predicted spam
True ham,3376,1
True spam,15,484


### Cross validation score

In [118]:
# Cross-validate (cv=5) a fresh linear SVM on the TF-IDF training features.
# gamma is omitted: it has no effect with kernel='linear', so the scores are unchanged.
mcv=svm.SVC(kernel='linear', C=1, random_state=12)
cross_val_score(mcv,xvtrain,ytrain,cv=5)

array([0.97809278, 0.97551546, 0.98322581, 0.98064516, 0.98449612])

### Precision and Recall

In [59]:
from sklearn.metrics import precision_score, recall_score

# Encode the SVM's training predictions as 1 (spam) / 0 (ham) so they line up with the
# encoded true training labels in `a`.
y2=list(yptrain1)
b=[1 if label=='spam' else 0 for label in y2]
print('The precision and recall scores are {},{}'.format(precision_score(a,b),recall_score(a,b)))

The precision and recall scores are 0.9979381443298969,0.969939879759519


### Evaluation on test data

In [119]:
# Predict the held-out messages with the SVM and compare against the true test labels.
yptest=model1.predict(xvtest)
test_fraction=accuracy_score(ytest, yptest)
test_count=accuracy_score(ytest, yptest, normalize=False)
print("The fraction of correctly classified samples: {}".format(test_fraction))
print("The number of correctly classified samples: {}".format(test_count))

# Rows are the true labels, columns the predicted labels.
test_confusion=confusion_matrix(ytest,yptest)
pd.DataFrame(test_confusion,index=('True ham','True spam'),columns=('Predicted ham','Predicted spam'))

The fraction of correctly classified samples: 0.9775715390564579
The number of correctly classified samples: 1264


Unnamed: 0,Predicted ham,Predicted spam
True ham,1137,2
True spam,27,127


# Train the model using Random Forest

In [None]:
""" Here we use Random Forest modelling. 
The model first selects random samples from the given data set.
Then it builds a decision tree for each sample and gets a prediction from each tree.
Finally, it performs a vote and selects the prediction with the most votes. """

In [53]:
# Explicit import instead of `import *`.
from sklearn.ensemble import RandomForestClassifier

# Random forest of 750 trees; the fixed seed makes the fitted forest reproducible.
model2=RandomForestClassifier(n_estimators=750, random_state=12)
model2.fit(xvtrain,ytrain)

# Score the model on the same data it was trained on (an optimistic estimate).
yptrain2=model2.predict(xvtrain)
print('Number of correctly classified samples: {}'.format(accuracy_score(ytrain,yptrain2,normalize=False)))
print('Fraction of correctly classifed samples: {}'.format(accuracy_score(ytrain,yptrain2)))
# Rows are the true labels, columns the predicted labels.
pd.DataFrame(confusion_matrix(ytrain,yptrain2),index=('True ham','True spam'),columns=('Predicted ham','Predicted spam'))

Number of correctly classified samples: 3876
Fraction of correctly classifed samples: 1.0


Unnamed: 0,Predicted ham,Predicted spam
True ham,3377,0
True spam,0,499


### Cross validation score

In [121]:
# Cross-validate (cv=5) a fresh random forest configured like model2 on the TF-IDF training features.
mcv=RandomForestClassifier(random_state=12, n_estimators=750)
cross_val_score(estimator=mcv,X=xvtrain,y=ytrain,cv=5)

array([0.97938144, 0.97164948, 0.97290323, 0.97032258, 0.9754522 ])

### Precision and Recall

In [60]:
from sklearn.metrics import precision_score, recall_score

# Encode the random forest's training predictions as 1 (spam) / 0 (ham) so they line up
# with the encoded true training labels in `a`.
y2=list(yptrain2)
b=[1 if label=='spam' else 0 for label in y2]
print('The precision and recall scores are {},{}'.format(precision_score(a,b),recall_score(a,b)))

The precision and recall scores are 1.0,1.0


### Evaluation on test data

In [122]:
# Predict the held-out messages with the random forest and compare against the true test labels.
yptest=model2.predict(xvtest)
test_fraction=accuracy_score(ytest, yptest)
test_count=accuracy_score(ytest, yptest, normalize=False)
print("The fraction of correctly classified samples: {}".format(test_fraction))
print("The number of correctly classified samples: {}".format(test_count))

# Rows are the true labels, columns the predicted labels.
test_confusion=confusion_matrix(ytest,yptest)
pd.DataFrame(test_confusion,index=('True ham','True spam'),columns=('Predicted ham','Predicted spam'))

The fraction of correctly classified samples: 0.9682907965970611
The number of correctly classified samples: 1252


Unnamed: 0,Predicted ham,Predicted spam
True ham,1138,1
True spam,40,114
