### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report



from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re



**Context**

The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam.

**Content**

The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text.


In [2]:
data = pd.read_csv('C:\\Users\\dell\\Desktop\\spam.csv', encoding = 'latin-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [5]:
data.rename(columns = {"v1": "label", "v2":"Message"}, inplace = True) 

In [6]:
data.head()

Unnamed: 0,label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Handling Categorical Data

In [7]:
data = pd.get_dummies(data, columns=['label'])

In [8]:
data.head()

Unnamed: 0,Message,label_ham,label_spam
0,"Go until jurong point, crazy.. Available only ...",1,0
1,Ok lar... Joking wif u oni...,1,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1
3,U dun say so early hor... U c already then say...,1,0
4,"Nah I don't think he goes to usf, he lives aro...",1,0


In [9]:
# Total ham(1) and spam(0) messages
data['label_ham'].value_counts()

1    4825
0     747
Name: label_ham, dtype: int64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
Message       5572 non-null object
label_ham     5572 non-null uint8
label_spam    5572 non-null uint8
dtypes: object(1), uint8(2)
memory usage: 54.5+ KB


In [11]:
data['Count']=0
for i in np.arange(0,len(data.Message)):
    data.loc[i,'Count'] = len(data.loc[i,'Message'])

In [12]:
data.head()

Unnamed: 0,Message,label_ham,label_spam,Count
0,"Go until jurong point, crazy.. Available only ...",1,0,111
1,Ok lar... Joking wif u oni...,1,0,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,0,1,155
3,U dun say so early hor... U c already then say...,1,0,49
4,"Nah I don't think he goes to usf, he lives aro...",1,0,61


In [13]:
data.describe()

Unnamed: 0,label_ham,label_spam,Count
count,5572.0,5572.0,5572.0
mean,0.865937,0.134063,80.118808
std,0.340751,0.340751,59.690841
min,0.0,0.0,2.0
25%,1.0,0.0,36.0
50%,1.0,0.0,61.0
75%,1.0,0.0,121.0
max,1.0,1.0,910.0


# <font color = "green">**Processing Message** </font>

In [14]:
# Let's look at the Original Messages
print ("MESSAGE 1 : \n", data['Message'][0], "\n\n")
print ("MESSAGE 2 : \n", data['Message'][1], "\n\n")

MESSAGE 1 : 
 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... 


MESSAGE 2 : 
 Ok lar... Joking wif u oni... 




### Preparing WordVector Corpus

In [15]:
corpus = []

### Using Porter Stemmer

In [16]:
ps = PorterStemmer()

In [17]:
for i in range(0, 5572):

    # Applying Regular Expression
    
    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    msg = data['Message'][i]
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', data['Message'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', data['Message'][i])
    msg = re.sub('£|\$', 'moneysymb', data['Message'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', data['Message'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', data['Message'][i])
    
    ''' Remove all punctuations '''
    msg = re.sub('[^\w\d\s]', ' ', data['Message'][i])
    
    if i<2:
        print("\t\t\t\t MESSAGE ", i)
    
    if i<2:
        print("\n After Regular Expression - Message ", i, " : ", msg)
    
    # Each word to lower case
    msg = msg.lower()    
    if i<2:
        print("\n Lower case Message ", i, " : ", msg)
    
    # Splitting words to Tokenize
    msg = msg.split()    
    if i<2:
        print("\n After Splitting - Message ", i, " : ", msg)
    
    # Stemming with PorterStemmer handling Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)
    
    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")
    
    # Preparing WordVector Corpus
    corpus.append(msg)

				 MESSAGE  0

 After Regular Expression - Message  0  :  Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

 Lower case Message  0  :  go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   

 After Splitting - Message  0  :  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']

 After Stemming - Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared - Message  0  :  go jurong point crazi avail bugi n great world la e buffet cine got amor wat 


				 MESSAGE  1

 After Regular Expression - Message  1  :  Ok lar    Joking wif u oni   

 Lower case Message  1  :  ok lar    joking wif u oni   

 After Splitting - Message  1  :  ['ok', 'lar', 'joking', 'wif', '

# <font color = "green">**Preparing Vectors for each message** </font>

In [18]:
cv = CountVectorizer()
data_input = cv.fit_transform(corpus).toarray()

# <font color = "green">**Applying Classification** </font>

- **Input = Prepared Sparse Matrix / Vectors for each message**

- **Output = Label i.e. Spam or Ham**

In [19]:
data_output = data['label_ham']
print (data_output.value_counts())

print(data_output[0])
print(data_output[1])

1    4825
0     747
Name: label_ham, dtype: int64
1
1


#### Splitting data for Training and Testing

In [20]:
train_x, test_x, train_y, test_y = train_test_split(data_input, data_output,test_size= 0.20, random_state = 0)

## <font color = "green">Preparing ML Models</font>

### Training

In [21]:

model_nvb = GaussianNB()
model_nvb.fit(train_x, train_y)

model_rf = RandomForestClassifier(n_estimators=1000, random_state=0)
model_rf.fit(train_x, train_y)

model_dt = tree.DecisionTreeClassifier()
model_dt.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Prediction

In [22]:
prediction_nvb = model_nvb.predict(test_x)
prediction_rf = model_rf.predict(test_x)
prediction_dt = model_dt.predict(test_x)

### Results Naive Bayes

In [23]:
print ("Accuracy for Naive Bayes : %0.5f \n\n" % accuracy_score(test_y, prediction_nvb))
print ("Classification Report Naive bayes: \n", classification_report(test_y, prediction_nvb))

Accuracy for Naive Bayes : 0.87085 


Classification Report Naive bayes: 
              precision    recall  f1-score   support

          0       0.54      0.89      0.67       166
          1       0.98      0.87      0.92       949

avg / total       0.91      0.87      0.88      1115



### Results Decision Tree

In [25]:
print ("Accuracy for Decision Tree: %0.5f \n\n" % accuracy_score(test_y, prediction_dt))
print ("Classification Report Decision Tree: \n", classification_report(test_y, prediction_dt))

Accuracy for Decision Tree: 0.96771 


Classification Report Decision Tree: 
              precision    recall  f1-score   support

          0       0.95      0.83      0.88       166
          1       0.97      0.99      0.98       949

avg / total       0.97      0.97      0.97      1115



### Results Random Forest

In [24]:
print ("Accuracy for Random Forest: %0.5f \n\n" % accuracy_score(test_y, prediction_rf))
print ("Classification Report Random Forest: \n", classification_report(test_y, prediction_rf))

Accuracy for Random Forest: 0.97309 


Classification Report Random Forest: 
              precision    recall  f1-score   support

          0       1.00      0.82      0.90       166
          1       0.97      1.00      0.98       949

avg / total       0.97      0.97      0.97      1115

