In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv("sms1.tsv", delimiter='\t', header=None)

In [3]:
dataset[1]

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [4]:
import re
import nltk
from nltk.stem.porter import PorterStemmer

In [5]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/blooser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
corpus = []
columns = dataset[0].count()
for i in range(0, columns):
    msg = re.sub("[^a-zA-Z]", " ", dataset[1][i])
    msg = msg.lower()
    msg = msg.split()
    ps = PorterStemmer()
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    msg = ' '.join(msg)
    corpus.append(msg)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

In [8]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
y = dataset.iloc[:, 0].values

In [10]:
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_y = LabelEncoder()
y = label_y.fit_transform(y)

In [12]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier =  RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)

In [25]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': [10, 100, 200, 300]}]
cv = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy')
cv = cv.fit(X_train, y_train)

In [42]:
def percent(value):
    return "{0:.0%}".format(value)

print("Best Accuracy: {}".format(percent(cv.best_score_))) 

Best Accuracy: 98%


In [31]:
cv.best_params_

{'n_estimators': 200}

In [33]:
classifier =  RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [34]:
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(classifier, X_train, y_train)

In [35]:
print("Accuracy: {}".format(percent(accuracy.mean())))
print("Loss: {}".format(percent(accuracy.std())))

Accuracy: 98%
Loss: 0%


In [36]:
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [37]:
cm

array([[966,   0],
       [ 24, 125]])

In [43]:
TP, FP, FN, TN = cm[0][0], cm[0][1], cm[1][0], cm[1][1]

ACC = (TP + TN)/(TP+FP+FN+TN)
PPV = TP/(TP+FP)
TPR = TP/(TP+FN)
F1 = (2*PPV*TPR)/(PPV+TPR)

In [44]:
import csv 

name = "result.csv"
with open(name, 'w') as file:
    writer = csv.writer(file)
    titles = ['True Positives', 'False Positives', 'False Negatives', 
              'True Negatives', 'Accuracy', 'Precision', 'Sensitivity',
             'F1']
    writer.writerow(titles)
    values = [TP, FP, FN, TN, percent(ACC), percent(PPV), percent(TPR), percent(F1)]   
    writer.writerow(values)

In [45]:
total_summary = pd.read_csv('result.csv')
total_summary

Unnamed: 0,True Positives,False Positives,False Negatives,True Negatives,Accuracy,Precision,Sensitivity,F1
0,966,0,24,125,98%,100%,98%,99%
