In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv("sms1.tsv", delimiter='\t', header=None)

In [4]:
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
dataset[0].value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [5]:
import re
import nltk
from nltk.stem.porter import PorterStemmer

In [6]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/blooser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

def prepare_text():
    corpus = []
    columns = dataset[0].count()
    for i in range(0, columns):
        msg = re.sub("[^a-zA-Z]", " ", dataset[1][i])
        msg = msg.lower().split()
        ps = PorterStemmer()
        msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
        msg = ' '.join(msg)
        corpus.append(msg)
    return corpus

In [8]:
prepared_text = prepare_text()

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(prepared_text).toarray()
y = dataset.iloc[:, 0].values

In [13]:
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [35]:
from sklearn.preprocessing import LabelBinarizer

label_y = LabelBinarizer()
y = label_y.fit_transform(y).ravel()

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
from sklearn.ensemble import RandomForestClassifier

classifier =  RandomForestClassifier(random_state=42)

In [54]:
from sklearn.model_selection import GridSearchCV

parameters = [{'n_estimators': [10, 100, 200, 300]}]
grid_search = GridSearchCV(classifier, parameters, cv=4, verbose=3, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] ........ n_estimators=10, score=0.9748878923766816, total=   1.5s
[CV] n_estimators=100 ................................................
[CV] ......... n_estimators=10, score=0.968609865470852, total=   1.6s
[CV] n_estimators=100 ................................................
[CV] ........ n_estimators=10, score=0.9694793536804309, total=   1.6s
[CV] n_estimators=100 ................................................
[CV] ........ n_estimators=10, score=0.9712488769092543, total=   1.6s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9838565022421525, total=  15.3s
[CV] n_estimators

[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  1.6min finished


In [61]:
print("Best parameters: {0}".format(grid_search.best_params_))
print("Best accuracy: {0:.0f}%".format(100*grid_search.best_score_))

Best parameters: {'n_estimators': 200}
Best accuracy: 98%


In [63]:
classifier =  RandomForestClassifier(n_estimators=200, random_state=42)

In [65]:
from sklearn.model_selection import cross_val_predict

y_train_predict = cross_val_predict(classifier, X_train, y_train, cv=5)

In [66]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, y_train_predict)

array([[3848,   11],
       [  87,  511]])

In [67]:
from sklearn.metrics import f1_score

f1_score(y_train, y_train_predict)

0.9125