In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [71]:
spamCollection = pd.read_csv("SMSSpamCollection", sep='\t' , names =["response", "message"])

In [72]:
spamCollection.head(7)

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...


In [73]:
X = spamCollection[["message"]]
y = spamCollection["response"]

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
X_train, X_test,y_train, y_test = train_test_split(X,y, test_size = .25, random_state = 1)

In [76]:
X_train.head(1)

Unnamed: 0,message
710,4mths half price Orange line rental & latest c...


In [77]:
y_train[:2]

710     spam
3740     ham
Name: response, dtype: object

# Clean up 1 message

In [78]:
sentence = X_train.iloc[0,0]
sentence

'4mths half price Orange line rental & latest camera phones 4 FREE. Had your phone 11mths+? Call MobilesDirect free on 08000938767 to update now! or2stoptxt T&Cs'

In [81]:
# here we import two liberaries.
#  1. string for work on string.
#     1.1 we import string libarary for remove punctuation(special) words
#           like ===== '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' =============
#  2. we import nltk package for remove stop words
#      
import string
from nltk.corpus import stopwords

In [84]:
# 1. here we filter not punctuation but also number like: "0123456789"
# 2. using join funciton for making sentance.
removepunctuation = [character for character in sentence if character not in string.punctuation +"0123456789"]
sentance2 =  ''.join(removepunctuation)
sentance2

'mths half price Orange line rental  latest camera phones  FREE Had your phone mths Call MobilesDirect free on  to update now orstoptxt TCs'

In [85]:
from nltk.corpus import stopwords

In [86]:
# here we can see stopwords in a list or array.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [87]:
# here again split an sentance.
sentance2 = sentance2.split()

In [88]:
# here we can see the result(split) an sentance.
print(sentance2)

['mths', 'half', 'price', 'Orange', 'line', 'rental', 'latest', 'camera', 'phones', 'FREE', 'Had', 'your', 'phone', 'mths', 'Call', 'MobilesDirect', 'free', 'on', 'to', 'update', 'now', 'orstoptxt', 'TCs']


In [89]:
[word.lower() for word in sentance2 if word.lower() not in stopwords.words("english")]

['mths',
 'half',
 'price',
 'orange',
 'line',
 'rental',
 'latest',
 'camera',
 'phones',
 'free',
 'phone',
 'mths',
 'call',
 'mobilesdirect',
 'free',
 'update',
 'orstoptxt',
 'tcs']

# To clean up all the messages

In [90]:
a = string.punctuation + "0123456789"

In [91]:
# here we made a custom function for convert string into stopwordless array or list in rows.
def cleanup_text(sentence):
    p = [char for char in sentence if char not in a]
    p = "".join(p)
    q = [word.lower() for word in p.split() if word.lower() not in stopwords.words("english")]
    return(q)

In [92]:
# here by using apply function for applied entire row in message column.
X_train.message.apply(cleanup_text)

710     [mths, half, price, orange, line, rental, late...
3740                                    [stitch, trouser]
2711    [hope, enjoyed, new, content, text, stop, unsu...
3155    [heard, u, call, rude, chat, private, line, cu...
3748         [ü, neva, tell, noe, im, home, da, aft, wat]
                              ...                        
905     [getting, worried, derek, taylor, already, ass...
5192    [oh, oh, den, muz, change, plan, liao, go, bac...
3980    [ceri, u, rebel, sweet, dreamz, little, buddy,...
235     [text, meet, someone, sexy, today, u, find, da...
5157                                    [k, k, sms, chat]
Name: message, Length: 4179, dtype: object

In [93]:
#Prepare our feature compatible for ML algorithm
#Sklearn helps you to achieve this target
# text Features ====> BagOfWords ===> TF-IDF ===> Freq ==> Feature Array
from sklearn.feature_extraction.text import CountVectorizer

In [94]:
bow = CountVectorizer(analyzer = cleanup_text)

In [95]:
# BagOfWords : create dictionary which contain text and it's counts.
# like: (hello,272234)
bow.fit(X_train["message"])

CountVectorizer(analyzer=<function cleanup_text at 0x00000212896A0168>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [96]:
# convert count info frequency between (0-1) 
msg_bows=bow.transform(X_train.message)


In [97]:
msg_bows

<4179x7260 sparse matrix of type '<class 'numpy.int64'>'
	with 35543 stored elements in Compressed Sparse Row format>

In [98]:
msg_bows.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [99]:
msg_bows.shape

(4179, 7260)

In [100]:
#Create Model
from sklearn.naive_bayes import MultinomialNB#Best for Text Data Features
X_train = msg_bows
nb = MultinomialNB()

In [102]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [104]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [105]:
nb.score(X_train, y_train)

0.9909069155300311

In [106]:
X_test[:2]

Unnamed: 0,message
1078,"Yep, by the pretty sculpture"
4028,"Yes, princess. Are you going to make me moan?"


In [107]:
#Create Features
test_bows = bow.transform(X_test.message)

In [108]:
# create two dimessional array
test_bows.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [109]:
ypred = nb.predict(test_bows)

In [110]:
ypred

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype='<U4')

In [111]:
nb.score(test_bows, ypred)

1.0

In [129]:
test_bows = bow.transform([["every morning is new opprotunity"],["sorry,i'll call later"],['life is greate']])
test_bows.toarray()
nb.predict(test_bows)

array(['ham', 'ham', 'ham'], dtype='<U4')

In [132]:
testsms =  np.array(["Win Lottery Guaranteed!!!","sorry,i'll call later",'life is greate',"hard working always win","every morning is new opprotunity"])
testsms.ndim

1

In [138]:
testsms2 = np.array([testsms])
testsms2.ndim

2

In [136]:
test_bows = bow.transform(testsms2)
test_bows.toarray()
nb.predict(test_bows)

array(['ham'], dtype='<U4')

In [139]:
testsms =  np.array([["Win Lottery Guaranteed!!!"],["sorry,i'll call later"],['life is greate'],["hard working always win"],["every morning is new opprotunity"]])
testsms.ndim

2

In [140]:
test_bows = bow.transform(testsms)
test_bows.toarray()
nb.predict(test_bows)

array(['spam', 'ham', 'ham', 'ham', 'ham'], dtype='<U4')