     Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import string
import matplotlib.pyplot as plt

In [2]:
df = pd.read_table(r'C:\Users\paras\Desktop\desktop\Project\SMSSpamCollection',header=None)

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df[1][1]

'Ok lar... Joking wif u oni...'

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
df.describe()

Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
df['Length'] = df[1].apply(len)

In [8]:
df['Length']

0       111
1        29
2       155
3        49
4        61
       ... 
5567    160
5568     36
5569     57
5570    125
5571     26
Name: Length, Length: 5572, dtype: int64

In [9]:
df.head()

Unnamed: 0,0,1,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [10]:
df.groupby(0).count()

Unnamed: 0_level_0,1,Length
0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4825,4825
spam,747,747


In [11]:
df['Length'].describe()

count    5572.000000
mean       80.489950
std        59.942907
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64

      Data Preprocessing

In [12]:
y = df[0].values    

In [13]:
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

Convert value of ham = 0 & spam = 1

In [14]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [15]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [16]:
df.head()

Unnamed: 0,0,1,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


Process On Message

In [17]:
def cleanMessage(message):
    nonPunc = [char for char in message if char not in string.punctuation]
    nonPunc = "".join(nonPunc)
    return nonPunc

In [18]:
df['Message'] = df[1].apply(cleanMessage)

In [19]:
df['Message']

0       Go until jurong point crazy Available only in ...
1                                 Ok lar Joking wif u oni
2       Free entry in 2 a wkly comp to win FA Cup fina...
3             U dun say so early hor U c already then say
4       Nah I dont think he goes to usf he lives aroun...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                  Will ü b going to esplanade fr home
5569    Pity  was in mood for that Soany other suggest...
5570    The guy did some bitching but I acted like id ...
5571                            Rofl Its true to its name
Name: Message, Length: 5572, dtype: object

In [20]:
df.head()

Unnamed: 0,0,1,Length,Message
0,ham,"Go until jurong point, crazy.. Available only ...",111,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,29,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,49,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...


In [21]:
CV = CountVectorizer(stop_words="english")

In [22]:
CV

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [23]:
x = df['Message'].values

In [24]:
x

array(['Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat',
       'Ok lar Joking wif u oni',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       ..., 'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

Split into train and test data

In [25]:
X_train , X_test , y_train , y_test = train_test_split(x,y)

In [26]:
X_train

array([' Was thinking about chuckin ur red green n black trainners 2 save carryin them bac on train',
       'Yup i thk they r e teacher said that will make my face look longer Darren ask me not 2 cut too short',
       'Send his number and give reply tomorrow morning for why you said that to him like that ok',
       ..., 'We are both fine Thanks',
       'Ard 530 lor I ok then message ü lor',
       'R we going with the  ltgt  bus'], dtype=object)

Data Munging
     
     Data Munging (or Data Wrangling), it means preparing your data for a dedicated purpose - taking the data from its raw state and transforming and mapping into another format, normally for use beyond its original inten
  

In [27]:
X_train_CV = CV.fit_transform(X_train)

     Naive Bayse :- Multinomial

In [28]:
NB = MultinomialNB()

In [29]:
NB.fit(X_train_CV,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
X_test_CV = CV.transform(X_test)


In [31]:
y_predict = NB.predict(X_test_CV)

In [32]:
result = accuracy_score(y_test,y_predict)

In [33]:
print("Accuracy of Prediction: ",result*100)

Accuracy of Prediction:  98.27709978463747


RealTime Application of Spam Filtering:-

In [34]:
email = input("Enter Email :- ")
body = input("Enter Body of Content :- ")
bodyInput = CV.transform([body])
result = NB.predict(bodyInput)
if(result[0]==1):
    print("Email is spam")
else:
    print("Email sent")

Enter Email :- parasbhalala77@gmail.com
Enter Body of Content :- you are won
Email is spam
