In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC



In [19]:
# Data pre-processing
# Read the mail dataset from its CSV file into a pandas DataFrame
raw_mail_data = pd.read_csv(filepath_or_buffer='spammaildata .csv')

In [20]:
# Keep every non-null entry as-is and swap each missing (null) entry for an empty string
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [21]:
# Inspect the (rows, columns) dimensions of the null-free DataFrame
mail_data.shape

(5570, 2)

In [22]:
# Preview the first rows to confirm the Category and Message columns loaded as expected
mail_data.head()#sample data

Unnamed: 0,Category,Message
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,spam,FreeMsg Hey there darling it's been 3 week's n...
4,ham,Even my brother is not like to speak with me. ...


In [23]:
# Encode the text labels as numbers: spam -> 0, ham (non-spam) -> 1.
# Rows are rewritten in place, one label at a time, in the same order as listed.
for label_text, label_code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label_text, 'Category'] = label_code

In [24]:
# Separate the frame into inputs (message text) and targets (encoded category)
X, Y = mail_data['Message'], mail_data['Category']

In [25]:
# Show the first few messages followed by their encoded labels
print(X.head(), Y.head(), sep='\n')

0                        Ok lar... Joking wif u oni...
1    Free entry in 2 a wkly comp to win FA Cup fina...
2    Nah I don't think he goes to usf, he lives aro...
3    FreeMsg Hey there darling it's been 3 week's n...
4    Even my brother is not like to speak with me. ...
Name: Message, dtype: object
0    1
1    0
2    1
3    0
4    1
Name: Category, dtype: object


In [26]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

In [27]:
# Size of the training split (number of messages)
X_train.shape

(4456,)

In [28]:
# Size of the held-out test split (number of messages)
X_test.shape

(1114,)

In [29]:
# Feature extraction
# Transform the raw message text into TF-IDF feature vectors usable as input to the SVM model.
# English stop words are dropped and text is lower-cased before tokenizing.
# NOTE: lowercase must be the boolean True. The string 'True' only worked because any
# non-empty string is truthy, and scikit-learn releases that validate parameter types
# reject it with an InvalidParameterError when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Learn the vocabulary and IDF weights from the training messages only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... then apply that same fitted vocabulary to the test messages (no refitting on test data)
X_test_features = feature_extraction.transform(X_test)

In [30]:
# The label columns are still object-typed after encoding; cast both splits to integers
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)

In [31]:
# Train the classifier: a linear Support Vector Machine fitted on the TF-IDF training features
model = LinearSVC()
model.fit(X=X_train_features, y=Y_train)


LinearSVC()

In [32]:
# Model evaluation, part 1: accuracy on the data the model was trained on
from sklearn.metrics import accuracy_score

prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [33]:
# Report the accuracy measured on the training split
# (label spelling corrected to match the "Accuracy on test data" message)
print("Accuracy on training data : ", accuracy_on_training_data)

Acuuracy on training data :  0.9997755834829444


In [35]:
# Model evaluation, part 2: accuracy on the held-out test split
prediction_on_testing_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_testing_data,
)

In [36]:
# Report the accuracy measured on the held-out test split
print("Accuracy on test data : ",accuracy_on_test_data)

Accuracy on test data :  0.9784560143626571


In [37]:
# Classify a new, unseen mail
# The message is wrapped in a list because the vectorizer expects a collection of documents
input_mail = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
]
# Vectorize with the already-fitted TF-IDF vocabulary (transform only, no refit)
input_mail_features = feature_extraction.transform(raw_documents=input_mail)

In [38]:
# Run the trained model on the vectorized mail and show the raw predicted label array
prediction = model.predict(X=input_mail_features)
print(prediction)

[0]


In [39]:
# Translate the numeric label back to text: 0 means spam, anything else is reported as ham
print("It's SPAM MAIL" if prediction[0] == 0 else "It's HAM MAIL")

It's SPAM MAIL
