Importing the Dependencies

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

Data collection and preprocessing

In [2]:
# Location of the mail dataset CSV. Set the MAIL_DATA_CSV environment variable to point at
# your own copy; the original author's local path is kept as the fallback so existing
# setups keep working unchanged.
MAIL_DATA_PATH = os.environ.get('MAIL_DATA_CSV', 'C://Users//akash//Downloads//Downloads//mail_data.csv')
mail_data = pd.read_csv(MAIL_DATA_PATH)

In [3]:
# Preview the first rows to confirm the CSV loaded with the Category and Message columns.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Replace the null values with an empty string

In [5]:
# Keep every non-missing cell as-is and put an empty string wherever a value is missing,
# so later text processing never encounters NaN.
mail_data1 = mail_data.where(mail_data.notna(), other='')
mail_data1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# (rows, columns) of the cleaned frame.
mail_data1.shape

(5572, 2)

Label Encoding

In [7]:
# Label ham mail as 1 and spam mail as 0

In [8]:
# Overwrite the text labels in place with numeric codes: spam -> 0, ham -> 1.
for label, encoded in (('spam', 0), ('ham', 1)):
    mail_data1.loc[mail_data1['Category'] == label, 'Category'] = encoded

In [9]:
# Check that the Category column now holds the numeric codes instead of 'ham'/'spam'.
mail_data1.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Separating the data into text (X) and labels (y)

In [11]:
# X holds the raw message text, y the encoded label for each message.
X, y = mail_data1['Message'], mail_data1['Category']

In [12]:
# Show the message texts that will be used as model input.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [13]:
# Show the encoded labels (1 = ham, 0 = spam).
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [14]:
# Hold out 30% of the messages for testing. Stratifying on y keeps the ham/spam ratio the
# same in both splits, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
    stratify=y,
)

In [15]:
# Compare the sizes of the full, training and test sets.
print(X.shape , X_train.shape , X_test.shape)

(5572,) (3900,) (1672,)


Feature Extraction

In [16]:
#transform the text data to feature vectors

In [17]:
# TF-IDF vectoriser: keep every term (min_df=1), drop English stop words and lowercase the
# text before tokenising. `lowercase` must be the boolean True -- the string 'True' only
# worked by being truthy and is rejected by scikit-learn's parameter validation in recent
# releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [18]:
# Fit the vectoriser on the training messages only, then reuse the fitted vectoriser on the
# test messages so both matrices share the same feature columns and nothing is learned
# from the test set.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [19]:
#convert y_test and y_train to integers

In [20]:
# The label column has object dtype after the in-place encoding; cast both splits to int.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [21]:
# Inspect the TF-IDF matrix for the training messages: (row, feature index) -> weight.
print(X_train_features)

  (0, 2519)	0.3641968313719766
  (0, 4129)	0.27179293265131954
  (0, 6632)	0.21401040065599355
  (0, 2209)	0.3839460075352549
  (0, 6029)	0.3229271034152822
  (0, 5735)	0.34444765520869836
  (0, 6414)	0.339315811943346
  (0, 6687)	0.23742642359011534
  (0, 6213)	0.3979582710179898
  (0, 2847)	0.21787810322301548
  (1, 3117)	0.43638873186978966
  (1, 5408)	0.6690772949333789
  (1, 6133)	0.6015816221442515
  (2, 4788)	0.3315528562552779
  (2, 6265)	0.23549196165793967
  (2, 6828)	0.30154914603644467
  (2, 2926)	0.17691957442728626
  (2, 3830)	0.17691957442728626
  (2, 4673)	0.29365713794622456
  (2, 3325)	0.31120818330674616
  (2, 2694)	0.24263763739852487
  (2, 2594)	0.35366455838389094
  (2, 3545)	0.32366084816505775
  (2, 2949)	0.29019239297787963
  (2, 4528)	0.35366455838389094
  :	:
  (3894, 3486)	0.17639883874980888
  (3894, 2677)	0.19595745329569172
  (3894, 3704)	0.19799799011751754
  (3894, 3560)	0.19482995679684487
  (3894, 6912)	0.23370254678346536
  (3895, 3000)	0.41121301945

In [22]:
# Inspect the TF-IDF matrix for the test messages: (row, feature index) -> weight.
print(X_test_features)

  (0, 6824)	0.3544732268849484
  (0, 6797)	0.27847798981605437
  (0, 6632)	0.2349166623215086
  (0, 5337)	0.36271498534288454
  (0, 5291)	0.2843163953888462
  (0, 4612)	0.4585124789163523
  (0, 4312)	0.2633278487159964
  (0, 3749)	0.21392281546534248
  (0, 3704)	0.22489733148439828
  (0, 899)	0.3915327561998774
  (1, 6314)	0.46523655015574283
  (1, 3704)	0.22819544385425986
  (1, 1567)	0.8552670879925992
  (2, 5406)	0.4209359421083416
  (2, 4375)	0.4922719376131311
  (2, 2902)	0.47342397443160406
  (2, 717)	0.5969514322902327
  (3, 6797)	0.18355544232288432
  (3, 6473)	0.1375468777525989
  (3, 5851)	0.16543201615972547
  (3, 4667)	0.3022230264361137
  (3, 2930)	0.20451364121435192
  (3, 2799)	0.3022230264361137
  (3, 2677)	0.14671066901671276
  (3, 1647)	0.18064128608566513
  :	:
  (1669, 6887)	0.283412076857172
  (1669, 5448)	0.40114178801358985
  (1669, 4294)	0.3255103991950951
  (1669, 3805)	0.42104895018330063
  (1669, 2863)	0.42104895018330063
  (1669, 1592)	0.3671102695057048
  (

Logistic Regression and SVM

In [23]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [24]:
# Support vector classifier with scikit-learn's default settings, trained on the same
# features so its test accuracy can be compared with the logistic regression model.
modelsvm = SVC()

In [25]:
# Train the logistic regression model on the TF-IDF features of the training messages.
model.fit(X_train_features,y_train)

LogisticRegression()

In [26]:
# Train the SVM on the same training features and labels.
modelsvm.fit(X_train_features, y_train)

SVC()

Evaluating the trained model

In [27]:
# Prediction on training data

In [28]:
# Score the logistic regression model on the data it was trained on; comparing this with
# the test accuracy shows how much the model overfits.
train_data_prediction = model.predict(X_train_features)
train_data_accuracy = accuracy_score(y_true=y_train, y_pred=train_data_prediction)

In [29]:
# Logistic regression accuracy on the training set.
print(train_data_accuracy)

0.9653846153846154


In [30]:
#prediction on test data

In [31]:
# Logistic regression predictions for the held-out test messages.
test_data_prediction = model.predict(X_test_features)

In [32]:
# Score the SVM on the held-out test set.
test_svm = modelsvm.predict(X_test_features)
test_data_accuracy_svm = accuracy_score(y_true=y_test, y_pred=test_svm)
print(test_data_accuracy_svm)

0.9820574162679426


In [33]:
# Score the logistic regression predictions against the true test labels.
test_data_accuracy = accuracy_score(y_true=y_test, y_pred=test_data_prediction)

In [34]:
# Logistic regression accuracy on the test set.
print(test_data_accuracy)

0.9694976076555024


# Predictor System

In [35]:
# Classify one hand-written message with the trained logistic regression model.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."]

# Reuse the already-fitted vectoriser so the features line up with the training columns.
input_data_features = feature_extraction.transform(input_mail)

# The model returns one label per input row; 1 means ham, anything else is treated as spam.
prediction = model.predict(input_data_features)
print("This is a Ham mail." if prediction[0] == 1 else "This is a Spam mail.")

This is a Ham mail.


In [36]:
# Classify several messages in one go with the trained logistic regression model.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times." , " Ok lar... Joking wif u oni..."]

# Vectorise the whole batch once with the already-fitted vectoriser.
input_data_features = feature_extraction.transform(input_mail)

# predict() returns one label per mail. Report each mail's own label -- the previous loop
# re-ran the whole batch per iteration and always printed the first mail's result.
predictions = model.predict(input_data_features)
for prediction in predictions:
    if prediction == 1:
        print("This is a Ham mail.")
    else:
        print("This is a Spam mail.")

This is a Ham mail.
This is a Ham mail.


In [None]:
# Interactive predictor: read a number of mails from the user and classify each one.
lst = []
n = int(input("Enter number of mails : "))

# Collect n mails, one per prompt.
for i in range(0, n):
    mails = input()
    lst.append(mails)

# Nothing to classify when no mails were entered, so skip vectorising and predicting.
if lst:
    # Vectorise the whole batch once with the already-fitted vectoriser.
    input_data_features = feature_extraction.transform(lst)

    # predict() returns one label per mail. Report each mail's own label -- the previous
    # loop re-ran the whole batch per iteration and always printed the first mail's result.
    predictions = model.predict(input_data_features)
    for prediction in predictions:
        if prediction == 1:
            print("This is a Ham mail.")
        else:
            print("This is a Spam mail.")