In [1]:
import os

import numpy as np
import pandas as pd

### Data Collection & Pre-Processing

In [3]:
#Loading Data
# The CSV location can be overridden with the MAIL_DATA_CSV environment variable
# so the notebook is not tied to one machine. When the variable is unset, the
# original absolute path is used, so existing runs behave exactly as before.
DATA_CSV_PATH = os.environ.get(
    "MAIL_DATA_CSV",
    r"C:\Users\Abhishek Rathore\Desktop\ML Project\mail_data.csv",
)
raw_data = pd.read_csv(DATA_CSV_PATH)

In [4]:
# Display the loaded DataFrame (the cell's last expression is rendered as output)
raw_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Summarize the frame: index range, column names, non-null counts and dtypes
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# (rows, columns) of the loaded frame
raw_data.shape

(5572, 2)

Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the 'Category' labels and overwrite that column with the
# resulting integer codes (the frame displayed afterwards shows ham as 0 and
# spam as 1).
lbEncoder = LabelEncoder()
encoded_category = lbEncoder.fit_transform(raw_data['Category'])
raw_data['Category'] = encoded_category

In [8]:
# Re-display the frame to inspect the now integer-encoded 'Category' column
raw_data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


Making Input and Output Data Columns

In [9]:
# Model input is the message text; the target is the encoded category label
x, y = raw_data['Message'], raw_data['Category']

In [10]:
# Shape of the input (message) Series
x.shape

(5572,)

In [11]:
# Shape of the target (category) Series; should match x.shape
y.shape

(5572,)

In [12]:
# Display the input messages
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [13]:
# Display the encoded target labels
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int32

Splitting the Data into Training and Test Sets

In [14]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out 10% of the messages for testing; the fixed random_state keeps the
# split repeatable across runs.
x_trn, x_test, y_trn, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=22,
)

In [44]:
# Sizes of the full, training and test message sets, in that order
for message_subset in (x, x_trn, x_test):
    print(message_subset.shape)

(5572,)
(5014,)
(558,)


In [45]:
# Sizes of the full, training and test label sets, in that order
for label_subset in (y, y_trn, y_test):
    print(label_subset.shape)

(5572,)
(5014,)
(558,)


Feature Extraction

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
# TF-IDF vectorizer used to turn message text into numerical feature vectors:
# text is lowercased, English stop words are dropped, and a term only needs to
# appear in one document to be kept (min_df=1).
Ft_Extract = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [48]:
# Fit the vectorizer on the training messages only, then apply the same fitted
# vectorizer to the test messages (transform, no refit).
x_trn_features = Ft_Extract.fit_transform(x_trn)
x_test_features = Ft_Extract.transform(x_test)

# Cast both label Series to integer dtype
y_trn, y_test = y_trn.astype('int'), y_test.astype('int')

In [49]:
# Print the training feature matrix; the output lists (row, column) index pairs
# followed by the stored TF-IDF weight.
print(x_trn_features)

  (0, 4058)	0.5229100666631653
  (0, 3893)	0.48184833181355796
  (0, 4527)	0.48184833181355796
  (0, 3200)	0.3865059360414387
  (0, 5346)	0.3358907468865998
  (1, 6471)	0.5040400770069187
  (1, 5051)	0.3265907654777156
  (1, 6267)	0.27500913284740114
  (1, 5466)	0.31190275851693006
  (1, 1588)	0.422424309880243
  (1, 3241)	0.5365877572008048
  (2, 4124)	0.25956585763799667
  (2, 2392)	0.3511937888690559
  (2, 6516)	0.2524807843048989
  (2, 4426)	0.27885803045099256
  (2, 5628)	0.3391969664127109
  (2, 5548)	0.3158600542637824
  (2, 3389)	0.29338300079718127
  (2, 6430)	0.3158600542637824
  (2, 4137)	0.17151966904248508
  (2, 7028)	0.3391969664127109
  (2, 2525)	0.1912319576588942
  (2, 2134)	0.29338300079718127
  (3, 1323)	0.24500376413747926
  (3, 7610)	0.24500376413747926
  :	:
  (5011, 1684)	0.1822717257564065
  (5011, 4007)	0.20761221740403993
  (5011, 5366)	0.16462248546865843
  (5011, 2982)	0.16075045370813146
  (5011, 7091)	0.1786475417140692
  (5011, 3079)	0.17254895243237606
 

In [50]:
# Print the test feature matrix; the output lists (row, column) index pairs
# followed by the stored TF-IDF weight.
print(x_test_features)

  (0, 7491)	0.3504747072325481
  (0, 5834)	0.357997480440075
  (0, 4802)	0.5371871721732068
  (0, 4279)	0.5371871721732068
  (0, 2583)	0.41456624041752943
  (1, 7679)	0.5412334556363766
  (1, 7653)	0.4987598520040242
  (1, 5474)	0.5784296622710628
  (1, 5112)	0.3517443422911225
  (2, 7219)	0.3065292319086262
  (2, 5920)	0.5479276245636612
  (2, 5169)	0.5673068970269033
  (2, 3922)	0.5328958930328117
  (3, 7936)	0.3661756466260414
  (3, 7690)	0.5009680896265076
  (3, 7228)	0.36483907426308876
  (3, 4588)	0.5127864396783454
  (3, 1141)	0.46785562534485664
  (4, 6277)	0.2640084069493021
  (4, 5466)	0.23818932411579388
  (4, 5436)	0.27097134668744766
  (4, 5051)	0.24940604585058318
  (4, 4268)	0.25650780900086145
  (4, 4137)	0.20012942124971023
  (4, 3510)	0.3085928900756309
  :	:
  (551, 898)	0.3860161082247805
  (552, 6056)	0.8666859326179502
  (552, 3578)	0.4988541813016644
  (553, 7086)	0.4230267092767977
  (553, 3554)	0.3754177753394655
  (553, 3303)	0.39598178293510083
  (553, 1686)	

Training the Model

### Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
# Logistic regression classifier created with no arguments (library defaults)
model = LogisticRegression()

In [53]:
# Train the classifier on the TF-IDF features and integer labels of the training split
model.fit(x_trn_features, y_trn)

Evaluating the Model

In [54]:
#prediction on test data (the held-out split, not the training split)
prediction = model.predict(x_test_features)

In [55]:
from sklearn.metrics import accuracy_score

In [56]:
# Accuracy of the test-set predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print('Accuracy : ', accuracy)

Accuracy :  0.967741935483871


In [57]:
# Single example message to classify, wrapped in a one-element list
input_mail = ["Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. £1.50 SP:Tyrone"]

# Vectorize with the already-fitted TF-IDF extractor (transform only, no refit)
input_features = Ft_Extract.transform(input_mail)

# Predict the label for the message and show the raw prediction array
prediction = model.predict(input_features)
print('Prediction: ', prediction)

# Label 0 is reported as ham; any other label is reported as spam
print("Ham Mail!" if prediction[0] == 0 else "Spam Mail!")

Prediction:  [1]
Spam Mail!


Saving the Model

In [58]:
import pickle

In [59]:
# Persist the trained classifier and the fitted TF-IDF vectorizer to disk.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage collection.
filename = 'Spam_Filtration.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
with open('FtExtractor.sav', 'wb') as vectorizer_file:
    pickle.dump(Ft_Extract, vectorizer_file)