 Importing The Dependencies

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Processing

In [27]:
# read the raw dataset from mail_data.csv (the path is relative to the working directory)
raw_mail_data=pd.read_csv('mail_data.csv')
# preview the first five rows; the columns shown are Category (label) and Message (text)
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# (rows, columns) of the raw frame
raw_mail_data.shape

(5572, 2)

In [29]:
# keep every non-null cell as is and put an empty string wherever a value is missing;
# the result is a new frame, raw_mail_data itself is left untouched
mail_data = raw_mail_data.where(raw_mail_data.notna(), '')

In [30]:
# preview the first five rows of the frame after null replacement
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Label Encoding

In [31]:
# encode the target column numerically: spam -> 0, ham -> 1
# both masks are built before any write; the rows they select never overlap
spam_rows = mail_data['Category'] == 'spam'
ham_rows = mail_data['Category'] == 'ham'
mail_data.loc[spam_rows, 'Category'] = 0
mail_data.loc[ham_rows, 'Category'] = 1

In [32]:
# x holds the message text (model input), y holds the encoded category (target)
x, y = mail_data['Message'], mail_data['Category']

In [33]:
# show the messages followed by their labels, one Series after the other
print(x, y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [34]:
# split the data into train and test sets: 20% is held out for testing,
# and the fixed random_state makes the shuffle reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)
# sanity check: the two parts should add up to the full dataset
print(f"{x.shape} {x_train.shape} {x_test.shape}")

(5572,) (4457,) (1115,)


In [35]:
# inspect the held-out labels; the index values show which original rows were sampled
print(y_test)

5086    1
2120    1
2318    1
2917    1
1352    1
       ..
884     1
3821    1
1066    1
208     1
1378    0
Name: Category, Length: 1115, dtype: object


Feature Extraction

In [36]:
# Transform the raw message text into TF-IDF feature vectors that logistic regression can use.
# `lowercase` must be a real boolean: the previous string 'True' only behaved like True
# because a non-empty string is truthy, and scikit-learn releases that validate
# constructor parameters reject it when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then reuse them
# for the test messages so nothing from the test set influences the features.
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

# The labels live in an object-dtype column, so cast them to integers
# before handing them to the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [37]:
# the raw training messages, before vectorization
print(x_train)

3890                    Unlimited texts. Limited minutes.
5553                          Hahaha..use your brain dear
4366    Ujhhhhhhh computer shipped out with address to...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
3771    Love it! The girls at the office may wonder wh...
                              ...                        
3335    That's fine, have him give me a call if he kno...
1099    NO GIFTS!! You trying to get me to throw mysel...
2514    U have won a nokia 6230 plus a free digital ca...
3606                      Jordan got voted out last nite!
2575    Your next amazing xxx PICSFREE1 video will be ...
Name: Message, Length: 4457, dtype: object


In [38]:
# the vectorized training data; printed as (row, column) index pairs with their TF-IDF weights
print(x_train_feature)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

Training the Model

Logistic Regression

In [40]:
# create the classifier; no arguments are passed, so scikit-learn's defaults apply
model=LogisticRegression()

In [41]:
# train the logistic regression model on the TF-IDF features of the training set
model.fit(x_train_feature, y_train)

LogisticRegression()

Model Evaluation

Accuracy Score

In [42]:
# predict labels for the training data (used to compute the training accuracy)
# NOTE(review): `x_tarin_prediction` is a misspelling of "train"; the name is kept
# because the accuracy cell that follows reads it.
x_tarin_prediction = model.predict(x_train_feature)
print(x_tarin_prediction)

[1 1 1 ... 0 1 1]


In [43]:
# fraction of training messages whose predicted label matches the true label
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=x_tarin_prediction,
)
print('Accuracy of train data is', training_data_accuracy)

Accuracy of train data is 0.9683643706529056


In [44]:
# predict labels for the held-out test data (used to compute the test accuracy)
x_test_prediction = model.predict(x_test_feature)
print(x_test_prediction)

[1 1 1 ... 1 1 0]


In [45]:
# fraction of held-out messages whose predicted label matches the true label
testing_data_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=x_test_prediction,
)
print('Accuracy of test data is', testing_data_accuracy)

Accuracy of test data is 0.9524663677130045
