## Importing the depandencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Data collection & Pre-processing

In [2]:
#loading the dataset
rmail_df = pd.read_csv('mail_data.csv')

In [3]:
rmail_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
rmail_df.size


11144

In [5]:
# replace the null values with a new value 
mail_df = rmail_df.where((pd.notnull(rmail_df)),'')

In [6]:
mail_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# find number of row and coloum in the dataframe
mail_df.shape

(5572, 2)

## Label Encoding

In [8]:
# labeling spam mail as 0 and ham mail as 1
mail_df.loc[mail_df['Category'] == 'spam', 'Category'] = 0
mail_df.loc[mail_df['Category'] == 'ham', 'Category'] = 1


In [9]:
# separating the data as text and label
x = mail_df['Message']
y =  mail_df['Category']

In [10]:
print(x)
print(y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


## Splitting the data into training & test data

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state=3)
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


In [12]:
print(x_train)
print(y_train)

3501    Dorothy@kiefer.com (Bank of Granite issues Str...
617     He like not v shock leh. Cos telling shuhui is...
475     Nice line said by a broken heart- Plz don't cu...
5535    I know you are thinkin malaria. But relax, chi...
4747           Orh i tot u say she now still dun believe.
                              ...                        
4402         Many times we lose our best ones bcoz we are
3615                                         Ok c ü then.
4763                      Me too! Have a lovely night xxx
4339                          Yes when is the appt again?
1827    Dude. What's up. How Teresa. Hope you have bee...
Name: Message, Length: 4457, dtype: object
3501    0
617     1
475     1
5535    1
4747    1
       ..
4402    1
3615    1
4763    1
4339    1
1827    1
Name: Category, Length: 4457, dtype: object


## Feature Extraction

In [13]:
# transform the text to feature vector that can be used as input to the Logistic regression
feature_extraction = TfidfVectorizer(min_df = 1,stop_words= 'english')
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# convert y_train and y_test values as integer
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [14]:
print(x_train)

3501    Dorothy@kiefer.com (Bank of Granite issues Str...
617     He like not v shock leh. Cos telling shuhui is...
475     Nice line said by a broken heart- Plz don't cu...
5535    I know you are thinkin malaria. But relax, chi...
4747           Orh i tot u say she now still dun believe.
                              ...                        
4402         Many times we lose our best ones bcoz we are
3615                                         Ok c ü then.
4763                      Me too! Have a lovely night xxx
4339                          Yes when is the appt again?
1827    Dude. What's up. How Teresa. Hope you have bee...
Name: Message, Length: 4457, dtype: object


In [15]:
print(x_train_features)

  (0, 0)	0.23628394623676158
  (0, 1657)	0.28101404009316056
  (0, 6468)	0.26793132631329497
  (0, 4557)	0.28101404009316056
  (0, 421)	0.25144905621529934
  (0, 4306)	0.26793132631329497
  (0, 5029)	0.17467075796896542
  (0, 2644)	0.28101404009316056
  (0, 1540)	0.17407870571957915
  (0, 6330)	0.24059246244542992
  (0, 3627)	0.25144905621529934
  (0, 3113)	0.28101404009316056
  (0, 1193)	0.22908400928709988
  (0, 1857)	0.17073786814794129
  (0, 3806)	0.28101404009316056
  (0, 2353)	0.28101404009316056
  (1, 4076)	0.1543395674723974
  (1, 5416)	0.28967873139399253
  (1, 6977)	0.1293522168838017
  (1, 765)	0.21147006367289747
  (1, 1068)	0.1771111381363262
  (1, 3089)	0.13752009582621935
  (1, 3961)	0.20073435617244362
  (1, 3828)	0.13684128003316173
  (1, 2113)	0.19851614641109666
  :	:
  (4456, 3913)	0.23883125341667502
  (4456, 2138)	0.23883125341667502
  (4456, 6568)	0.23883125341667502
  (4456, 6646)	0.22771237505351186
  (4456, 5447)	0.2198234053076842
  (4456, 4878)	0.21370424497

## Model Training use Logistic Regression

In [16]:
model = LogisticRegression()
# training the logistic regression model with taining data
model.fit(x_train_features, y_train)

## Evaluating the trained model

In [17]:
# prediction on training data
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [18]:
print('Accuracy on training data = ', accuracy_on_training_data)

Accuracy on training data =  0.9667938074938299


In [19]:
# prediction on test data
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [20]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9713004484304932


# Building a Predictive System

In [23]:
input_mail = ["hey my name Abhimanyu jangid. i hope this mail is you find you"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Human mail')

else:
  print('Spam mail')

[1]
Human mail
