In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [46]:
# read the labelled spam/ham messages from the CSV file into a DataFrame
csv_path = '/content/balanced_spam_ham_5000.csv'
raw_mail_data = pd.read_csv(csv_path)

In [47]:
# preview the loaded frame; pandas abbreviates the printout to the first and last rows
print(raw_mail_data)

     Category                                            Message
0        spam      Urgent: Update your bank details immediately.
1        spam      Win a brand new iPhone today, click the link.
2        spam      Earn money from home, no experience required.
3        spam  Congratulations! You have been selected for a ...
4        spam      Earn money from home, no experience required.
...       ...                                                ...
4995      ham           Please find the project report attached.
4996     spam      Win a brand new iPhone today, click the link.
4997     spam        Final notice: Pay your bill or lose access.
4998     spam      Earn money from home, no experience required.
4999      ham           Please find the project report attached.

[5000 rows x 2 columns]


In [48]:
# swap every missing entry for an empty string, leaving all other cells untouched
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, other='')

In [49]:
# preview the first five rows of the null-free frame
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,spam,Urgent: Update your bank details immediately.
1,spam,"Win a brand new iPhone today, click the link."
2,spam,"Earn money from home, no experience required."
3,spam,Congratulations! You have been selected for a ...
4,spam,"Earn money from home, no experience required."


In [50]:
# class balance: absolute counts first, then the share of each class
category_column = mail_data['Category']
print(category_column.value_counts())
print(category_column.value_counts(normalize=True))  # proportions that sum to 1


Category
spam    2500
ham     2500
Name: count, dtype: int64
Category
spam    0.5
ham     0.5
Name: proportion, dtype: float64


In [51]:
# dimensions of the null-free frame as (rows, columns)
mail_data.shape

(5000, 2)

In [52]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
LABEL_CODES = {'spam': 0, 'ham': 1}

# Fail fast on labels the encoding does not cover. Blank strings left by the
# null replacement, or a misspelt category, would otherwise stay in the column
# and only surface later as an opaque error in the integer cast.
# Already-encoded 0/1 values are accepted so the cell can be re-run safely.
known_labels = set(LABEL_CODES) | set(LABEL_CODES.values())
unexpected_labels = set(mail_data['Category'].unique()) - known_labels
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'Category': {sorted(unexpected_labels, key=repr)}"
    )

# Overwrite the labels in place; the column keeps its object dtype here and is
# cast to integers after the train/test split.
for label_name, label_code in LABEL_CODES.items():
    mail_data.loc[mail_data['Category'] == label_name, 'Category'] = label_code

In [53]:
# separating the data as texts and label

# inputs (X) are the message texts, targets (Y) are the encoded category labels
X, Y = mail_data['Message'], mail_data['Category']

In [54]:
# show the message texts that serve as the model input
print(X)

0           Urgent: Update your bank details immediately.
1           Win a brand new iPhone today, click the link.
2           Earn money from home, no experience required.
3       Congratulations! You have been selected for a ...
4           Earn money from home, no experience required.
                              ...                        
4995             Please find the project report attached.
4996        Win a brand new iPhone today, click the link.
4997          Final notice: Pay your bill or lose access.
4998        Earn money from home, no experience required.
4999             Please find the project report attached.
Name: Message, Length: 5000, dtype: object


In [55]:
# show the encoded labels (0 = spam, 1 = ham)
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
4995    1
4996    0
4997    0
4998    0
4999    1
Name: Category, Length: 5000, dtype: object


**Splitting data into training and testing data**

In [56]:
# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
# NOTE(review): no `stratify=` is passed, so the spam/ham ratio inside each
# split is left to chance -- consider `stratify=Y`.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.2,
)

In [57]:
# row counts of the full set, the training split and the test split, in that order
for split in (X, X_train, X_test):
    print(split.shape)

(5000,)
(4000,)
(1000,)


Feature extraction

In [58]:
# transform the text data into TF-IDF vectors that can be used for model training
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# the labels are still object-typed at this point, so cast both splits to integers
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [59]:
# inspect the messages that ended up in the training split
print(X_train)

868              Let’s catch up over coffee this weekend.
936        Don’t forget to bring the documents for class.
4034             You won a lottery! Claim your prize now.
1923    Get cheap medicines without prescription, orde...
3330    Congratulations! You have been selected for a ...
                              ...                        
3335       Don’t forget to bring the documents for class.
1099          Are you coming to the office party tonight?
2514       Don’t forget to bring the documents for class.
3606              Hi John, are we still meeting tomorrow?
2575        Win a brand new iPhone today, click the link.
Name: Message, Length: 4000, dtype: object


In [60]:
# inspect the sparse TF-IDF matrix built from the training messages
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 17917 stored elements and shape (4000, 83)>
  Coords	Values
  (0, 40)	0.5
  (0, 11)	0.5
  (0, 16)	0.5
  (0, 77)	0.5
  (1, 23)	0.447213595499958
  (1, 29)	0.447213595499958
  (1, 8)	0.447213595499958
  (1, 22)	0.447213595499958
  (1, 14)	0.447213595499958
  (2, 80)	0.5192181009523815
  (2, 45)	0.5192181009523815
  (2, 13)	0.43730731863325556
  (2, 60)	0.5192181009523815
  (3, 12)	0.5
  (3, 47)	0.5
  (3, 59)	0.5
  (3, 54)	0.5
  (4, 19)	0.5
  (4, 65)	0.5
  (4, 31)	0.5
  (4, 10)	0.5
  (5, 28)	0.4472135954999579
  (5, 51)	0.4472135954999579
  (5, 57)	0.4472135954999579
  (5, 44)	0.4472135954999579
  :	:
  (3995, 23)	0.447213595499958
  (3995, 29)	0.447213595499958
  (3995, 8)	0.447213595499958
  (3995, 22)	0.447213595499958
  (3995, 14)	0.447213595499958
  (3996, 17)	0.5
  (3996, 53)	0.5
  (3996, 56)	0.5
  (3996, 71)	0.5
  (3997, 23)	0.447213595499958
  (3997, 29)	0.447213595499958
  (3997, 8)	0.447213595499958
  (3997, 22)	0.447

# **Model Training**

Logistic regression is used to classify each email as spam or ham.

In [61]:
# logistic regression classifier; no arguments are passed, so library defaults apply
model = LogisticRegression()

In [62]:
# fit the classifier on the TF-IDF training features and their integer labels
model.fit(X_train_features, Y_train)

**Evaluating the model**

In [63]:
# predict labels for the messages the model was fitted on and score them
# against the true training labels
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [64]:
# report the fraction of training messages that were classified correctly
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  1.0


In [65]:
# predict labels for the held-out messages and score them against the true test labels
# NOTE(review): the printed data shows the same message texts repeated many
# times, so identical texts probably occur in both the training and the test
# split. This score may reflect memorised templates rather than generalisation
# -- TODO confirm by de-duplicating the messages before splitting.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [66]:
# report the fraction of held-out test messages that were classified correctly
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  1.0


# Building a Predictive System

In [69]:
# a single example message to classify, wrapped in a list because the
# vectorizer expects an iterable of documents
input_mail = ["Your bank account has been temporarily suspended.Please verify your details immediately to restore access"]

# encode the message with the vectorizer that was fitted on the training texts
input_data_features = feature_extraction.transform(input_mail)

# predict the label and show the raw model output
prediction = model.predict(input_data_features)
print(prediction)

# label 1 is reported as ham; any other label is reported as spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail
