In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
df = pd.read_csv('mail_data.csv')
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [28]:
df.shape

(5572, 2)

---

In [29]:
# Encoding Variables

en_data = df['Category']
en_data.head(3)

0     ham
1     ham
2    spam
Name: Category, dtype: object

In [30]:
ar = pd.get_dummies(en_data).astype(int)

ar.head(4)

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0


In [31]:
df['spam'] = ar['spam']

In [32]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


---

In [33]:
X = df['Message']
Y = df['spam']

X.ndim, Y.ndim

(1, 1)

In [34]:
print(X), print(Y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: int64


(None, None)

---

## Feature Extraction


In [35]:
# Transform text data into numerical format

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)
X = feature_extraction.fit_transform(X)

In [36]:
X.ndim

2

In [37]:
# print(X)

---

## Split data into Train and Test

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

---

## Train Model

In [39]:
model = LogisticRegression()
model.fit(X_train, Y_train)

### Model Evaluation

In [40]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9524663677130045


In [41]:
model.score(X_test, Y_test), model.score(X_train, Y_train)

(0.9524663677130045, 0.9670181736594121)

In [42]:
confusion_matrix(Y_test, y_pred)

array([[960,   0],
       [ 53, 102]])

In [43]:
classification_report(Y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.95      1.00      0.97       960\n           1       1.00      0.66      0.79       155\n\n    accuracy                           0.95      1115\n   macro avg       0.97      0.83      0.88      1115\nweighted avg       0.95      0.95      0.95      1115\n'

----

## Building an Predictive system

In [44]:
#input_mail = ["Hey John, I hope you're doing well. Just wanted to check in and see if you're available for a quick call tomorrow to discuss the project updates. Let me know what time works best for you. Looking forward to catching up!"]
input_mail = ["WINNER!! Congratulations, you have been selected for a FREE VIP membership at ExclusiveRewards! Claim your cash prize of $1,000 NOW. Hurry! Offer expires in 24 hours. Click here: [Claim Now](#). Reply STOP to unsubscribe."]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction==1):
  print('Spam mail')
else:
  print('Ham mail')

[1]
Spam mail


---

### Export

In [None]:
# import pickle 

# pickle.dump(feature_extraction, open('tfdif_vectorizer_spam_mail.pkl', 'wb'))
# pickle.dump(model, open('spam_mail_model.sav', 'wb'))

---