In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

data = pd.read_csv(r"C:\Users\Anjali\Desktop\spam_detection\mail_data.csv")

In [2]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
mail_data = data.where(pd.notnull(data), '')

In [4]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
mail_data.shape

(5572, 2)

Label Encoding 

In [6]:

from sklearn.preprocessing import LabelEncoder
mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1

Seperating the data as text and labels 

In [7]:
X = mail_data['Message']
y = mail_data['Category']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)#same manner
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4457,) (1115,) (4457,) (1115,)


In [9]:
#transform text data to feature vectors to be used as input 
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
#repeated values get a score according to its occurance in the data
X_train_feature = vectorizer.fit_transform(X_train)
X_test_feature = vectorizer.transform(X_test)

In [10]:
print(X_train_feature)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [11]:
#covert y to integers
y_train = y_train.astype('int')
print(y_train)
y_test = y_test.astype('int')

3075    1
1787    1
1614    1
4304    1
3266    0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 4457, dtype: int32


In [12]:
X_train_feature = pd.DataFrame(X_train_feature.toarray())
X_test_feature = pd.DataFrame(X_test_feature.toarray())
#find variance
print(X_train_feature.var())

0       0.000107
1       0.000413
2       0.000013
3       0.000018
4       0.000016
          ...   
7426    0.000017
7427    0.000006
7428    0.000008
7429    0.000012
7430    0.000037
Length: 7431, dtype: float64


In [13]:
variance = X_train_feature.var()
columns = X_train_feature.columns

In [14]:
variable = []
for i in range(len(variance)):
    if variance[i] > 0.001:
        variable.append(columns[i])
    

In [15]:
print(variable)

[861, 918, 1047, 1140, 1507, 1517, 1584, 1760, 1837, 1843, 1929, 2065, 2103, 2113, 2224, 2253, 2319, 2329, 2332, 2411, 2756, 2759, 2870, 3004, 3053, 3063, 3065, 3081, 3113, 3133, 3156, 3167, 3208, 3229, 3267, 3281, 3285, 3332, 3348, 3466, 3470, 3677, 3722, 3811, 3880, 3882, 3912, 3923, 3935, 3951, 3962, 4014, 4038, 4044, 4060, 4080, 4104, 4113, 4178, 4267, 4269, 4298, 4324, 4350, 4386, 4413, 4428, 4456, 4557, 4582, 4594, 4602, 4674, 4715, 4724, 4729, 4734, 4842, 4986, 5000, 5028, 5033, 5056, 5204, 5220, 5382, 5391, 5497, 5560, 5653, 5696, 5778, 5785, 5946, 5981, 6082, 6091, 6123, 6249, 6380, 6452, 6507, 6541, 6556, 6557, 6562, 6587, 6589, 6605, 6637, 6677, 6686, 6697, 6704, 6916, 6976, 7067, 7070, 7085, 7090, 7109, 7113, 7124, 7150, 7171, 7238, 7288, 7292, 7355, 7368, 7379, 7390, 7414]


In [16]:
new_data  = X_train_feature[variable]
new_data_test = X_test_feature[variable]

In [17]:
print(new_data)

      861       918   1047  1140  1507  1517  1584  1760      1837  1843  ...  \
0      0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
1      0.0  0.228716   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
2      0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
3      0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
4      0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
...    ...       ...   ...   ...   ...   ...   ...   ...       ...   ...  ...   
4452   0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
4453   0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.362248   0.0  ...   
4454   0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
4455   0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   
4456   0.0  0.000000   0.0   0.0   0.0   0.0   0.0   0.0  0.000000   0.0  ...   

          7150  7171  7238 

In [18]:
#logistic regression from scratch 
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def logistic_regression(X, y, learning_rate, num_iterations):
    #initialize theta
    theta = np.zeros(X.shape)
    #iterate through the number of iterations
    for i in range(num_iterations):
        #prediction
        y_pred = sigmoid(np.dot(X, theta))
        #update theta
        theta = theta - learning_rate * np.dot(X.T, (y_pred - y))
    return theta




In [19]:
print(X_train_feature.shape)

(4457, 7431)


In [20]:
#import logistic regression
from sklearn.linear_model import LogisticRegression
#create a logistic regression model
logreg = LogisticRegression()
#fit the model
logreg.fit(X_train_feature, y_train)

LogisticRegression()

In [21]:
#accuracy score of training data
print(logreg.score(X_train_feature, y_train))


0.9670181736594121


In [22]:
#prediction on training data 
y_pred = logreg.predict(X_train_feature)
print(y_pred)
#prediction on test data
y_pred = logreg.predict(X_test_feature)
print(y_pred)
#confusion matrix
# confusion_matrix = confusion_matrix(y_test, y_pred)
# print(confusion_matrix)
#classification report
print(classification_report(y_test, y_pred))
#accuracy score
print(accuracy_score(y_test, y_pred))


[1 1 1 ... 1 1 0]
[0 1 1 ... 1 1 1]
              precision    recall  f1-score   support

           0       1.00      0.75      0.86       155
           1       0.96      1.00      0.98       960

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

0.9659192825112107


In [23]:
#naive bayes
def gaussian(x, mean, std):
    exponent = np.exp(-(x-mean)**2/(2*std**2))
    return (1 / (np.sqrt(2*np.pi) * std)) * exponent

def naive_bayes(x, mean, std, p):
    return p * gaussian(x, mean, std)


In [24]:
#training the model 
model = naive_bayes(X_train_feature, y_train, 1, 0.5)

In [25]:
print(model)


          0         1         2         3         4         5     6     \
0     0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
1     0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
2     0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
3     0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
4     0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
...        ...       ...       ...       ...       ...       ...   ...   
4452  0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
4453  0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
4454  0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
4455  0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   
4456  0.120985  0.120985  0.199471  0.120985  0.120985  0.199471   NaN   

          7         8     9     ...  7421  7422  7423  7424  7425  7426  7427  \
0     0.120985  0.199471   NaN

In [26]:
#find accuracy
# y_pred = model.predict(X_train_feature)
# print(y_pred)

In [27]:
#import naive bayes
from sklearn.naive_bayes import GaussianNB
#create a naive bayes model
gnb = GaussianNB()
#fit the model
gnb.fit(X_train_feature, y_train)


GaussianNB()

In [28]:

y_pred = gnb.predict(X_train_feature)
print(y_pred)

[0 1 1 ... 1 1 0]


In [29]:
#accuracy score
print(accuracy_score(y_train, y_pred))

0.934709445815571


In [30]:
#on test data
y_pred = gnb.predict(X_test_feature)
print(y_pred)

[0 1 0 ... 1 1 0]


In [31]:
#accuracy score
print(accuracy_score(y_test, y_pred))

0.8878923766816144


In [47]:
input_mail = ["awesxtvygbuhnijmok,pcfvghjuioko"]
input_mail = np.array(input_mail)
input_mail_feature = vectorizer.transform(input_mail)
input_mail_feature = pd.DataFrame(input_mail_feature.toarray())

In [48]:
#predict the input mail
prediction = gnb.predict(input_mail_feature)
print(prediction)


[0]


In [40]:
prediction_2 = logreg.predict(input_mail_feature)
print(prediction_2)

[1]


In [35]:
import joblib
joblib.dump(logreg, 'logreg.pkl')
joblib.dump(gnb, 'gnb.pkl')


['gnb.pkl']

In [36]:
from flask import Flask
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
    input_mail = request.form['input_mail']
    input_mail = np.array([input_mail])
    input_mail_feature = vectorizer.transform(input_mail)
    input_mail_feature = pd.DataFrame(input_mail_feature.toarray())
    prediction = gnb.predict(input_mail_feature)
    prediction_2 = logreg.predict(input_mail_feature)
    return str(prediction)
    


In [37]:
#import joblib


classifier = joblib.load('logreg.pkl')
prediction = classifier.predict(input_mail_feature)

