In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
raw_mail_data = pd.read_csv('/content/balanced_mail_data.csv')

In [3]:
print(raw_mail_data)

     Category                                            Message
0      normal              Does uncle timi help in clearing cars
1      normal                            Ya ok, then had dinner?
2      normal                Its ok, called mom instead have fun
3        spam  Had your mobile 10 mths? Update to latest Oran...
4      normal                            Watching ajith film ah?
...       ...                                                ...
9645     spam  Exclusive deal just for you! Unlock premium co...
9646   normal                             Sorry, I'll call later
9647   normal                         Nt joking seriously i told
9648   normal            Did he just say somebody is named tampa
9649     spam  Exclusive deal just for you! Unlock premium co...

[9650 rows x 2 columns]


In [4]:
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [5]:
mail_data.head()

Unnamed: 0,Category,Message
0,normal,Does uncle timi help in clearing cars
1,normal,"Ya ok, then had dinner?"
2,normal,"Its ok, called mom instead have fun"
3,spam,Had your mobile 10 mths? Update to latest Oran...
4,normal,Watching ajith film ah?


In [6]:
mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'normal', 'Category',] = 1

spam  -  0

normal  -  1

In [7]:
X = mail_data['Message']
Y = mail_data['Category']

In [8]:
print(X)

0                   Does uncle timi help in clearing cars
1                                 Ya ok, then had dinner?
2                     Its ok, called mom instead have fun
3       Had your mobile 10 mths? Update to latest Oran...
4                                 Watching ajith film ah?
                              ...                        
9645    Exclusive deal just for you! Unlock premium co...
9646                               Sorry, I'll call later
9647                           Nt joking seriously i told
9648              Did he just say somebody is named tampa
9649    Exclusive deal just for you! Unlock premium co...
Name: Message, Length: 9650, dtype: object


In [9]:
print(Y)

0       1
1       1
2       1
3       0
4       1
       ..
9645    0
9646    1
9647    1
9648    1
9649    0
Name: Category, Length: 9650, dtype: object


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [11]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(9650,)
(7720,)
(1930,)


In [12]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True) 

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [13]:
print(X_train)

3418       I dunno until when... Lets go learn pilates...
8929    Win an iPhone 14! Just answer a few simple que...
1417    Baaaaaaaabe! Wake up ! I miss you ! I crave yo...
9484    Have a lovely night and when you wake up to se...
5270                             K..k.:)congratulation ..
                              ...                        
5734    You've been selected for a free vacation. Clai...
5191    Breaking news! Click to see the latest celebri...
5390    Win an iPhone 14! Just answer a few simple que...
860     What i told before i tell. Stupid hear after i...
7270    Tomarrow i want to got to court. At  &lt;DECIM...
Name: Message, Length: 7720, dtype: object


In [14]:
print(X_train_features)

  (0, 2461)	0.43501225545849453
  (0, 4113)	0.48543383772898013
  (0, 4084)	0.5362454320498347
  (0, 5183)	0.5362454320498347
  (1, 7391)	0.3191897590392048
  (1, 3767)	0.32998103755395264
  (1, 304)	0.3295008935319374
  (1, 3907)	0.24562478938677473
  (1, 983)	0.32598941233464923
  (1, 6106)	0.327842874642817
  (1, 5488)	0.3276088138141544
  (1, 3566)	0.14744397505748041
  (1, 2737)	0.5060384875305659
  (1, 1873)	0.146409622769892
  (2, 1169)	0.5823088611049883
  (2, 7243)	0.4314952493004759
  (2, 4526)	0.3738659123129297
  (2, 2039)	0.4810846136147973
  (2, 4733)	0.3217298777907543
  (3, 7243)	0.3531603401204358
  (3, 4256)	0.37642680376416104
  (3, 4778)	0.2823875196529942
  (3, 4477)	0.289345799696391
  (3, 3524)	0.2823875196529942
  (3, 6183)	0.33719475190916154
  :	:
  (7717, 983)	0.32561994475893236
  (7717, 6106)	0.3274713064030915
  (7717, 5488)	0.32723751085263636
  (7717, 3566)	0.1472768660473184
  (7717, 1873)	0.1462436860666138
  (7717, 2767)	0.5077009792109456
  (7718, 66

In [15]:
model = LogisticRegression()
model.fit(X_train_features, Y_train)

In [16]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [17]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9845854922279793


In [18]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [19]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9818652849740933


In [23]:
input_mail = ["Your account has been temporarily suspended due to suspicious activity. To reactivate it, please click the link below and verify your information immediately. If you don’t respond, your account will remain suspended."]
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Normal mail')

else:
  print('Spam mail')

[0]
Spam mail


In [22]:
# input_mail = input("Enter email message : ")

# # convert text to feature vectors
# # The following line is changed to pass a list containing the input string.
# input_data_features = feature_extraction.transform([input_mail])

# # making prediction

# prediction = model.predict(input_data_features)
# print(prediction)


# if (prediction[0]==1):
#   print('Normal mail')

# else:
#   print('Spam mail')

Enter email message : Hi Rahul, we've noticed unusual activity on your account and need to verify some details. Please confirm your information here to keep your account secure.
[1]
Normal mail


In [23]:
# import numpy as np
# import pandas as pd
# import re
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# # Load dataset
# mail_data = pd.read_csv('/content/balanced_mail_data.csv')
# mail_data = mail_data.where((pd.notnull(mail_data)), '')
# # mail_data['Category'] = mail_data['Category'].replace({'ham': 'normal'})

# # TF-IDF Vectorization with bigrams
# vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
# x_features = vectorizer.fit_transform(mail_data['Message'])

# # Prepare target labels
# y = mail_data['Category'].apply(lambda x: 1 if x == 'normal' else 0)

# # Split data into training and test sets
# x_train, x_test, y_train, y_test = train_test_split(x_features, y, test_size=0.2, random_state=42)

# # Train a Random Forest model
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(x_train, y_train)

# # Predictions & Accuracy
# predictions = model.predict(x_test)
# accuracy = accuracy_score(y_test, predictions)
# print(f'Model Accuracy: {accuracy * 100:.2f}%')

# # Function to predict phishing emails
# def classify_email(email_text):
#     email_features = vectorizer.transform([email_text])
#     prediction = model.predict(email_features)
#     return 'Normal mail' if prediction[0] == 1 else 'Phishing mail'

# # Example Test
# email = input("Enter input message : ")
# print(classify_email(email))


Model Accuracy: 98.34%
Enter input message : Hi Rahul, we've noticed unusual activity on your account and need to verify some details. Please confirm your information here to keep your account secure.
Normal mail


In [24]:
import pickle
model_file_path = "phishing_detection_model.pkl"
with open(model_file_path, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"Model saved successfully as {model_file_path}")

Model saved successfully as phishing_detection_model.pkl


In [25]:
vectorizer_file_path = "tfidf_vectorizer.pkl"
with open(vectorizer_file_path, 'wb') as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)

print(f"TF-IDF vectorizer saved successfully as {vectorizer_file_path}")

TF-IDF vectorizer saved successfully as tfidf_vectorizer.pkl
