In [None]:
# Import the libraries that are needed for the algorithm

import pandas as pd # use for data manipulation and analysis
import numpy as np # use for multi-dimensional array and matrix

import seaborn as sbn # use for high-level interface for drawing attractive and informative statistical graphics 
import matplotlib.pyplot as plt # It provides an object-oriented API for embedding plots into applications
%matplotlib inline 
# It sets the backend of matplotlib to the 'inline' backend:
import time # calculate time 

from sklearn.linear_model import LogisticRegression # algo use to predict good or bad
from sklearn.naive_bayes import MultinomialNB # nlp algo use to predict good or bad

from sklearn.model_selection import train_test_split # splitting the data into training and test sets
from sklearn.metrics import classification_report # gives whole report about metrics (e.g, recall,precision,f1_score,c_m)
from sklearn.metrics import confusion_matrix # gives info about actual and predict
from nltk.tokenize import RegexpTokenizer # regexp tokenizers use to split words from text  
from nltk.stem.snowball import SnowballStemmer # stemmes words
from sklearn.feature_extraction.text import CountVectorizer # create sparse matrix of words using regexptokenizes  
from sklearn.pipeline import make_pipeline # use for combining all preprocessing techniques and algos


##*********************************************************************************************

import pickle # use to dump model 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from a CSV file given by a relative path.
# Later cells read its `URL` and `Label` columns.
phish_website_data = pd.read_csv('phishing_site_urls.csv')

In [None]:
# Preview the first few rows of the loaded frame.
phish_website_data.head()

In [None]:
# Print a structural summary of the frame (columns, dtypes, non-null counts).
phish_website_data.info()

In [None]:
# Number of missing values in each column.
phish_website_data.isna().sum()

In [None]:
# Count the rows per class label and keep the tally as a one-column DataFrame.
# The column is named 'Label' explicitly: pandas >= 2.0 names the result of
# value_counts() 'count', which would break the `l_counts.Label` lookup used
# when the counts are plotted.
l_counts = phish_website_data.Label.value_counts().rename('Label').to_frame()

In [None]:
# Bar chart of the class balance: one bar per label, height = number of rows.
sbn.set_style('darkgrid')
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts the data
# vectors positionally. The counts are taken by position so the plot does not
# depend on what the single column happens to be called.
sbn.barplot(x=l_counts.index, y=l_counts.iloc[:, 0])

In [None]:
# Tokenizer that keeps maximal runs of ASCII letters; digits, dots, slashes
# and every other character are not matched and so never appear in a token.
token = RegexpTokenizer(r'[A-Za-z]+')

In [None]:
# Show the URL stored under index label 0 as a sample input.
phish_website_data.URL[0]

In [None]:
# Sanity check: tokenize that same sample URL to see what the tokenizer yields.
token.tokenize(phish_website_data.URL[0])

In [None]:
# Split every URL into its alphabetic tokens, store the token lists in a new
# `text_tokenized` column, and report how long the pass took.
print('Getting words tokenized ...')
t0 = time.perf_counter()
phish_website_data['text_tokenized'] = phish_website_data['URL'].map(token.tokenize)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Inspect five randomly drawn rows; no seed is passed, so the rows differ per run.
phish_website_data.sample(5)

In [None]:
# Snowball stemmer configured for English; used below to stem each URL token.
snowstem = SnowballStemmer("english")

In [None]:
# Another random five-row peek. The frame has not been modified since the
# previous sample (only the stemmer object was created), so this shows the
# same columns as before.
phish_website_data.sample(5)

In [None]:
# Stem every token of every row, store the stemmed lists in a new
# `text_stemmed` column, and report how long the pass took.
print('Getting words stemmed ...')
t0 = time.perf_counter()
phish_website_data['text_stemmed'] = phish_website_data['text_tokenized'].map(
    lambda words: list(map(snowstem.stem, words))
)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Glue each row's stemmed tokens back into one space-separated string
# (`text_sent`), the form the vectorizer consumes, and time the pass.
print('Getting joiningwords ...')
t0 = time.perf_counter()
phish_website_data['text_sent'] = phish_website_data['text_stemmed'].map(' '.join)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Random five-row peek at the frame now that the `text_stemmed` and
# `text_sent` columns have been added.
phish_website_data.sample(5)

In [None]:
# Partition the rows by class label for side-by-side inspection.
bad_sites = phish_website_data.loc[phish_website_data['Label'] == 'bad']
good_sites = phish_website_data.loc[phish_website_data['Label'] == 'good']

In [None]:
# First few rows whose Label is 'good'.
good_sites.head()

In [None]:
# First few rows whose Label is 'bad'.
bad_sites.head()

In [None]:
# Bag-of-words vectorizer with default settings; fitted on `text_sent` below.
cv = CountVectorizer()

In [None]:
# Feature matrix: learn the vocabulary from the `text_sent` column and turn
# every row into a token-count vector in one step.
# NOTE(review): the vectorizer is fitted on ALL rows before the train/test
# split below, so the vocabulary is also learned from rows that end up in the
# test set. Consider fitting on the training portion only.
ffeat = cv.fit_transform(phish_website_data.text_sent)

In [None]:
# Densify only the first five rows for display; converting the whole matrix
# with toarray() would materialise every zero entry.
ffeat[:5].toarray()

In [None]:
# Hold out part of the vectorized data for evaluation (default split ratio).
# A fixed random_state makes the split, and therefore every score reported
# below, reproducible across Restart & Run All.
trainX, testX, trainY, testY = train_test_split(ffeat, phish_website_data.Label, random_state=42)

In [None]:
# Logistic-regression classifier with default hyperparameters.
# NOTE(review): warnings are globally silenced in the import cell, so a
# solver convergence warning would not be shown here. Confirm that the default
# iteration limit is enough for this data.
lr = LogisticRegression()

In [None]:
# Train the logistic-regression model on the training split.
lr.fit(trainX,trainY)

In [None]:
# Score the trained model on the held-out test split.
lr.score(testX,testY)

In [None]:
# Scoreboard of test scores per model, rounded to two decimals; later cells
# add further entries and plot it.
Scores_ml = {
    'Logistic Regression': np.round(lr.score(testX, testY), 2),
}

In [None]:
# Evaluate the logistic-regression model: accuracy on both splits, a
# classification report, and a confusion-matrix heatmap for the test split.
lr_test_pred = lr.predict(testX)  # predict once, reuse for matrix and report

print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))

# scikit-learn's metric functions take (y_true, y_pred) in that order. With
# the true labels first, matrix rows are the actual classes and columns the
# predicted ones, which is what the index/column names below claim. Classes
# are ordered by sorted label, so 'bad' comes before 'good'.
con_mat = pd.DataFrame(confusion_matrix(testY, lr_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
# Same argument order here; swapping them exchanges precision and recall.
print(classification_report(testY, lr_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sbn.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

In [None]:
# Multinomial naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [None]:
# Train the naive Bayes model on the same training split as the logistic regression.
mnb.fit(trainX,trainY)

In [None]:
# Score the naive Bayes model on the held-out test split.
mnb.score(testX,testY)

In [None]:
# Record the naive Bayes test score (two decimals) next to the logistic-regression entry.
Scores_ml['MultinomialNB'] = np.round(mnb.score(testX,testY),2)

In [None]:
# Evaluate the naive Bayes model: accuracy on both splits, a classification
# report, and a confusion-matrix heatmap for the test split.
mnb_test_pred = mnb.predict(testX)  # predict once, reuse for matrix and report

print('Training Accuracy :',mnb.score(trainX,trainY))
print('Testing Accuracy :',mnb.score(testX,testY))

# scikit-learn's metric functions take (y_true, y_pred) in that order. With
# the true labels first, matrix rows are the actual classes and columns the
# predicted ones, which is what the index/column names below claim. Classes
# are ordered by sorted label, so 'bad' comes before 'good'.
con_mat = pd.DataFrame(confusion_matrix(testY, mnb_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
# Same argument order here; swapping them exchanges precision and recall.
print(classification_report(testY, mnb_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sbn.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

In [None]:
# Compare the models: one bar per entry in the Scores_ml scoreboard.
acc = pd.DataFrame.from_dict(Scores_ml,orient = 'index',columns=['Accuracy'])
sbn.set_style('darkgrid')
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts the data
# vectors positionally.
sbn.barplot(x=acc.index, y=acc.Accuracy)

In [None]:
# End-to-end model that accepts raw URL strings: a count vectorizer (letters-only
# tokenizer, English stop words removed) feeding a logistic regression.
pipeline_ls = make_pipeline(
    CountVectorizer(
        tokenizer=RegexpTokenizer(r'[A-Za-z]+').tokenize,
        stop_words='english',
    ),
    LogisticRegression(),
)

In [None]:
# Re-split, this time on the raw URL strings, since the pipeline does its own
# vectorization. This rebinds trainX/testX/trainY/testY, so the earlier `lr`
# and `mnb` evaluation cells must not be re-run after this point.
# A fixed random_state keeps the split and the reported scores reproducible.
trainX, testX, trainY, testY = train_test_split(phish_website_data.URL, phish_website_data.Label, random_state=42)

In [None]:
# Fit the vectorizer and the classifier together on the training URLs only.
pipeline_ls.fit(trainX,trainY)

In [None]:
# Score the fitted pipeline on the held-out raw URLs.
pipeline_ls.score(testX,testY) 

In [None]:
# Evaluate the URL pipeline: accuracy on both splits, a classification
# report, and a confusion-matrix heatmap for the test split.
pipeline_test_pred = pipeline_ls.predict(testX)  # predict once, reuse below

print('Training Accuracy :',pipeline_ls.score(trainX,trainY))
print('Testing Accuracy :',pipeline_ls.score(testX,testY))

# scikit-learn's metric functions take (y_true, y_pred) in that order. With
# the true labels first, matrix rows are the actual classes and columns the
# predicted ones, which is what the index/column names below claim. Classes
# are ordered by sorted label, so 'bad' comes before 'good'.
con_mat = pd.DataFrame(confusion_matrix(testY, pipeline_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
# Same argument order here; swapping them exchanges precision and recall.
print(classification_report(testY, pipeline_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sbn.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

In [None]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open('phishing.pkl', 'wb') as model_file:
    pickle.dump(pipeline_ls, model_file)

In [None]:
# Round-trip check: reload the pickled pipeline and confirm it still scores
# the test split.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open('phishing.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(testX,testY)
print(result)

In [None]:
# Demo: classify two hand-picked URLs with the reloaded pipeline. The pipeline
# vectorizes raw strings itself, so the inputs are passed as plain lists.
predict_bad = ['marketplace.axieinfinity.com/']
predict_good = ['www.phishprotection.com/content/phishing-prevention/']
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open('phishing.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(predict_bad)
result2 = loaded_model.predict(predict_good)
print('The 1st website is: ',result)
print()
print("="*50)
print()
print('The 2nd website is: ',result2)