In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle
import time
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
phish_data = pd.read_csv('phishing_site_urls.csv', encoding='latin1')

# Check dataset info. DataFrame.info() writes its summary to stdout itself and
# returns None, so it is called directly instead of being wrapped in print()
# (which would emit an extra "None" line).
phish_data.info()
print(phish_data.isnull().sum())

# Handle NaN values: replace every missing cell with an empty string.
# NOTE(review): this applies to all columns, so a missing Label would become
# the empty-string label '' and act as its own class — confirm that is intended.
phish_data.fillna('', inplace=True)

# Bar chart of how many rows carry each label.
# The two-column frame (Label, Count) is built straight from value_counts().
label_counts = (
    phish_data['Label']
    .value_counts()
    .rename_axis('Label')
    .reset_index(name='Count')
)
sns.set_style('darkgrid')
sns.barplot(data=label_counts, x='Label', y='Count')
plt.show()

# Tokenize URLs: every maximal run of ASCII letters becomes one token.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
print('Getting words tokenized ...')
t0 = time.perf_counter()
# The bound method is passed directly; it is applied to each URL in turn.
phish_data['text_tokenized'] = phish_data['URL'].map(tokenizer.tokenize)
t1 = time.perf_counter() - t0
print(f'Time taken {t1} sec')
print(phish_data.sample(5))

# Stem tokenized words: each token list is mapped to a list of its stems.
stemmer = SnowballStemmer("english")
print('Getting words stemmed ...')
t0 = time.perf_counter()
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
t1 = time.perf_counter() - t0
print(f'Time taken {t1} sec')
print(phish_data.sample(5))

# Separate the rows by label value ('bad' vs. 'good').
bad_sites = phish_data.loc[phish_data['Label'] == 'bad']
good_sites = phish_data.loc[phish_data['Label'] == 'good']


# Feature extraction: the text_stemmed column already holds token lists, so
# both the preprocessor and the tokenizer are identity functions and the
# vectorizer only counts the tokens it is handed.
cv = CountVectorizer(preprocessor=lambda doc: doc, tokenizer=lambda doc: doc)
feature = cv.fit_transform(phish_data['text_stemmed'])

# Split data: hold out 30% of the rows for testing, with a fixed seed.
trainX, testX, trainY, testY = train_test_split(
    feature,
    phish_data['Label'],
    random_state=42,
    test_size=0.3,
)

# Logistic Regression
lr = LogisticRegression()
lr.fit(trainX, trainY)
print('Training Accuracy:', lr.score(trainX, trainY))
print('Testing Accuracy:', lr.score(testX, testY))

# Predict once and reuse the result for both the matrix and the report.
lr_pred = lr.predict(testX)
# Class names come from the fitted model rather than a hard-coded list, so the
# axis labels always match the number of classes actually present in the data.
lr_classes = list(lr.classes_)

# scikit-learn's metric functions take (y_true, y_pred): with that order the
# matrix rows are the actual labels and the columns the predicted ones, which
# is what the index/column captions below state.
con_mat = pd.DataFrame(
    confusion_matrix(testY, lr_pred, labels=lr_classes),
    columns=[f'Predicted:{c}' for c in lr_classes],
    index=[f'Actual:{c}' for c in lr_classes],
)
print(
    '\nCLASSIFICATION REPORT\n',
    classification_report(
        testY,
        lr_pred,
        labels=lr_classes,
        target_names=[str(c) for c in lr_classes],
    ),
)
print('\nCONFUSION MATRIX')
sns.heatmap(con_mat, annot=True, fmt='d', cmap="YlGnBu")
plt.show()

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(trainX, trainY)
print('Training Accuracy:', mnb.score(trainX, trainY))
print('Testing Accuracy:', mnb.score(testX, testY))

# Evaluate the Naive Bayes model itself: the predictions must come from `mnb`,
# not from the logistic-regression model fitted earlier.
mnb_pred = mnb.predict(testX)
# Class names come from the fitted model rather than a hard-coded list, so the
# axis labels always match the number of classes actually present in the data.
mnb_classes = list(mnb.classes_)

# scikit-learn's metric functions take (y_true, y_pred): with that order the
# matrix rows are the actual labels and the columns the predicted ones.
con_mat = pd.DataFrame(
    confusion_matrix(testY, mnb_pred, labels=mnb_classes),
    columns=[f'Predicted:{c}' for c in mnb_classes],
    index=[f'Actual:{c}' for c in mnb_classes],
)
print(
    '\nCLASSIFICATION REPORT\n',
    classification_report(
        testY,
        mnb_pred,
        labels=mnb_classes,
        target_names=[str(c) for c in mnb_classes],
    ),
)
print('\nCONFUSION MATRIX')
sns.heatmap(con_mat, annot=True, fmt='d', cmap="YlGnBu")
plt.show()

# Pipeline with Logistic Regression: CountVectorizer (letter-run tokens,
# English stop words removed) feeding LogisticRegression, so the estimator
# accepts raw URL strings directly.
pipeline_ls = make_pipeline(
    CountVectorizer(tokenizer=RegexpTokenizer(r'[A-Za-z]+').tokenize, stop_words='english'),
    LogisticRegression(),
)

# Re-split on the raw URL column; this rebinds trainX/testX/trainY/testY,
# which from here on hold URL strings instead of the count matrix.
trainX, testX, trainY, testY = train_test_split(phish_data.URL, phish_data.Label, test_size=0.3, random_state=42)
pipeline_ls.fit(trainX, trainY)
print('Training Accuracy:', pipeline_ls.score(trainX, trainY))
print('Testing Accuracy:', pipeline_ls.score(testX, testY))

# `pipeline_lr` is kept as a second name for the same fitted pipeline: building
# and fitting another identically configured pipeline on the same training
# data would only repeat the work.
pipeline_lr = pipeline_ls

# Predict on the test data
y_pred = pipeline_lr.predict(testX)

# Create a confusion matrix. Class names are read from the fitted pipeline
# instead of being hard-coded, and the arguments follow scikit-learn's
# (y_true, y_pred) order so rows are actual labels and columns predicted ones.
pipeline_classes = list(pipeline_lr.classes_)
con_mat = pd.DataFrame(
    confusion_matrix(testY, y_pred, labels=pipeline_classes),
    columns=[f'Predicted:{c}' for c in pipeline_classes],
    index=[f'Actual:{c}' for c in pipeline_classes],
)
print('\nCONFUSION MATRIX')
sns.heatmap(con_mat, annot=True, fmt='d', cmap="YlGnBu")
plt.show()

# Save model. The context manager closes (and therefore flushes) the file
# before it is read back below.
with open('phishing.pkl', 'wb') as model_file:
    pickle.dump(pipeline_ls, model_file)

# Load and test the model.
# NOTE: pickle.load executes code embedded in the file; only load pickles that
# come from a trusted source (here, the file written just above).
with open('phishing.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
test_accuracy = loaded_model.score(testX, testY)
print('Loaded model test accuracy:', test_accuracy)

# Run the reloaded model on a few hand-picked URLs and print its predictions.
predict_bad = [
    'yeniik.com.tr/wp-admin/js/login.alibaba.com/login.jsp.php',
    'fazan-pacir.rs/temp/libraries/ipad',
    'tubemoviez.exe',
    'svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt',
]
predict_good = ['umkc.edu/history/']

bad_predictions = loaded_model.predict(predict_bad)
print('Prediction for bad links:', bad_predictions)
good_predictions = loaded_model.predict(predict_good)
print('Prediction for good links:', good_predictions)