### **Malicious URL Detector**

## Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Creating Data Frame

In [None]:
# Read the URL dataset from the CSV file into a pandas DataFrame.
dataframe = pd.read_csv(filepath_or_buffer="Malicious_Url.csv")

# Cleansing of Data 

***Importing Packages for cleansing***

In [None]:
# Regular expressions are used to strip non-letter characters from the URLs.
import re 
import nltk
# Fetch the NLTK stop-word corpus; stopwords.words('english') below needs it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Porter stemmer used to reduce each word to its stem during cleaning.
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


***Cleaning Data***

In [None]:
corpus = []  # cleaned, stemmed text of every URL, in row order

# The stemmer and the stop-word set do not change between rows, so build them
# once here instead of once per row / once per word inside the loop.
stemmer = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Iterate over the column values directly (positional order) rather than
# looking rows up by label, so a non-default index cannot break the loop.
for url in dataframe['url']:
  # Replace every non-letter with a space, lower-case, and split into words.
  words = re.sub('[^a-zA-Z]', ' ', url).lower().split()
  # Drop English stop words and stem what remains.
  stems = [stemmer.stem(word) for word in words if word not in english_stopwords]
  corpus.append(' '.join(stems))

In [None]:
# Show only a short preview: printing the whole corpus floods the notebook
# output with one entry per row of the dataset.
print(corpus[:10])

***Creating the matrix of features*** and ***Dependent Variables***

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the cleaned corpus and count
# the occurrences of each vocabulary word per URL, then densify the result.
# NOTE(review): the vocabulary is learned from the full corpus, before the
# train/test split further down - consider fitting on the training part only.
cv = CountVectorizer()
word_counts = cv.fit_transform(corpus)
x = word_counts.toarray()

In [None]:
# Display the (abbreviated) word-count matrix built by the vectorizer.
print("matrix of features\n", x)

matrix of features
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Encoding Categorical Data

***Encoding Dependent Variables***

In [None]:
from sklearn.preprocessing import LabelEncoder

# The label is read from the last column of the dataframe and converted to
# integer class codes by the encoder.
le = LabelEncoder()
y = le.fit_transform(dataframe.iloc[:, -1].values)

In [None]:
# y holds the encoded labels (the dependent variable), not the features, so
# the printed heading says so.
print("Dependent variable\n", y)
# LabelEncoder numbers the classes in sorted order; print that order so the
# meaning of each code can be checked against the data instead of assumed.
# Expected mapping (verify against the line printed below):
# 1 - Malicious
# 0 - Benign
print("Classes (position = encoded value)", list(le.classes_))

Features
 [1 0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 0 0
 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0
 0 1 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0
 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1
 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 

# Splitting of Data for Training and Testing Purpose

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Training the Model

## ***1. Logistic Regression***

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# training split. The fit call stays last so the cell displays the estimator.
classifier = LogisticRegression()
classifier.fit(X=x_train, y=y_train)

LogisticRegression()

# Creating Confusion Matrix and finding the Accuracy

In [None]:
# Predicted class codes for the held-out test samples.
y_pred = classifier.predict(X=x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Report each evaluation metric as "<heading> <value>", one per line.
metric_report = (
  ("Confusion Matrix\n", cm),
  ("Accuracy ", accuracy),
  ("F1- score ", f1_score(y_test, y_pred)),
)
for heading, value in metric_report:
  print(heading, value)

Confusion Matrix
 [[139   2]
 [ 19  40]]
Accuracy  0.895
F1- score  0.7920792079207921


# Testing Against Custom Inputs

In [None]:
# Hand-picked URLs used to try the trained classifier on unseen input.
new = [
    "google.com/search=jcharistech",
    "google.com/search=faizanahmad",
    "pakistanifacebookforever.com/getpassword.php/",
    "www.radsport-voggel.de/wp-admin/includes/log.exe",
    "ahrenhei.without-transfer.ru/nethost.exe ",
    "www.itidea.it/centroesteticosothys/img/_notes/gum.exe",
]

In [None]:
# Clean the custom URLs the same way as the training corpus (letters only,
# lower-case, stop words removed, stemmed). The vectorizer's vocabulary was
# learned from cleaned text, so raw URLs would not line up with it.
custom_stemmer = PorterStemmer()
custom_stopwords = set(stopwords.words('english'))
new_cleaned = [
  ' '.join(
    custom_stemmer.stem(word)
    for word in re.sub('[^a-zA-Z]', ' ', url).lower().split()
    if word not in custom_stopwords
  )
  for url in new
]

# Store the features under their own name instead of overwriting `new`:
# the raw URL list stays intact, so this cell can be re-run without failing.
new_features = cv.transform(new_cleaned).toarray()
custom_predict = classifier.predict(new_features)
print(custom_predict)

[0 0 0 1 1 1]
