<a href="https://colab.research.google.com/github/bohsiang/URL_detection/blob/master/URL_weight_define.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive')
# Log in and enter the authorization code when prompted.

In [0]:
import pandas as pd
# Location of the URL dataset (CSV) under the mounted Drive folder.
DATA_FILE ='/content/drive/My Drive/URL.csv' 
# Load the CSV; header=0 takes the column names from the first row.
data_csv = pd.read_csv(DATA_FILE, header=0)

## getting data

In [0]:
# Keep only the 'URL' and 'Label' columns; the 0: row slice selects rows
# from index label 0 onward.
all_data = data_csv.loc[0:, ['URL', 'Label']] 

In [0]:
# Encode the class label numerically: 1 for "Malicious", -1 for anything else.
# The raw labels are read from data_csv (which this notebook never modifies)
# rather than from all_data, so re-running this cell gives the same result.
# Reading the already-encoded all_data['Label'] back would turn every row
# into -1 on a second run, because 1 != "Malicious".
all_data['Label'] = [1 if i == "Malicious" else -1 for i in data_csv['Label']]

In [0]:
# Rows whose encoded label is 1, i.e. the URLs marked "Malicious".
Malicious_url = all_data.loc[all_data['Label'] == 1]

In [0]:
# Rows whose encoded label is -1, i.e. every URL not marked "Malicious".
Benign_url = all_data.loc[all_data['Label'] == -1]

## split domain and path

In [0]:
def split_function(url):
  """Split every URL in ``url['URL']`` into a domain part and a path part.

  Args:
    url: Any object whose ``'URL'`` item is an iterable of URL strings
      (the notebook passes a pandas DataFrame).

  Returns:
    A ``(domains, paths)`` tuple of two lists with one entry per URL, in
    input order. The path is everything after the first ``/`` that follows
    the domain (query string included) and is ``""`` when there is none.
  """
  temp_domain = []
  temp_path = []
  for address in url['URL']:
    # Split once per URL: "scheme://host/path" -> [scheme:, "", host, path].
    parts = address.split("/", 3)
    if len(parts) >= 3 and parts[1] == "":
      # The empty second piece is the gap between the two slashes of "//".
      domain = parts[2]
      path = parts[3] if len(parts) > 3 else ""
    else:
      # No "//" prefix: read the string as "host/path". Indexing parts[2]
      # here would raise IndexError or pick a path segment as the domain.
      domain, _, path = address.partition("/")
    temp_domain.append(domain)
    temp_path.append(path)
  return temp_domain,temp_path

In [0]:
# Domain and path strings of the benign URLs, as two parallel lists.
Benign_url_domain,Benign_url_path = split_function(Benign_url)

In [0]:
# Domain and path strings of the malicious URLs, as two parallel lists.
Malicious_url_domain,Malicious_url_path = split_function(Malicious_url)

## Count the characters in each word

In [0]:
import collections 
def char_arrangement(input_sentence):
  """Count how often each character occurs across all given strings.

  Args:
    input_sentence: Iterable of strings.

  Returns:
    A ``collections.Counter`` mapping each character to its total number of
    occurrences over every string in ``input_sentence``.
  """
  # Flatten the strings into one stream of characters and let Counter tally it.
  every_character = (symbol for text in input_sentence for symbol in text)
  return collections.Counter(every_character)

In [0]:
# Per-character occurrence counts over all malicious domains and all
# malicious paths, kept as two separate counters.
Malicious_url_domain_count = char_arrangement(Malicious_url_domain)
Malicious_url_path_count = char_arrangement(Malicious_url_path)

In [0]:
# Per-character occurrence counts over all benign domains and all benign
# paths, kept as two separate counters.
Benign_url_domain_count = char_arrangement(Benign_url_domain)
Benign_url_path_count = char_arrangement(Benign_url_path)

## Draw Character Distribution

In [0]:
import numpy as np
import matplotlib.pyplot as plt

def Character_Distribution(input_data):
  """Draw a bar chart of character counts and return the characters.

  Prints the number of distinct characters, shows one bar per character in
  the order given, and labels each bar with its character.

  Args:
    input_data: Iterable of ``(character, count)`` pairs, e.g. the list
      returned by ``collections.Counter.most_common()``.

  Returns:
    List of the characters, in the same order as ``input_data``.
  """
  # Materialise once so a one-shot iterable is not exhausted after the
  # labels have been extracted.
  pairs = list(input_data)
  labels = [character for character, _ in pairs]
  values = [count for _, count in pairs]
  print(len(labels))
  indexes = np.arange(len(labels))
  width = 1

  plt.figure(figsize=(20,10))
  plt.bar(indexes, values, width, align='center')
  # Bars are centred on `indexes`, so the tick labels go at the same
  # positions; offsetting them by half a bar width would put every label
  # on the boundary between two bars.
  plt.xticks(indexes, labels)
  plt.show()

  return labels

In [0]:
# Plot the malicious-domain character counts, most frequent first, and keep
# the characters in that order.
Malicious_domain_labels = Character_Distribution(Malicious_url_domain_count.most_common())

In [0]:
# Plot the benign-domain character counts, most frequent first, and keep
# the characters in that order.
Benign_domain_labels = Character_Distribution(Benign_url_domain_count.most_common())

In [0]:
# Plot the malicious-path character counts, most frequent first, and keep
# the characters in that order.
Malicious_path_labels = Character_Distribution(Malicious_url_path_count.most_common())

In [0]:
# Plot the benign-path character counts, most frequent first, and keep the
# characters in that order.
Benign_path_labels = Character_Distribution(Benign_url_path_count.most_common())

## Define New Dictionary

In [0]:
def new_dict(Benign,Malicious):
  """Build a character -> weight table from two ranked character lists.

  With ``top = max(len(Benign), len(Malicious))``, the character at position
  ``k`` of ``Malicious`` gets ``top - k`` and the character at position ``k``
  of ``Benign`` gets ``k - top``. A character present in both lists receives
  the sum of its two values.

  Args:
    Benign: Sequence of characters ranked for the benign class.
    Malicious: Sequence of characters ranked for the malicious class.

  Returns:
    Dict mapping every character from either list to its combined weight;
    benign characters come first, followed by malicious-only ones.
  """
  top = max(len(Benign), len(Malicious))

  # Negative side: first benign entry is -top, then -top+1, ...
  benign_weight = {symbol: rank - top for rank, symbol in enumerate(Benign)}
  # Positive side: first malicious entry is +top, then top-1, ...
  malicious_weight = {symbol: top - rank for rank, symbol in enumerate(Malicious)}

  combined = dict(benign_weight)
  for symbol, weight in malicious_weight.items():
    combined[symbol] = combined.get(symbol, 0) + weight
  return combined

In [0]:
# Character -> weight table for domains, built from the benign and malicious
# domain character lists returned by the plotting cells.
domain_weight_dict = new_dict(Benign_domain_labels,Malicious_domain_labels)

In [0]:
# Character -> weight table for paths, built from the benign and malicious
# path character lists returned by the plotting cells.
path_weight_dict = new_dict(Benign_path_labels,Malicious_path_labels)

In [0]:
# Number of distinct characters that received a domain weight.
len(domain_weight_dict)

## Redefine URL
### Add Penalty factor


In [0]:
def compute_weight(input_seq,dict_type):
  """Sum the per-character weights of every string in ``input_seq``.

  The result is the raw sum; it is not normalised by string length.

  Args:
    input_seq: Iterable of strings (e.g. domains or paths).
    dict_type: Dict mapping a single character to its numeric weight.

  Returns:
    List with one total per input string, in input order. An empty string
    scores 0.
  """
  # Characters missing from the table contribute 0 instead of raising
  # KeyError, so strings containing characters the table was not built from
  # can still be scored.
  return [sum(dict_type.get(character, 0) for character in word)
          for word in input_seq]

In [0]:
def compute_url(domain,path):
  """Score each URL as its domain weight plus its path weight.

  Args:
    domain: Sequence of domain strings, scored with ``domain_weight_dict``.
    path: Sequence of path strings, scored with ``path_weight_dict``.

  Returns:
    List of combined scores, one per (domain, path) pair; pairing stops at
    the shorter of the two inputs.
  """
  paired_scores = zip(
      compute_weight(domain, domain_weight_dict),
      compute_weight(path, path_weight_dict),
  )
  totals = []
  for domain_score, path_score in paired_scores:
    totals.append(domain_score + path_score)
  return totals

In [0]:
# Combined domain + path score of every malicious URL.
a = compute_url(Malicious_url_domain,Malicious_url_path)

In [0]:
# Mean score of the malicious URLs (raises ZeroDivisionError if `a` is empty).
sum(a)/len(a)

In [0]:
# Combined domain + path score of every benign URL.
b = compute_url(Benign_url_domain,Benign_url_path)

In [0]:
# Mean score of the benign URLs (raises ZeroDivisionError if `b` is empty).
sum(b)/len(b)

## Distribution Analysis
### Overlap between the two classes
