In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load only the first `chunk_size` rows of the HTTP log as a working sample.
# `nrows` reads the same leading rows the first chunk would contain, without
# leaving a chunked reader (and its open file handle) dangling after a `break`.
file_path = '/content/drive/MyDrive/Adi-Cert-4.2/http.csv'
chunk_size = 10000
malicious_http = pd.read_csv(file_path, nrows=chunk_size)

malicious_http.head()

In [None]:
# Pull out the 'content' column of the sampled frame and preview its first rows.
content = malicious_http.loc[:, 'content']
content.head(5)

In [None]:
# Pull out the 'url' column (a Series, despite the `_df` suffix) and preview it.
url_df = malicious_http.loc[:, 'url']
url_df.head(5)

In [None]:
import urllib.parse

def tokenize_url(url):
    """Split a URL into path segments followed by dot-separated netloc labels.

    Parameters
    ----------
    url : str
        The URL to tokenize. Any non-string value (e.g. ``None`` or a NaN
        produced by a missing CSV cell) is treated as "no URL".

    Returns
    -------
    list of str
        Path segments (split on ``/``) followed by netloc labels (split on
        ``.``). Empty segments are dropped, so a URL with no path or no
        netloc contributes no ``''`` tokens.
    """
    # Missing values arrive as float NaN / None; urlparse cannot handle them.
    if not isinstance(url, str):
        return []
    parsed = urllib.parse.urlparse(url)
    # ''.split('/') yields [''], and '//' in a path yields '' segments — filter both.
    path_tokens = [token for token in parsed.path.strip('/').split('/') if token]
    netloc_tokens = [token for token in parsed.netloc.split('.') if token]
    return path_tokens + netloc_tokens

In [None]:
# Run the tokenizer over every URL and store the token lists as a new column.
malicious_http['tokenized_url'] = malicious_http['url'].map(tokenize_url)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each token list into one space-separated string per row, then fit
# the vectorizer on those strings and transform them in a single pass.
# NOTE(review): TfidfVectorizer applies its own default tokenization to these
# strings, so the token boundaries chosen above may not be kept verbatim — confirm
# that is intended.
vectorizer = TfidfVectorizer()
url_documents = malicious_http['tokenized_url'].map(' '.join).tolist()
tfidf_matrix = vectorizer.fit_transform(url_documents)

In [None]:
# Keep a handle on the frame's row index so per-row results can be labelled with it.
original_index = malicious_http.index

In [None]:
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

# Hierarchical clustering of the URL TF-IDF vectors: Ward linkage on the
# pairwise Euclidean distances between the densified rows.
# NOTE: the number of pairwise distances grows quadratically with the row
# count, so this is only practical on a small sample.
distances = pdist(tfidf_matrix.toarray(), metric='euclidean')
linkage_matrix = linkage(distances, method='ward')

# Drawing one leaf per row produced an unreadably dense plot, so show only the
# last `max_leaves` merged clusters. Collapsed branches are labelled with their
# member count in parentheses, which is what `show_leaf_counts` controls.
max_leaves = 30
fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(
    linkage_matrix,
    p=max_leaves,
    truncate_mode='lastp',
    orientation='top',
    labels=original_index.to_numpy(),  # dendrogram documents `labels` as an ndarray
    distance_sort='descending',
    show_leaf_counts=True,
    ax=ax,
)
ax.set_title('Dendrogram')
ax.set_xlabel('Index (or cluster size in parentheses)')
ax.set_ylabel('Distance')
plt.show()

In [None]:
from sklearn.cluster import KMeans
# Partition the URL TF-IDF vectors into 5 clusters.
# `random_state` fixes the centroid initialisation so the cluster labels are
# the same on every run; without it, notes that refer to a specific cluster
# number stop matching after a re-run. `n_init` is set explicitly because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)
# Attach each row's cluster label, aligned on the original row index.
malicious_http['cluster'] = pd.Series(clusters, index=original_index)
malicious_http['cluster']

In [None]:
# Map each cluster label to the list of distinct users that appear in it.
# groupby(..., sort=False) keeps clusters in first-appearance order and
# .unique() keeps users in first-appearance order, matching a row-by-row scan
# while avoiding iterrows() and repeated list-membership checks.
users_by_cluster = malicious_http.groupby('cluster', sort=False)['user'].unique()
user_cluster_dict = {
    cluster: users.tolist() for cluster, users in users_by_cluster.items()
}
print(user_cluster_dict)

In [None]:
# Rebuild the cluster -> users mapping with the cluster labels in ascending order.
sorted_user_cluster_dict = {
    cluster: user_cluster_dict[cluster] for cluster in sorted(user_cluster_dict)
}
print(sorted_user_cluster_dict)

In [None]:
# Load the insider list so each cluster can be checked for an unusual
# concentration of insiders.
insider_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Adi-Cert-4.2/insiders.csv'
)
insider_data.head(5)

In [None]:
# Keep just the 'user' column of the insider table and preview it.
user_insider = insider_data.loc[:, 'user']
user_insider.head(5)

In [None]:
# Count, for every cluster, how many of its users appear in the insider list.
# The insider values are collected into a set once, so each membership check
# is a hash lookup instead of a rescan of the whole insider array.
# NOTE(review): matching is by exact equality of the user values — confirm that
# both CSVs write user identifiers in the same format.
insider_users = set(user_insider)
cluster_insider_count = {
    cluster: sum(user in insider_users for user in users)
    for cluster, users in sorted_user_cluster_dict.items()
}
print(cluster_insider_count)
# Author's observation from an earlier run: cluster 1 held the most insiders.

In [None]:
# Percentage (0-100) of each cluster's users who are also listed as insiders.
cluster_insider_percent = {}
for cluster, users in sorted_user_cluster_dict.items():
    total_users = len(users)
    insider_count = cluster_insider_count[cluster]
    # Scale the ratio by 100 so the stored value really is a percentage, as the
    # variable name says; an empty cluster is reported as 0.0 rather than
    # raising ZeroDivisionError.
    percent = 100 * insider_count / total_users if total_users else 0.0
    cluster_insider_percent[cluster] = percent
print(cluster_insider_percent)
# Author's observation from an earlier run: the share of users who are also
# insiders is very small in every cluster, so this alone supports no conclusion.