In [2]:
# By Ismael Valenzuela (@aboutsecurity / @thinkredactblue)


In [1]:
# Begin by importing pandas and numpy, two common libraries for manipulating
# data, and load the DNS Analytics data into a pandas DataFrame.
# A DataFrame is a two-dimensional data structure: a table with rows and
# columns of potentially different types. You can think of it as a spreadsheet,
# a SQL table, or a dict of Series objects. It is generally the most commonly
# used pandas object.

import pandas as pd
import numpy as np
data = pd.read_csv('data.csv')

In [2]:
# The Cisco Umbrella Top 1 Million list must be downloaded and unzipped first:
# http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
# The file has no header row, so the two columns are named here.

top = pd.read_csv(
    'top-1m.csv',
    header=None,
    names=["Num", "Name"],
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the DNS data.
data.describe()

Unnamed: 0,EventId,Result,Message,TaskCategory,ResultCode,MaliciousIP,IndicatorThreatType,Description,Confidence,Severity,ReportReferenceLink,RemoteIPLongitude,RemoteIPLatitude,RemoteIPCountry,_ResourceId
count,30000.0,0.0,0.0,0.0,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,257.0014,,,,0.0181,,,,,,,,,,
std,0.037391,,,,0.226217,,,,,,,,,,
min,257.0,,,,0.0,,,,,,,,,,
25%,257.0,,,,0.0,,,,,,,,,,
50%,257.0,,,,0.0,,,,,,,,,,
75%,257.0,,,,0.0,,,,,,,,,,
max,258.0,,,,3.0,,,,,,,,,,


In [5]:
# Summary statistics for the numeric columns of the Umbrella list.
top.describe()

Unnamed: 0,Num
count,1000000.0
mean,500000.5
std,288675.278932
min,1.0
25%,250000.75
50%,500000.5
75%,750000.25
max,1000000.0


In [6]:
# First reduction step: keep only the DNS queries whose domain name does NOT
# appear in Cisco Umbrella's Top 1 Million list.

reduced = data.loc[~data["Name"].isin(top["Name"])]

In [7]:
# Summary statistics of what is left after removing the Top 1 Million domains.
reduced.describe()

Unnamed: 0,EventId,Result,Message,TaskCategory,ResultCode,MaliciousIP,IndicatorThreatType,Description,Confidence,Severity,ReportReferenceLink,RemoteIPLongitude,RemoteIPLatitude,RemoteIPCountry,_ResourceId
count,9298.0,0.0,0.0,0.0,9298.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,257.00441,,,,0.008819,,,,,,,,,,
std,0.066261,,,,0.132523,,,,,,,,,,
min,257.0,,,,0.0,,,,,,,,,,
25%,257.0,,,,0.0,,,,,,,,,,
50%,257.0,,,,0.0,,,,,,,,,,
75%,257.0,,,,0.0,,,,,,,,,,
max,258.0,,,,2.0,,,,,,,,,,


In [8]:
# Second reduction step: collapse repeated queries so each domain name is kept
# once (the first record seen for a name wins, which is the pandas default).

reduced = reduced.drop_duplicates(subset=['Name'])
reduced.describe()

Unnamed: 0,EventId,Result,Message,TaskCategory,ResultCode,MaliciousIP,IndicatorThreatType,Description,Confidence,Severity,ReportReferenceLink,RemoteIPLongitude,RemoteIPLatitude,RemoteIPCountry,_ResourceId
count,298.0,0.0,0.0,0.0,298.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,257.006711,,,,0.013423,,,,,,,,,,
std,0.081785,,,,0.16357,,,,,,,,,,
min,257.0,,,,0.0,,,,,,,,,,
25%,257.0,,,,0.0,,,,,,,,,,
50%,257.0,,,,0.0,,,,,,,,,,
75%,257.0,,,,0.0,,,,,,,,,,
max,258.0,,,,2.0,,,,,,,,,,


In [10]:
# Look up coordinates for the domains in our reduced dataframe (IP-API returns the latitude and longitude) and plot them on a map

import folium
import json
import urllib.request

# Base URL of the IP-API JSON endpoint; the host to look up is appended to it.
api = "http://ip-api.com/json/"
# NOTE(review): not referenced anywhere in the visible code; presumably a
# time window in seconds - confirm before relying on it.
TIME_PERIOD = 60

# Map object that add_coordinates() attaches its markers to.
COORD_map = folium.Map(
    tiles='cartodbpositron',
    zoom_start=12,
    width=1280,
    height=1080,
)

def add_coordinates(domain, computername):
    """Geolocate ``domain`` through IP-API and pin it on ``COORD_map``.

    The domain is appended to the ``api`` base URL, the JSON reply is decoded
    and one ``domain,lat,lon,status`` line is printed. If the reply's
    ``status`` is ``'success'``, a marker is added to ``COORD_map`` whose popup
    lists country, city, ISP, organisation and the source computer.

    Relies on ``api``, ``COORD_map``, ``folium``, ``json`` and
    ``urllib.request`` being defined by the surrounding cell.

    Args:
        domain: Host name (string) to look up.
        computername: Value shown as "Source" in the popup.

    Returns:
        None. The function prints one line and may add a marker to the map.
    """
    # Function-scope imports keep this definition self-contained.
    from html import escape
    from urllib.parse import quote

    def _text(raw):
        # Everything placed in the popup comes from logs or a remote API,
        # so it is escaped before being embedded in HTML.
        return escape(str(raw))

    # Percent-encode the lookup target so characters such as '/', '?' or '#'
    # cannot change the request path; the context manager closes the socket.
    with urllib.request.urlopen(api + quote(domain, safe="")) as response:
        value = json.loads(response.read())

    status = value.get('status')

    # A failed lookup has no 'lat'/'lon' keys; .get() reports it as None
    # instead of raising KeyError before the status is even checked.
    print(domain + "," + str(value.get('lat')) + "," + str(value.get('lon')) + "," + str(status))

    if status != 'success':
        return

    details = [
        ("Country", value.get('country', '')),
        ("City", value.get('city', '')),
        ("ISP", value.get('isp', '')),
        ("Org", value.get('org', '')),
        ("Source", computername),
    ]
    items = "".join(f"<li>{label}: {_text(text)}</li>" for label, text in details)
    popup_html = (
        f"<font size=1><h1>{_text(domain)}</h1><ul>"
        f"<font size=2>{items}</ul></font></font>"
    )
    iframe = folium.IFrame(html=popup_html, width=250, height=200)

    # Well-formed SVG: attributes are separated by spaces only and the <rect>
    # element is properly terminated.
    marker_svg = (
        '<div><svg>'
        '<circle cx="25" cy="25" r="20" fill="#69b3a2" opacity=".4"/>'
        '<rect x="17" y="17" width="15" height="15" fill="red" opacity=".3"/>'
        '</svg></div>'
    )

    folium.Marker(
        location=[value['lat'], value['lon']],
        popup=folium.Popup(iframe, max_width=2650),
        icon=folium.DivIcon(html=marker_svg),
    ).add_to(COORD_map)

# Plot the first ten rows of the reduced dataframe (each row costs one HTTP
# request to IP-API). head() simply yields fewer rows when the dataframe is
# shorter than ten, where positional .iloc[i] would raise IndexError.

for name, computer in zip(reduced["Name"].head(10), reduced["Computer"].head(10)):
    add_coordinates(name, computer)

# Visualize map 

COORD_map
# COORD_map.save('map.html')

sftp3.dhl.com,50.0805,14.467,success
www.realtyprice.kr,37.536,126.971,success
taxpayerportalb.ird.gov.np,27.7142,85.3145,success
sso.deltacontrols.com,50.7033,-119.2683,success
emailweb.sktelecom.com,35.8723,128.5924,success
ebill.fnucni.co.kr,37.661,126.8324,success
ccl.pku.edu.cn,39.9907,116.313,success
watchtower.sharefile.com,47.6229,-122.337,success
mdpay.fibank.bg,42.6975,23.3242,success
cmedimage.ddns.net,-15.7797,-47.9297,success
