In [3]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Load DNS events

In [4]:
data = pd.read_json('data/events.json', lines=True)

dns_events = data[data['event_type'] == 'dns']

print(f'{len(dns_events)} DNS events found')
dns_events.sample(2)

15749 DNS events found


Unnamed: 0,timestamp,flow_id,pcap_cnt,event_type,vlan,src_ip,src_port,dest_ip,dest_port,proto,...,vars,flow,icmp_type,icmp_code,tcp,smtp,email,app_proto_tc,app_proto_ts,stats
458878,2017-07-22T19:29:19.539650-0500,1227837000000000.0,3355823.0,dns,150.0,192.168.207.4,53.0,192.168.205.188,51035.0,UDP,...,,,,,,,,,,
123904,2017-07-22T18:29:17.844364-0500,436577000000000.0,505553.0,dns,150.0,192.168.205.188,45375.0,192.168.207.4,53.0,UDP,...,,,,,,,,,,


### Normalization

In [5]:
dns_df = pd.json_normalize(dns_events.to_dict(orient="records"), sep="_")

print(f'Dataset Shape: {dns_df.shape}')

Dataset Shape: (15749, 35)


### Filtering

In [6]:
dns_df = dns_df[dns_df['dns_rrtype'] == 'A']
unique_domains_df = dns_df.drop_duplicates(subset=['dns_rrname'])

print(f'Found {len(dns_df)} A records')
print(f'Found {dns_df["dns_rrname"].nunique()} unique domains')

Found 2849 A records
Found 177 unique domains


### TLDs

prompt: Write a Python function called `get_tld` that takes a domain name string as input and returns its effective Top-Level Domain (TLD).


In [20]:
def get_tld(domain):
    parts = domain.split('.')
    
    if len(parts) <= 2:
        return domain
    
    common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'ai', 'co']
    
    if parts[-1] in common_tlds:
        return '.'.join(parts[-2:])
    
    country_tlds = ['uk', 'au', 'ca', 'de', 'jp', 'fr', 'es', 'it', 'ru', 'cn', 'br', 'mx']
    
    if parts[-1] in country_tlds and len(parts) >= 3:
        if parts[-2] in ['co', 'com', 'org', 'net', 'ac', 'gov']:
            return '.'.join(parts[-3:])
    
    if len(parts) > 2 and parts[-1] not in common_tlds + country_tlds:
        return parts[-1]
    
    return '.'.join(parts[-2:])

In [21]:
unique_domains_df['domain_tld'] = unique_domains_df['dns_rrname'].apply(get_tld)

# A single list, with unique TLDs.
unique_tlds = pd.DataFrame(unique_domains_df['domain_tld'].unique(), columns=['domain_tld'])

print(f"Unique TLDs: {len(unique_tlds)}")

unique_tlds.to_csv('data/unique_tlds.csv', index=False)


Unique TLDs: 104
