In [1]:
import os
import pandas as pd
import whois
from tqdm import tqdm

### Load Classified TLDs

In [2]:
tld_classified_df = pd.read_csv("data/tld_dga_classification.csv")

### Load Whitelist

In [18]:
data = pd.read_csv("data/top_tld.csv", header=None)
whitelist = set(data[1].str.lower().tolist())

df = pd.DataFrame(whitelist)
df[0].tolist()


['waws-prod-blu-079.sip.azurewebsites.windows.net',
 'adhawk-marketplace-assets.s3-us-west-1.amazonaws.com',
 'auth.aliyundrive.com',
 'ibytedtos.com.www.tendawifi.com',
 'pus8-pods-eap.officeapps.live.com.wac-0003.wac-dc-msedge.net.wac-0003.wac-msedge.net',
 'p167-fmipmobile.icloud.com',
 'ringapigw.prod.gws.ring.amazon.dev',
 'rog-content-platform.asus.com.cn',
 'broker4-1.bue1.prod.zpath.net',
 'covoiturage-trieves.lan.pad.squareup.com',
 'snackvideo.com',
 'chancejs.com',
 'e5724.a.akamaiedge.net',
 'bbvaempresas.mx',
 'cn.hdlvcloud.ks-cdn.com',
 'luckylabs.io',
 'config-route.eu-1.smooch.io',
 'e6001.x.akamaiedge.net',
 'artifacts.opensearch.org',
 'onebarilla.service.signalr.net',
 'www.satelliteviewmaps.net',
 'customerservice2.southerncompany.com',
 'tts.api.citi.com',
 'r2---sn-bg5oqxjvh-50nz.gvt1.com',
 'earthsky.org',
 'member.neteller.com',
 'rr4---sn-3u-bh2sl.googlevideo.com',
 'thirdparty-eu.heiway.gtmobs.equant.com',
 'www.remodelaholic.com',
 'gvessm.com',
 'www.cryptop

### Domain Expert

prompt: Write a Python function called in_whitelist that takes a TLD string and a whitelist collection as parameters. The function should return 0 if the lowercase TLD exists as a substring in any lowercase domain in the whitelist, and 1 otherwise. 

In [35]:
def in_whitelist(tld, whitelist):
    """Returns 0 if the TLD is in whitelist, 1 if suspicious"""
    tld = tld.lower()
    
    return 0 if any(tld in domain.lower() for domain in whitelist) else 1


# This are the domains classified as malicious by Gemini
dga_df = tld_classified_df[tld_classified_df['dga_classification'] == 1].copy()


# This tells us if the TLD is in the whitelist
dga_df['whitelist_match'] = dga_df['domain_tld'].apply(lambda x: in_whitelist(x, whitelist))

dga_df


Unnamed: 0,domain_tld,dga_classification,whitelist_match
10,110phpmyadmin,1,1
34,wpad,1,0
35,lan,1,0
36,"56""",1,1
40,malwarecity.com,1,1
80,saruman,1,1
81,1201,1,0
96,vtlfccmfxlkgifuf.com,1,1
98,254,1,0
99,ejfodfmfxlkgifuf.xyz,1,1


In [36]:
# We want the domains that are not in the whitelist, and are classified as malicious,
# the intersection between the two sets is the set of suspicious TLDs.
highly_suspicious_tlds_df = dga_df[dga_df['whitelist_match'] == 1]
tlds = highly_suspicious_tlds_df.drop_duplicates(subset=['domain_tld'])

tlds

Unnamed: 0,domain_tld,dga_classification,whitelist_match
10,110phpmyadmin,1,1
36,"56""",1,1
40,malwarecity.com,1,1
80,saruman,1,1
96,vtlfccmfxlkgifuf.com,1,1
99,ejfodfmfxlkgifuf.xyz,1,1
100,201:,1,1


### Creation Date


prompt: Write a Python function called get_creation_date that takes a TLD string as input and returns its creation date.

In [38]:
def get_creation_date(tld):
    try:
        domain_info = whois.whois(tld)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        return creation_date
    except Exception as e:
        print(f'Could not get creation date for {tld}')
        return None
    
tqdm.pandas(desc="Getting creation dates")
tlds['creation_date'] = tlds['domain_tld'].progress_apply(get_creation_date)

print("\nFinal TLDs, their classification and creation date:")
print(tlds[['domain_tld', 'dga_classification', 'whitelist_match', 'creation_date']])


Getting creation dates: 100%|██████████| 7/7 [00:00<00:00, 25376.08it/s]

Could not get creation date for 110phpmyadmin
Could not get creation date for 56"
Could not get creation date for malwarecity.com
Could not get creation date for saruman
Could not get creation date for vtlfccmfxlkgifuf.com
Could not get creation date for ejfodfmfxlkgifuf.xyz
Could not get creation date for 201:

Final TLDs, their classification and creation date:
               domain_tld  dga_classification  whitelist_match creation_date
10          110phpmyadmin                   1                1          None
36                    56"                   1                1          None
40        malwarecity.com                   1                1          None
80                saruman                   1                1          None
96   vtlfccmfxlkgifuf.com                   1                1          None
99   ejfodfmfxlkgifuf.xyz                   1                1          None
100                  201:                   1                1          None



