In [3]:
import os
import pandas as pd
import whois
from tqdm import tqdm

### Load Classified TLDs

In [5]:
tld_classified_df = pd.read_csv("data/tld_dga_classification.csv")

### Load Whitelist

In [8]:
data = pd.read_csv("data/top_tld.csv", header=None)
whitelist = set(data[1].str.lower().tolist())

### Domain Expert

In [24]:
def in_whitelist(tld, whitelist):
    """Returns 1 if the TLD is suspicious, 0 otherwise"""
    return True if tld.lower() in whitelist else False

# This are the domains classified as malicious by Gemini
dga_df = tld_classified_df[tld_classified_df['dga_classification'] == 1].copy()

# This tells us if the TLD is in the whitelist
dga_df['whitelist_match'] = dga_df['domain_tld'].apply(lambda x: in_whitelist(x, whitelist))

# We want the intersection of the domains classified as malicious by Gemini and the whitelist.
highly_suspicious_tlds_df = dga_df[dga_df['whitelist_match'] == False]
tlds = highly_suspicious_tlds_df.drop_duplicates(subset=['domain_tld'])


In [26]:
print(f'Final TLDs: {len(tlds)}')
tlds

Final TLDs: 11


Unnamed: 0,domain_tld,dga_classification,whitelist_match
10,110phpmyadmin,1,False
34,wpad,1,False
35,lan,1,False
36,"56""",1,False
40,malwarecity.com,1,False
80,saruman,1,False
81,1201,1,False
96,vtlfccmfxlkgifuf.com,1,False
98,254,1,False
99,ejfodfmfxlkgifuf.xyz,1,False


### Creation Date


In [28]:
def get_creation_date(tld):
    try:
        domain_info = whois.whois(tld)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        return creation_date
    except Exception as e:
        print(f'Could not get creation date for {tld}')
        return None
    
tqdm.pandas(desc="Obteniendo fechas de creación") 
tlds['creation_date'] = tlds['domain_tld'].progress_apply(get_creation_date)

print("\nFinal TLDs, their classification and creation date:")
print(tlds[['domain_tld', 'dga_classification', 'whitelist_match', 'creation_date']])


Obteniendo fechas de creación: 100%|██████████| 11/11 [00:00<00:00, 37571.13it/s]

Could not get creation date for 110phpmyadmin
Could not get creation date for wpad
Could not get creation date for lan
Could not get creation date for 56"
Could not get creation date for malwarecity.com
Could not get creation date for saruman
Could not get creation date for 1201
Could not get creation date for vtlfccmfxlkgifuf.com
Could not get creation date for 254
Could not get creation date for ejfodfmfxlkgifuf.xyz
Could not get creation date for 201:

Final TLDs, their classification and creation date:
               domain_tld  dga_classification  whitelist_match creation_date
10          110phpmyadmin                   1            False          None
34                   wpad                   1            False          None
35                    lan                   1            False          None
36                    56"                   1            False          None
40        malwarecity.com                   1            False          None
80                saruman 


