In [1]:
"""
Notebook demonstrating a classifier of domains into 'msm' or 'fake' based on WHOIS info.

requires the following external modules: scikit-learn, numpy, pythonwhois, python-dateutil

Also requires the pickle file (available in this repository): "enriched_whois_features.p"

For more details about constructing feature vectors and evaluating the classifier, see the other notebook: 
"Evaluating Domain Classification with Whois"

"""

import pythonwhois
from collections import defaultdict
from dateutil.parser import parse
import numpy as np
from sklearn import linear_model
import pickle, datetime

In [2]:
#function for getting fully-prefixed string attributes from the json data returned by the pythonwhois library
def get_kv_strings(kprefix, d):
    """Flatten a nested WHOIS-style structure into "prefix : value" strings.

    Args:
        kprefix: Key path accumulated so far (e.g. "" or "/contacts").
        d: The value to flatten. Anything exposing ``keys`` is walked
            recursively, a list yields one string per element, and any other
            value yields a single string.

    Returns:
        A list of strings of the form "<key path> : <str(value)>". Entries
        stored under the key "raw" are skipped. List elements are stringified
        as-is and are not descended into.
    """
    if hasattr(d, "keys"):
        collected = []
        for key, value in d.items():
            if key != "raw":
                # Each nesting level extends the key path with "/<key>".
                collected += get_kv_strings(kprefix + "/" + key, value)
        return collected

    if isinstance(d, list):
        return [kprefix + " : " + str(item) for item in d]

    return [kprefix + " : " + str(d)]

In [3]:
#Enriching the feature vector by adding info about whether the domain is older than 2, 5 or 10 years. Also adds information
#about the day of the week and the hour of day when the domain was created/updated.

# Maps datetime.weekday() values to the weekday names used in feature strings.
days_nums_strings = {0:"Monday", 1:"Tuesday", 2:"Wednesday", 3:"Thursday",4:"Friday", 5:"Saturday", 6:"Sunday"}


def _feature_value(feature):
    """Return the text that follows the first ": " separator of a feature string."""
    return feature[feature.index(": ")+2:]


def _timing_features(moment, weekday_label, event_label):
    """Build the weekday / weekend / business-hours features for one datetime."""
    weekday = moment.weekday()
    hour = moment.hour
    return [
        "/" + weekday_label + " weekday: " + days_nums_strings[weekday],
        "/" + event_label + " on weekend: " + str(weekday in [5,6]),
        # Hours 8..18 inclusive count as typical business hours.
        "/" + event_label + " outside typical business hours: " + str((hour>18 or hour <8)),
    ]


def enrich_vector(fv, reference_year=2018):
    """Return a copy of a feature vector extended with derived features.

    Args:
        fv: A sequence whose first element is a ``(domain_name, label)`` pair
            and whose remaining elements are feature strings such as
            "/creation_date : 1993-09-22 04:00:00".
        reference_year: Year against which domain age is measured. A domain is
            "older than N years" when its creation year is below
            ``reference_year - N``. The default of 2018 reproduces the
            previously hard-coded cut-offs (2016, 2013, 2008).
            NOTE(review): presumably the bundled training data was built with
            these cut-offs - confirm before passing a different year to
            vectors that will be mixed with it.

    Returns:
        A new list: the original elements of ``fv`` followed by the derived
        features, de-duplicated and sorted. Derived features are:
        the lower-cased domain name; creation year, age buckets and
        weekday/weekend/business-hours flags for "/creation_date" entries;
        weekday/weekend/business-hours flags for "/updated_date" entries;
        the upper-cased last two labels of each "/nameservers" entry; and the
        part after the first "@" of any entry containing one.

    Raises:
        ValueError: If a date or nameserver entry has no ": " separator;
            date values that cannot be parsed propagate the parser's error.
    """
    # The domain name itself is not part of the WHOIS features, so add it.
    newv = ["/domain_name: " + fv[0][0].lower().strip()]

    for val in fv[1:]:
        val = val.lower()

        if val.startswith("/creation_date"):
            created = parse(_feature_value(val))
            year = int(created.year)
            newv.append("/creation_year: " + str(year))
            for age in (2, 5, 10):
                newv.append("/domain older than " + str(age) + " years: "
                            + str(year < reference_year - age))
            newv.extend(_timing_features(created, "creation", "created"))

        if val.startswith("/updated_date"):
            updated = parse(_feature_value(val))
            newv.extend(_timing_features(updated, "update", "updated"))

        if val.startswith("/nameservers"):
            # Keep only the last two labels, e.g. ns-47.awsdns-05.com -> AWSDNS-05.COM
            labels = _feature_value(val).split(".")
            newv.append("/nameserver_domain: " + ".".join(labels[-2:]).upper())

        if val.find("@")>-1:
            # Any entry containing an e-mail address contributes its domain part.
            newv.append("/email_domain: " + val[val.index("@")+1:])

    # Nameserver normalization in particular produces duplicates; drop them.
    newv = sorted(set(newv))
    ret = list(fv)
    ret.extend(newv)
    return ret

In [4]:
def select_dimensions(instances):
    """Collect the sorted list of distinct features found across all instances.

    Args:
        instances: A sequence of instances; element 0 of each instance is
            skipped and the remaining elements are treated as features.

    Returns:
        A sorted list containing every distinct feature exactly once.

    Side effects:
        Prints the number of instances and the number of distinct features.
    """
    distinct_features = set()
    for inst in instances:
        # Position 0 holds the instance header, not a feature.
        distinct_features.update(inst[1:])

    print("\n\nNumber of instances: ", len(instances))
    print("Total number of possible features: ", len(distinct_features))
    return sorted(distinct_features)



In [5]:
def insts_to_binary_vecs(insts, dimensions):
    """Encode instances as rows of a 0/1 feature-presence matrix.

    Args:
        insts: A sequence of instances; element 0 of each instance is skipped
            and the remaining elements are its features.
        dimensions: The ordered features that define the matrix columns.

    Returns:
        A numpy array of dtype 'uint' and shape
        ``(len(insts), len(dimensions))`` where entry ``[i, j]`` is 1 when
        ``dimensions[j]`` is among the features of ``insts[i]`` and 0 otherwise.
    """
    vecs = np.zeros(shape = [len(insts), len(dimensions)], dtype='uint')
    for i, inst in enumerate(insts):
        features = inst[1:]
        try:
            # Build the lookup once per instance instead of re-slicing and
            # scanning the feature list for every single dimension.
            present = set(features)
        except TypeError:
            # Unhashable features: keep the plain containment test.
            present = features
        vecs[i] = [1 if d in present else 0 for d in dimensions]
    return vecs


In [6]:

def create_enriched_vec_for_domain(dimensions, domain_name, label):
    """Look up a domain's WHOIS data and encode it as a binary feature vector.

    Performs two live WHOIS queries (the default lookup and one against
    whois.iana.org), flattens both results into feature strings, enriches
    them with ``enrich_vector`` and encodes them against ``dimensions``.

    Args:
        dimensions: Ordered features defining the columns of the result.
        domain_name: The domain to look up. Surrounding whitespace is removed
            before both lookups.
        label: Label stored alongside the domain name in the instance header
            (e.g. "msm", "fake", or "" when unknown).

    Returns:
        The array produced by ``insts_to_binary_vecs`` for this single
        instance, i.e. one row with ``len(dimensions)`` columns.
    """
    # Normalize once so both lookups query exactly the same name.
    lookup_name = domain_name.strip()
    w = pythonwhois.get_whois(lookup_name)
    w_iana = pythonwhois.get_whois(lookup_name, server='whois.iana.org')

    # Merge both results, dropping duplicate feature strings.
    vec = sorted(set(get_kv_strings("", w) + get_kv_strings("", w_iana)))
    vec.insert(0, (domain_name, label))

    enriched_v = enrich_vector(vec)
    return insts_to_binary_vecs([enriched_v], dimensions)
    

In [7]:

#get a trained classifier starting from a list of enriched vectors:
def get_trained_model(instances):
    """Train a logistic-regression classifier from enriched instances.

    Args:
        instances: A sequence of instances whose first element is a
            ``(domain_name, label)`` pair and whose remaining elements are
            feature strings.

    Returns:
        A ``(model, dimensions)`` tuple: the fitted
        ``LogisticRegression(C=1e5)`` and the feature list from
        ``select_dimensions`` that defines the model's input columns.

    Side effects:
        Prints the statistics emitted by ``select_dimensions`` and the
        training duration.
    """
    dimensions = select_dimensions(instances)

    # Row i of the matrix and entry i of the labels describe the same
    # instance, so they can be handed to fit() directly.
    x_train = insts_to_binary_vecs(instances, dimensions)
    y_train = [inst[0][1] for inst in instances]

    m = linear_model.LogisticRegression(C=1e5)
    t1 = datetime.datetime.now()
    m.fit(x_train, y_train)
    t2 = datetime.datetime.now()
    print("Training the model finished in:", t2-t1)
    return m, dimensions


In [8]:
#take a list of new labeled domains and add that to the training
def _enriched_instance_for_domain(domain_name, label):
    """Fetch WHOIS data for a domain and return its labeled, enriched feature list."""
    lookup_name = domain_name.strip()
    w = pythonwhois.get_whois(lookup_name)
    w_iana = pythonwhois.get_whois(lookup_name, server='whois.iana.org')
    features = sorted(set(get_kv_strings("", w) + get_kv_strings("", w_iana)))
    return enrich_vector([(domain_name, label)] + features)


def update_training(dimensions, existing_instances, list_labeled_domains):
    """Add newly labeled domains to the training set and retrain the classifier.

    Args:
        dimensions: The feature list the caller uses for predictions. New
            instances are restricted to these features so that the returned
            classifier keeps the same input columns.
            NOTE(review): this assumes ``dimensions`` was derived from
            ``existing_instances`` (as ``get_trained_model`` does) - confirm
            for other callers.
        existing_instances: The current training instances. This list is
            extended IN PLACE with the new instances.
        list_labeled_domains: Iterable of ``(domain_name, label)`` pairs where
            ``label`` is "fake" or "msm".

    Returns:
        The classifier retrained on the extended instance list.

    Raises:
        ValueError: If any entry is not a 2-element pair with a valid label.
            Validation happens before any WHOIS lookup, so
            ``existing_instances`` is left untouched in that case.
    """
    valid_labels = {"fake", "msm"}
    for pair in list_labeled_domains:
        if not len(pair) == 2 or not pair[1] in valid_labels:
            raise ValueError("Error: each instance must be a pair of a domain name followed by the label 'fake' or 'msm'.")

    known_features = set(dimensions)
    new_instances = []
    for domain_name, label in list_labeled_domains:
        enriched = _enriched_instance_for_domain(domain_name, label)
        # Store a labeled feature list (not a binary vector): retraining reads
        # the label from the (domain, label) header at position 0.
        kept = [feature for feature in enriched[1:] if feature in known_features]
        new_instances.append([enriched[0]] + kept)

    existing_instances.extend(new_instances)
    updated_classifier = get_trained_model(existing_instances)[0]
    return updated_classifier



In [9]:
def get_prediction(model, dimensions, domain_name):
    """Predict the label of a domain from its live WHOIS information.

    Args:
        model: A fitted classifier exposing ``predict``.
        dimensions: The feature list used to encode the domain.
        domain_name: The domain to classify.

    Returns:
        Whatever ``model.predict`` returns for the single encoded domain.
    """
    # The label slot is passed as "" because it is unknown at prediction time.
    encoded = create_enriched_vec_for_domain(dimensions, domain_name, "")
    return model.predict(encoded)

In [10]:
#loading the training instances from the pickle file:
# NOTE: unpickling can execute arbitrary code - only load a trusted copy of this file.
with open("enriched_whois_features.p", "rb") as pickle_file:
    # The context manager closes the file handle once loading is done.
    instances = pickle.load(pickle_file)
print("Training loaded. Number of training instances: ", len(instances))

Training loaded. Number of training instances:  700


In [12]:
#Examining one training instance: a (domain, label) pair followed by its feature strings.
instances[305]

[('cnn.com', 'msm'),
 '/contacts/admin : None',
 '/contacts/billing : None',
 '/contacts/registrant : None',
 '/contacts/tech : None',
 '/creation_date : 1993-09-22 04:00:00',
 '/emails : domainabuse@cscglobal.com',
 '/expiration_date : 2018-09-21 04:00:00',
 '/id : 3269879_DOMAIN_COM-VRSN',
 '/nameservers : NS-1086.AWSDNS-07.ORG',
 '/nameservers : NS-1630.AWSDNS-11.CO.UK',
 '/nameservers : NS-47.AWSDNS-05.COM',
 '/nameservers : NS-576.AWSDNS-08.NET',
 '/registrar : CSC Corporate Domains, Inc.',
 '/status : clientTransferProhibited https://icann.org/epp#clientTransferProhibited',
 '/status : serverDeleteProhibited https://icann.org/epp#serverDeleteProhibited',
 '/status : serverTransferProhibited https://icann.org/epp#serverTransferProhibited',
 '/status : serverUpdateProhibited https://icann.org/epp#serverUpdateProhibited',
 '/updated_date : 2017-02-15 17:31:58',
 '/whois_server : whois.corporatedomains.com',
 '/created on weekend: False',
 '/created outside typical business hours: Tr

In [13]:
#training the classifier; `dimensions` is the feature list needed later to encode domains for prediction:
classifier, dimensions = get_trained_model(instances)



Number of instances:  700
Total number of possible features:  5494
Training the model finished in: 0:00:00.658149


In [14]:
#example of how to get a prediction from the classifier (this performs live WHOIS lookups for the domain)
print(get_prediction(classifier, dimensions, "usatoday.com"))


['msm']


In [15]:
#second prediction example, for a different domain
print(get_prediction(classifier, dimensions, "worldtruth.tv"))

['fake']


In [16]:
#Example of how to add new training instances: each entry is a (domain name, label) pair.
#Note: update_training extends `instances` in place, so re-running this cell adds the same domains again.

new_instances  = [("ladiesofliberty.net", "fake"), ("theatlantic.com", "msm")]
classifier = update_training(dimensions, instances, new_instances)



Number of instances:  702
Total number of possible features:  5494


  """


Training the model finished in: 0:00:00.637599
