In [1]:
import datetime
import pandas as pd
import whois
import multiprocessing
import csv
import time

In [2]:
def transform_class(x):
    """Encode a class label as an integer.

    Returns 1 when ``x`` equals the string ``'good'`` and 0 for any other
    value.
    """
    if x == 'good':
        return 1
    return 0


def perform_whois(url):
    """Run a WHOIS lookup for ``url``.

    Returns whatever ``whois.whois(url)`` returns, or ``False`` when the
    lookup raises any ``Exception``.
    """
    try:
        lookup = whois.whois(url)
    except Exception:
        # Best-effort lookup: every failure is reported to the caller as
        # False so that one bad URL does not abort a whole batch.
        return False
    return lookup


def get_registered_date_in_days(whois_result):
    """Return the number of whole days since the domain was created.

    Args:
        whois_result: A WHOIS result exposing a ``creation_date`` attribute,
            or a falsy value (e.g. ``False``) when the lookup failed.

    Returns:
        ``(now - creation_date).days`` as an int, or ``-1`` when the lookup
        failed or no usable creation date is available (missing, a string,
        an empty list, or a list whose first entry is not a datetime).
    """
    if not whois_result:
        return -1

    created_date = whois_result.creation_date
    if isinstance(created_date, list):
        # Several dates may be reported; use the first one, as before.
        if not created_date:
            return -1
        created_date = created_date[0]

    # Covers None, unparsed strings and any other non-datetime value.
    if not isinstance(created_date, datetime.datetime):
        return -1

    # Naive and aware datetimes cannot be subtracted from each other, so
    # pick a "now" of the same kind as the parsed date.
    if created_date.utcoffset() is not None:
        today_date = datetime.datetime.now(datetime.timezone.utc)
    else:
        today_date = datetime.datetime.now()
    return (today_date - created_date).days


def get_expiration_date_in_days(whois_result):
    """Return the number of whole days until the domain expires.

    Args:
        whois_result: A WHOIS result exposing an ``expiration_date``
            attribute, or a falsy value (e.g. ``False``) when the lookup
            failed.

    Returns:
        ``(expiration_date - now).days`` as an int, or ``-1`` when the
        lookup failed or no usable expiration date is available (missing, a
        string, an empty list, or a list whose first entry is not a
        datetime). The result is negative for a date that is already in the
        past, so ``-1`` can also mean "expired less than a day ago".
    """
    if not whois_result:
        return -1

    expiration_date = whois_result.expiration_date
    if isinstance(expiration_date, list):
        # Several dates may be reported; use the first one, as before.
        if not expiration_date:
            return -1
        expiration_date = expiration_date[0]

    # Covers None, unparsed strings (also inside a list) and any other
    # non-datetime value, which previously raised TypeError on subtraction.
    if not isinstance(expiration_date, datetime.datetime):
        return -1

    # Naive and aware datetimes cannot be subtracted from each other, so
    # pick a "now" of the same kind as the parsed date.
    if expiration_date.utcoffset() is not None:
        today_date = datetime.datetime.now(datetime.timezone.utc)
    else:
        today_date = datetime.datetime.now()
    return (expiration_date - today_date).days


def get_updated_date_in_days(whois_result):
    """Return the number of whole days since the domain record was updated.

    Args:
        whois_result: A WHOIS result exposing an ``updated_date`` attribute,
            or a falsy value (e.g. ``False``) when the lookup failed.

    Returns:
        ``(now - updated_date).days`` as an int, or ``-1`` when the lookup
        failed or no usable update date is available (missing, a string, an
        empty list, or a list whose first entry is not a datetime).
    """
    if not whois_result:
        return -1

    updated_date = whois_result.updated_date
    if isinstance(updated_date, list):
        # Several dates may be reported; use the first one, as before.
        if not updated_date:
            return -1
        updated_date = updated_date[0]

    # Covers None, unparsed strings (also inside a list) and any other
    # non-datetime value, which previously raised TypeError on subtraction.
    if not isinstance(updated_date, datetime.datetime):
        return -1

    # Naive and aware datetimes cannot be subtracted from each other, so
    # pick a "now" of the same kind as the parsed date.
    if updated_date.utcoffset() is not None:
        today_date = datetime.datetime.now(datetime.timezone.utc)
    else:
        today_date = datetime.datetime.now()
    return (today_date - updated_date).days


def return_whois_data(urls, return_dict, thread_id):
    """Look up every URL in ``urls`` and store its date features.

    For each URL one WHOIS lookup is performed and the tuple
    ``(registered_days, expiration_days, updated_days)`` is written to
    ``return_dict[url]``. A progress line containing ``thread_id`` and the
    zero-based position of the URL is printed after each lookup.

    Args:
        urls: Iterable of URLs to process.
        return_dict: Mapping that receives one entry per URL.
        thread_id: Identifier of this worker, used only in progress output.
    """
    for position, url in enumerate(urls):
        record = perform_whois(url)
        return_dict[url] = (
            get_registered_date_in_days(record),
            get_expiration_date_in_days(record),
            get_updated_date_in_days(record),
        )
        print("ID: " + str(thread_id) + " - " + str(position))


def chunks(data_list, n):
    """Split ``data_list`` into consecutive slices of at most ``n`` items.

    The slices are returned in order as a list; the last one may be shorter
    than ``n``. An empty ``data_list`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(data_list), n):
        pieces.append(data_list[start:start + n])
    return pieces

In [None]:
if __name__ == "__main__":
    # Driver: read the URL list, fan the WHOIS lookups out over worker
    # processes, then append the collected features to a CSV file.
    # The guard keeps child processes that re-import this module (the
    # "spawn" start method) from re-running the whole job.
    file = 'data.csv'
    df = pd.read_csv(file, converters={'label': transform_class})
    df.drop_duplicates(subset=None, inplace=True)
    # First column of the frame; presumably the URLs - verify against data.csv.
    x = df.values[:, 0]
    old_time = time.time()

    cpu_total = multiprocessing.cpu_count()
    print(cpu_total)
    # Ceiling division keeps the number of chunks (and so processes) at or
    # below the CPU count; the clamp to 1 avoids a zero chunk size, which
    # makes range() raise ValueError when there are fewer rows than CPUs.
    chunk_size = max(1, -(-len(x) // cpu_total))
    slices = chunks(x, chunk_size)
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    # Header row, stored as an ordinary entry before any worker starts.
    return_dict['url'] = ('rd', 'ed', 'ud')
    jobs = []
    for i, s in enumerate(slices):
        j = multiprocessing.Process(target=return_whois_data, args=(s, return_dict, i))
        jobs.append(j)
        j.start()

    for j in jobs:
        j.join()

    print(return_dict)
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files passed to csv.writer.
    with open('whois_data_tmp.csv', 'a', encoding='utf-8', newline='') as out_file:
        writer = csv.writer(out_file)
        for k, v in return_dict.items():
            writer.writerow([k, v[0], v[1], v[2]])

    new_time = time.time()
    print(new_time - old_time)
    print("Done")

12
