## Connecting H2O

In [None]:
import h2o
from IPython import get_ipython
import jupyter
import matplotlib.pyplot as plt
from pylab import rcParams
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from h2o.estimators import H2OGradientBoostingEstimator   

# Start (or attach to) the H2O server; an integer max_mem_size is read as gigabytes.
h2o.init(max_mem_size=2)
# Remove every object currently held by the cluster so the run starts clean.
h2o.remove_all()

## Importing Data

In [None]:
# Load the training CSV into an H2OFrame. It is bound to `domains` because
# every later cell reads and extends the frame under that name.
domains = h2o.import_file("data/training.csv")

## Clean the data set and compute features from the domain names

1. String length
2. Shannon entropy
3. Proportion of vowels
4. Number of substrings that are English words

In [None]:
print('Data cleaning...')
# Keep only the rows whose 'subclass' value is present.
has_subclass = ~domains['subclass'].isna()
domains = domains[has_subclass]

print('Feature: string length')
domains['length'] = domains['domain'].nchar()

print('Feature: Shannon entropy')
domains['entropy'] = domains['domain'].entropy()

print('Feature: proportion of vowels')
# Sum the match counts of each vowel, then divide by the string length.
domains['p_vowels'] = 0
for vowel in ('a', 'e', 'i', 'o', 'u'):
    vowel_hits = domains['domain'].countmatches(vowel)
    domains['p_vowels'] += vowel_hits

domains['p_vowels'] /= domains['length']

print('Feature: count of substrings that are English words')
# The word list is looked up as 'words.txt' in the current working directory.
words_dir = os.path.realpath(os.getcwd())
english_words = os.path.join(words_dir, 'words.txt')
domains['num_words'] = domains['domain'].num_valid_substrings(english_words)

# Check new values

In [None]:
# Print the frame so the feature columns added above can be inspected.
print(domains)

# Add response field

In [None]:
print('\nResponse: Is domain malicious?')

# Any row whose 'class' is not 'legit' is flagged as malicious.
is_malicious = domains['class'] != 'legit'
domains['malicious'] = is_malicious

print(domains)

# Split dataset

In [None]:
# One seeded uniform draw per row decides which side of the split it lands on.
rand = domains.runif(seed=123456)

# Rows drawing at most 0.8 go to training; the rest go to validation.
in_train = rand <= 0.8
in_valid = rand > 0.8
train = domains[in_train]
valid = domains[in_valid]

# Configure and train model

In [None]:
# Imported here so this cell is self-contained: the notebook's import cell
# only brings in the gradient-boosting estimator.
from h2o.estimators import H2OGeneralizedLinearEstimator

print('\nModel: Logistic regression with regularization')
# `family`, `alpha` and the lambda penalty are GLM arguments; the GBM
# estimator does not accept them, so the GLM estimator is used, matching the
# "logistic regression" message above.
# alpha=0 selects a pure ridge (L2) penalty; lambda_ is its strength.
model = H2OGeneralizedLinearEstimator(model_id='MaliciousDomainModel',
                                      family='binomial', alpha=0, lambda_=1e-5)

# Fit on the four engineered features and score against the validation split.
model.train(x=['length', 'entropy', 'p_vowels', 'num_words'],
            y='malicious', training_frame=train, validation_frame=valid)

# Confusion matrix computed on the validation frame.
print(model.confusion_matrix(valid=True))

In [None]:
# Write the enriched frame to a CSV in the current working directory.
export_dir = os.path.realpath(os.getcwd())
path_dataset = os.path.join(export_dir, 'legit-dga_domains_parsed.csv')

h2o.export_file(domains, path_dataset)