In [1]:
import collections
import math
import os
import pickle
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from six.moves import xrange

In [2]:
def save2file(filename, data, directory="C:/Users/Akarsh/Downloads/DP_scripts/store_emb/"):
    """Pickle `data` to `<directory>/<filename>.pickle`.

    Args:
        filename: Base name of the target file, without the ".pickle" suffix.
        data: Any picklable object.
        directory: Folder that receives the file. Defaults to the location
            this notebook has always used, so existing two-argument calls
            behave as before.

    Raises:
        OSError: If the target file cannot be opened for writing.
    """
    path = os.path.join(directory, filename + ".pickle")
    # The context manager closes the handle even when pickle.dump raises,
    # so a failed dump no longer leaks an open file.
    with open(path, "wb") as pickle_out:
        pickle.dump(data, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

def loadfile(filename, directory="C:/Users/Akarsh/Downloads/DP_scripts/store_emb/"):
    """Load and return the object pickled in `<directory>/<filename>.pickle`.

    Args:
        filename: Base name of the file to read, without the ".pickle" suffix.
        directory: Folder that holds the file. Defaults to the location this
            notebook has always used, so existing one-argument calls behave
            as before.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    path = os.path.join(directory, filename + ".pickle")
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only use this on pickles produced by this project, never on untrusted input.
    # The context manager guarantees the handle is closed (the original never closed it).
    with open(path, "rb") as pickle_in:
        return pickle.load(pickle_in)

# Run identifier: it is appended to the names of the pickles this notebook
# reads ("ip2vec_train_<id>") and writes ("ip2vec_emb_<id>").
main_str = str(437)

# Load the pickled training frame for this run and show its dimensions.
filename = "ip2vec_train_" + main_str
ip2vec_train = loadfile(filename)
print(ip2vec_train.shape)

(260391, 8)


In [3]:
print("building dataset...")
# Module-level placeholder; build_dataset() overwrites it (via `global`) with
# the number of comma-separated fields found in the first row.
num_elems = -1
def build_dataset(ip2vec_train):
    """Turn a DataFrame of flow attributes into an integer-encoded token list.

    The frame is serialised with `to_csv(header=False, index=False)` and each
    non-empty line is split on "," so that every cell becomes a stripped text
    token. Tokens get integer ids in descending frequency order (id 0 is the
    most frequent token).

    NOTE(review): the split is a plain `str.split(",")`, so a cell whose text
    itself contains a comma would be broken into several tokens. Confirm the
    input never contains such values.

    Side effect:
        Sets the module-level `num_elems` to the number of fields in the
        first row (0 when the frame has no rows).

    Args:
        ip2vec_train: pandas DataFrame, one row per flow.

    Returns:
        Tuple `(data, num_lines, w2v, v2w, vocab_size)`:
            data: flat list of token ids, row after row.
            num_lines: number of non-empty rows.
            w2v: dict mapping token text to id.
            v2w: dict mapping id to token text.
            vocab_size: number of distinct tokens.
        An empty frame yields `([], 0, {}, {}, 0)` instead of the
        IndexError the previous implementation raised.
    """
    global num_elems # keep count of attributes for each flow

    csv_text = ip2vec_train.to_csv(header=False, index=False)
    lines = [line for line in csv_text.splitlines() if line]
    num_lines = len(lines)
    if num_lines == 0:
        # Nothing to encode: report zero fields rather than indexing lines[0].
        num_elems = 0
        return [], 0, {}, {}, 0
    num_elems = len(lines[0].split(","))

    # Flatten all rows into one token stream, preserving row and column order.
    tokens = [word.strip() for line in lines for word in line.split(",")]

    # most_common() sorts by descending count, so frequent tokens get low ids.
    ranked = collections.Counter(tokens).most_common()
    w2v = {word: position for position, (word, _) in enumerate(ranked)}
    v2w = {position: word for word, position in w2v.items()}
    vocab_size = len(w2v)

    # Every token is in w2v by construction, so a direct lookup is safe. The
    # old `if word in w2v` guard would have silently reused a stale index.
    data = [w2v[word] for word in tokens]
    return data, num_lines, w2v, v2w, vocab_size

# Encode the training frame; the returned names are module-level state that
# the batching and training cells read.
data, num_lines, w2v, v2w, vocab_size = build_dataset(ip2vec_train)
print("vocab size: ", vocab_size)

# Drop the DataFrame reference; the later cells shown here work from the
# integer-encoded `data` list instead.
del ip2vec_train

building dataset...
vocab size:  58863


In [4]:
# hyperparameters
batch_size = 128       # flows consumed per call to generate_batch()
embedding_size = 20    # width of each embedding vector
num_sampled = 32       # value passed as num_sampled to tf.nn.nce_loss
num_epochs = 500       # multiplier used when computing the number of training steps

# Batch-generation state shared with generate_batch() through globals.
data_index = 0
c_iter = 0
pairs = 13             # (input, label) pairs emitted for every flow
training_pairs = pairs * batch_size

# Row order used for sampling; generate_batch() reshuffles it on wrap-around.
# NOTE(review): this covers rows 0 .. num_lines-2, so the last row is never
# part of the list - confirm that is intended.
idx = list(range(num_lines - 1))

def generate_batch():
    """Build one training batch of (input id, label id) pairs.

    For each of `batch_size` consecutive positions in `idx`, the flow starting
    at `idx[c_iter] * num_elems` in the flat `data` list contributes 13 pairs,
    so `pairs` must be 13. The pairs are written to consecutive slots of the
    output arrays.

    Reads the globals `data`, `idx`, `num_lines`, `batch_size`, `pairs` and
    `training_pairs`. Updates the globals `c_iter` and `data_index`. When
    `c_iter` reaches `num_lines - 1` it is reset to 0 and `idx` is shuffled
    in place.

    Returns:
        batch: int32 array of shape (training_pairs,) holding input token ids.
        labels: int32 array of shape (training_pairs, 1) holding label token ids.
    """
    global data_index, num_elems, c_iter

    # (input offset, label offset) inside one flow record, in emission order.
    # The field names in the comments are carried over from the original code.
    offset_pairs = (
        (0, 1), (0, 2), (0, 4),   # input SrcIP
        (2, 0), (2, 4), (2, 3),   # input DstIP
        (1, 0),                   # input srcPt
        (3, 2),                   # input dstPt
        (7, 5),                   # input dur
        (6, 5), (6, 7),           # input byt
        (5, 6), (5, 7),           # input packets
    )

    batch = np.empty(training_pairs, dtype=np.int32)
    labels = np.empty((training_pairs, 1), dtype=np.int32)
    data_index = idx[c_iter] * num_elems

    for flow_no in range(batch_size):
        first_slot = flow_no * pairs
        for slot, (input_off, label_off) in enumerate(offset_pairs, first_slot):
            batch[slot] = data[data_index + input_off]
            labels[slot, 0] = data[data_index + label_off]

        # Advance to the next flow; wrap around and reshuffle at the end of idx.
        c_iter += 1
        if c_iter == num_lines - 1:
            c_iter = 0
            random.shuffle(idx)
        data_index = idx[c_iter] * num_elems
    return batch, labels

In [5]:
print("building tensorflow graph...")
graph = tf.Graph()

# Everything below is added to `graph` (TF1 graph mode); nothing runs until a
# Session executes it.
with graph.as_default(): 
    # Feeds for one batch: `training_pairs` input ids and one label id per input,
    # matching the shapes returned by generate_batch().
    train_inputs = tf.placeholder(tf.int32,shape=[training_pairs])
    train_labels = tf.placeholder(tf.int32,shape=[training_pairs,1])

    # Pin the embedding table, loss and optimizer to the CPU.
    with tf.device('/cpu:0'): 
        # One trainable row of `embedding_size` floats per vocabulary entry,
        # initialised uniformly between -1.0 and 1.0.
        embeddings = tf.Variable(tf.random_uniform([vocab_size,embedding_size],-1.0,1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Output-side weights and biases consumed by tf.nn.nce_loss.
        nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size],stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocab_size]))
        # Mean NCE loss over the batch, with `num_sampled` sampled classes.
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels, 
                                             inputs=embed, num_sampled=num_sampled, num_classes=vocab_size))

        # Plain SGD with a fixed learning rate of 0.05.
        optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
        
        # Row-wise L2 norm and unit-length embeddings.
        # NOTE(review): `normalized_embeddings` is not read by the other cells
        # shown here; the training cell exports `embeddings` instead.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        init = tf.global_variables_initializer()

building tensorflow graph...
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [6]:
# Total optimisation steps: one step consumes `batch_size` flows, repeated for
# `num_epochs` passes over the `num_lines` flows.
num_steps = int(num_lines / batch_size * num_epochs)
print("training steps: ", num_steps)

report_every = 20000  # steps between progress reports

with tf.Session(graph=graph) as session:
    init.run()

    # `average_loss` always holds the mean loss of the most recently completed
    # window. Previously it was reset to 0 after each report and left as a raw
    # partial sum when the loop ended, so any later reader of the variable
    # (for example the hyper-parameter log) saw a sum, not an average.
    average_loss = 0
    window_loss = 0
    window_steps = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch()
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        window_loss += loss_val
        window_steps += 1

        if step % report_every == 0:
            # At step 0 the window holds a single loss value; afterwards it
            # holds exactly `report_every` values.
            average_loss = window_loss / window_steps
            print("Average loss at step", step, ": ", average_loss, " from ", num_steps ," steps.")
            window_loss = 0
            window_steps = 0

    # Fold the trailing, partially filled window into a proper mean.
    if window_steps > 0:
        average_loss = window_loss / window_steps
    print('training finished...', '\n')

    # Save embeddings
    to_save_n = session.run(embeddings)
    # Column-wise min-max scaling to [0, 1]. np.ptp replaces ndarray.ptp,
    # which was removed in NumPy 2.0.
    to_save = (to_save_n - to_save_n.min(0)) / np.ptp(to_save_n, axis=0)
    # After min-max scaling each column maximum is already 1, so this division
    # leaves the values unchanged; it is kept so `to_save_norm` stays defined.
    to_save_norm = to_save / to_save.max(axis=0)

    final_emb = pd.DataFrame(data=to_save_norm)
    # Row u of the embedding matrix belongs to the token with id u.
    vals = [v2w.get(u) for u in range(len(to_save))]
    final_emb['values'] = vals
    print('saving embeddings...')

    filename = "ip2vec_emb_" + main_str
    save2file(filename, final_emb)

training steps:  1017152
Average loss at step 0 :  136.0756378173828  from  1017152  steps.
Average loss at step 20000 :  65.17678718366624  from  1017152  steps.
Average loss at step 40000 :  31.80580021824837  from  1017152  steps.
Average loss at step 60000 :  24.438414533948897  from  1017152  steps.
Average loss at step 80000 :  20.697956525397302  from  1017152  steps.
Average loss at step 100000 :  18.39078493566513  from  1017152  steps.
Average loss at step 120000 :  16.716911771726608  from  1017152  steps.
Average loss at step 140000 :  15.440608703804017  from  1017152  steps.
Average loss at step 160000 :  14.35392348074913  from  1017152  steps.
Average loss at step 180000 :  13.487041939735413  from  1017152  steps.
Average loss at step 200000 :  12.711474601078033  from  1017152  steps.
Average loss at step 220000 :  12.034106461453439  from  1017152  steps.
Average loss at step 240000 :  11.467714803314209  from  1017152  steps.
Average loss at step 260000 :  10.936181

In [7]:
# Reload the embedding frame written by the training cell as a sanity check.
filename = "ip2vec_emb_" + main_str
ip2vec_emb = loadfile(filename)

# Show the first rows (embedding columns plus the 'values' token column) and
# the overall shape.
display(ip2vec_emb.head())
print(ip2vec_emb.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,values
0,0.374719,0.242699,0.804956,0.23355,0.753552,0.161305,0.899202,0.471473,0.6394,0.082626,...,0.585408,0.393872,0.928697,0.461868,0.38963,0.020627,0.514851,0.879726,0.615675,TCP
1,0.690727,0.289209,0.624222,0.78716,0.259145,1.0,0.844552,0.869673,0.398222,0.751319,...,1.0,0.432327,0.184115,0.275347,0.72082,0.33834,0.552738,0.503326,0.304589,0.0_d
2,0.700514,0.424836,0.185812,0.616382,0.46956,0.492044,0.0,0.564201,0.792898,0.874603,...,0.329597,0.540815,0.640496,0.986693,0.467639,0.614137,0.588136,0.802519,0.292566,1_k
3,0.0,0.229812,0.72176,0.84771,0.42583,0.227527,0.530943,0.329691,0.724129,0.800538,...,0.757808,0.089902,0.220831,0.921574,0.741282,0.982664,0.848599,0.53569,0.36775,443_p
4,0.197396,0.0,0.483233,0.585403,0.172215,0.472147,0.425286,0.326074,0.365241,0.838361,...,0.956589,0.395134,0.096285,0.80928,0.605448,0.637512,1.0,0.906251,0.830612,80_p


(58863, 21)


In [8]:
# Append one row describing this run to a shared CSV log of hyper-parameters.
save = True
if save:
    params_path = "C:/Users/Akarsh/Downloads/DP_scripts/store_emb/store_params.csv"
    # NOTE(review): 'final loss' records whatever the training cell left in
    # `average_loss`, and 'lr' repeats the literal passed to the optimizer -
    # keep both in sync with those cells.
    df = pd.DataFrame.from_records([{'operation': 'ip2vec_train', 'main_str': main_str, 'vocab_size': vocab_size,
                                     'batch_size': batch_size, 'embedding_size': embedding_size,
                                     'negative_samp': num_sampled, 'lr': 0.05, 'num_epochs': num_epochs, 'final loss': average_loss}])

    # Write the column header only when starting a new (or empty) log file.
    # Appending it on every run, as before, fills the CSV with repeated
    # header rows.
    write_header = (not os.path.exists(params_path)) or os.path.getsize(params_path) == 0
    df.to_csv(params_path, mode='a', index=False, header=write_header)
    print('data and hyperparams saved...')

data and hyperparams saved...
