In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline
import tensorflow as tf
import glob

In [2]:
import os
import os.path

In [3]:
# Directory holding the raw CSV files that make up the data set.
DATA_DIR = "data/anomaly_data"

# Collect every regular file in DATA_DIR that is large enough to hold data.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the row order of `df` -- and therefore the seeded random split made
# later in the notebook -- identical from one run to the next.
validFilePaths = []
for f in sorted(os.listdir(DATA_DIR)):
    filePath = os.path.join(DATA_DIR, f)
    if os.path.isdir(filePath):
        continue
    # Files of 3 bytes or fewer are skipped (presumably empty exports -- TODO confirm).
    if os.stat(filePath).st_size <= 3:
        continue
    validFilePaths.append(filePath)

# Read the CSVs lazily and stack them into one frame with a fresh 0..n-1 index.
df_list = (pandas.read_csv(f) for f in validFilePaths)
df = pandas.concat(df_list, ignore_index=True)
# Keep only the rows whose 'radiant_win' value is present.
df = df[df['radiant_win'].notnull()]

In [4]:
df.shape
columns = df.columns
# Example of the substring matching used below: every column whose name
# contains 'hero_id'.
df_catInteger_features_example = [col for col in columns if 'hero_id' in col]

In [5]:
from itertools import chain

# Name fragments used to pick columns out of `columns`. A column is selected
# when a fragment occurs anywhere inside its name (plain substring match).
numericalFeatures = ['match_id', 'positive_votes', 'negative_votes', 'first_blood_time', 'radiant_win',
                    'duration', 'kills', 'deaths', 'assists', 'apm', 'kpm', 'kda', 'hero_dmg',
                    'gpm', 'hero_heal', 'xpm', 'totalgold', 'totalxp', 'lasthits', 'denies',
                    'tower_kills', 'courier_kills', 'gold_spent', 'observer_uses', 'sentry_uses',
                    'ancient_kills', 'neutral_kills', 'camps_stacked', 'pings', 'rune_pickups']
categoricalIntegerFeatures = ['barracks_status', 'tower_status', 'hero_id',
                              'item0', 'item1', 'item2', 'item3', 'item4', 'item5']
categoricalFullFeatures = ['patch']


def _matching_columns(fragments, all_columns):
    """Return the names in `all_columns` that contain any of `fragments`.

    Names are returned in fragment order, then column order, and each name
    appears only once. The de-duplication matters because fragments overlap
    (e.g. 'kills' is a substring of 'tower_kills'), which would otherwise
    select the same column several times and duplicate it in the feature
    matrix.
    """
    candidates = chain.from_iterable(
        [col for col in all_columns if fragment in col] for fragment in fragments)
    seen = set()
    matched = []
    for col in candidates:
        if col not in seen:
            seen.add(col)
            matched.append(col)
    return matched


numFeatures = _matching_columns(numericalFeatures, columns)
# From here on categoricalIntegerFeatures holds matched column names rather
# than the fragments listed above.
categoricalIntegerFeatures = _matching_columns(categoricalIntegerFeatures, columns)
catFull = _matching_columns(categoricalFullFeatures, columns)

In [6]:
# Numerical columns as a plain numpy array.
df_numerical = df[numFeatures].values

# Free-form categorical columns: missing values become the literal 'NA', each
# row is turned into a {column: value} dict, and DictVectorizer encodes the
# records into a scipy sparse matrix.
vectorizer = DictVectorizer(sparse = True)
cat_records = df[catFull].fillna('NA').to_dict(orient="records")
df_cat = vectorizer.fit_transform(cat_records)

# Integer-coded categorical columns: one-hot encoded into a scipy sparse matrix.
enc = OneHotEncoder(sparse = True)
df_cat_num = enc.fit_transform(df[categoricalIntegerFeatures])

In [7]:
# Show which container type each of the three feature blocks ended up in.
for feature_block in (df_numerical, df_cat_num, df_cat):
    print(type(feature_block))

<type 'numpy.ndarray'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
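# TODO: merge the three feature blocks into one matrix. The line below does not
# work as written: pandas.concat only accepts pandas objects, while
# df_numerical is a numpy array and df_cat / df_cat_num are scipy sparse matrices.
# df = pandas.concat([df_numerical, df_cat, df_cat_num], ignore_index=True)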

In [None]:
# Draw one uniform random number in [0, 1) per row of `df` and bucket the draws
# into roughly 70% / 20% / 10% train / validation / test masks.
np.random.seed(1)  # fixed seed so the same draws are produced on every run
x = np.random.rand(len(df))
mask = x < 0.7
mask1 = np.logical_and(x >= 0.7, x < 0.9)
# np.logical_and is a binary ufunc and raises when called with one argument;
# the last bucket only needs its lower bound, so a plain comparison is enough.
mask2 = x >= 0.9

In [None]:
# Slice the full frame into its three partitions with the boolean row masks.
df_train, df_validation, df_test = [df[row_mask] for row_mask in (mask, mask1, mask2)]

In [None]:
# Width of the network's input and output layers: the number of columns in `df`.
# NOTE(review): this counts every column of the raw frame, not the encoded
# feature blocks built above -- confirm this is the intended input width.
NumFeatures = df.shape[1]

In [None]:
def construct(x, layer_size=[10, 10, NumFeatures], learning_rate=0.1):
    """Build a four-layer ReLU autoencoder graph on top of `x`.

    Args:
        x: input tensor that is multiplied by a [NumFeatures, layer_size[0]]
            weight matrix; it is also the reconstruction target.
        layer_size: widths of the three hidden layers (only the first three
            entries are read).
        learning_rate: step size passed to the momentum optimizer.

    Returns:
        A tuple (init, optimizer): the global-variables initializer op and
        the op that performs one minimization step on the mean squared
        reconstruction error.
    """
    # Layer widths from input to output: two encoder layers, then two decoder
    # layers that map back to NumFeatures.
    widths = [NumFeatures, layer_size[0], layer_size[1], layer_size[2], NumFeatures]

    activation = x
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        # Weights and biases both start from standard-normal draws.
        weights = tf.Variable(tf.random_normal([fan_in, fan_out]))
        bias = tf.Variable(tf.random_normal([fan_out]))
        activation = tf.nn.relu(tf.matmul(activation, weights) + bias)

    # Mean squared difference between the input and its reconstruction.
    cost = tf.reduce_mean(tf.pow(x - activation, 2))
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.5).minimize(cost)

    init = tf.global_variables_initializer()

    return init, optimizer

In [None]:
# Train the autoencoder: build the graph, initialise its variables, then run
# numEpochs * numBatches optimizer steps on batches of training rows.
with tf.Session() as sess:
    # Placeholder for a batch of rows with NumFeatures columns each.
    x = tf.placeholder(tf.float32, [None, NumFeatures])
    init, optimizer = construct(x)
    sess.run(init)
    numEpochs = 1000
    numBatches = 1000
    # Each batch is 10% of the training partition. round() returns a float on
    # Python 2, so convert explicitly to get an integer row count.
    batchSize = int(round(0.1 * len(df_train)))
    for epochIter in xrange(numEpochs):
        for batchItr in xrange(numBatches):
            # Draw batches from the training partition only, so validation and
            # test rows never reach the optimizer.
            # NOTE(review): batchGetter is not defined in the visible cells --
            # confirm it exists and returns a [batchSize, NumFeatures] float array.
            batch = batchGetter(df_train, batchSize)
            # The session is bound to `sess` by the `with` statement above.
            sess.run(optimizer, feed_dict = {x : batch})