In [54]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline
import tensorflow as tf
import glob

In [55]:
import os
import os.path

In [56]:
# Load a reproducible random sample of the CSV files in DATA_DIR into one frame
# and keep only the rows whose 'radiant_win' value is present.
DATA_DIR = "data/anomaly_data"
MAX_SAMPLED_FILES = 10   # upper bound on how many files are read and concatenated
FILE_SAMPLE_SEED = 0     # fixed seed so the same files are drawn on every run

# sorted() removes the dependence on the filesystem's listing order, which would
# otherwise make the seeded draw below pick different files on different machines.
candidatePaths = [os.path.join(DATA_DIR, entry) for entry in sorted(os.listdir(DATA_DIR))]

# Skip directories and files of 3 bytes or fewer
# (presumably empty exports -- TODO confirm what such files contain).
validFilePaths = [
    path for path in candidatePaths
    if not os.path.isdir(path) and os.stat(path).st_size > 3
]
if not validFilePaths:
    raise ValueError("no usable data files found in %r" % DATA_DIR)

# Never ask for more files than exist: choice(..., replace=False) raises otherwise.
# A private RandomState keeps the draw reproducible without touching the global RNG.
sampleSize = min(MAX_SAMPLED_FILES, len(validFilePaths))
validFilePaths = np.random.RandomState(FILE_SAMPLE_SEED).choice(
    validFilePaths, sampleSize, replace=False)

df = pandas.concat((pandas.read_csv(path) for path in validFilePaths), ignore_index=True)
df = df[df['radiant_win'].notnull()]

In [57]:
# Show the frame's (rows, columns) and keep its column index for later lookups.
print(df.shape)
columns = df.columns
# Example lookup: every column whose name contains 'hero_id'.
df_catInteger_features_example = [name for name in columns if 'hero_id' in name]

(285, 331)


In [58]:
from itertools import chain


def _matching_columns(patterns, available):
    """Return the names in `available` that contain any of `patterns` as a substring.

    Names are grouped by pattern, in pattern order. A name matched by more than
    one pattern (e.g. a 'tower_kills...' column matches both 'kills' and
    'tower_kills') is returned only once, at its first position, so that
    selecting the result from a frame does not duplicate columns.
    """
    hits = chain.from_iterable(
        [name for name in available if pattern in name] for pattern in patterns
    )
    unique = []
    seen = set()
    for name in hits:
        if name not in seen:
            seen.add(name)
            unique.append(name)
    return unique


# Base feature names. The frame's columns are looked up by substring match
# against these, not by exact name.
numericalFeatures = ['match_id', 'positive_votes', 'negative_votes', 'first_blood_time', 'radiant_win',
                    'duration', 'kills', 'deaths', 'assists', 'apm', 'kpm', 'kda', 'hero_dmg',
                    'gpm', 'hero_heal', 'xpm', 'totalgold', 'totalxp', 'lasthits', 'denies',
                    'tower_kills', 'courier_kills', 'gold_spent', 'observer_uses', 'sentry_uses',
                    'ancient_kills', 'neutral_kills', 'camps_stacked', 'pings', 'rune_pickups']
categoricalIntegerPatterns = ['barracks_status', 'tower_status', 'hero_id',
                              'item0', 'item1', 'item2', 'item3', 'item4', 'item5']
categoricalFullFeatures = ['patch']

# Resolved column-name lists, one per feature kind.
numFeatures = _matching_columns(numericalFeatures, columns)
categoricalIntegerFeatures = _matching_columns(categoricalIntegerPatterns, columns)
catFull = _matching_columns(categoricalFullFeatures, columns)

In [61]:
# Split the frame by feature kind and encode the two categorical parts.

# .copy() yields a frame independent of df, so the column assignment below does
# not raise SettingWithCopyWarning and does not depend on view-vs-copy behavior.
df_numerical = df[numFeatures].copy()
df_numerical['radiant_win'] = df_numerical['radiant_win'].astype(int)
df_cat_num = df[categoricalIntegerFeatures]
df_cat = df[catFull]

# Vectorize the full categorical columns from per-row dicts; missing values are
# replaced by the literal 'NA' first. Result is a scipy sparse matrix.
vectorizer = DictVectorizer(sparse = True)
df_cat = vectorizer.fit_transform(df_cat.fillna('NA').to_dict(orient="records"))

# One-hot encode the integer-coded categorical columns. Result is a scipy sparse matrix.
enc = OneHotEncoder(sparse = True)
df_cat_num = enc.fit_transform(df_cat_num)

In [62]:
from scipy.sparse import coo_matrix, hstack

# Show what each feature part currently is before stacking.
print(type(df_numerical))
print(type(df_cat_num))
print(type(df_cat))

df_cat_num = coo_matrix(df_cat_num)
df_cat = coo_matrix(df_cat)

# Stack all three parts column-wise into one sparse feature matrix.
# NOTE(review): df_cat was previously converted but left out of the stack, which
# silently dropped its columns; it is included here. Confirm this is intended,
# since it increases the number of feature columns.
# From here on `df` is a scipy sparse matrix, no longer a DataFrame.
df = hstack([df_cat_num, df_cat, df_numerical])

<class 'pandas.core.frame.DataFrame'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# df = pandas.concat([df_numerical, df_cat, df_cat_num], ignore_index=True)

In [71]:
# Random row split: draw one uniform number per row and bucket the row indices
# by threshold (< 0.7, [0.7, 0.9), >= 0.9).
np.random.seed(1)
x = np.random.rand(df.shape[0])
mask = np.flatnonzero(x < 0.7)
mask1 = np.flatnonzero((x >= 0.7) & (x < 0.9))
mask2 = np.flatnonzero(x >= 0.9)

In [70]:
# `mask` already holds row indices, so print it directly; np.where(mask) would
# instead report positions of the non-zero entries inside `mask` itself.
print(mask)

[  0   2   3   4   5   6   7   8   9  10  11  12  14  15  16  17  18  19
  22  23  26  27  28  30  31  33  34  35  36  38  42  44  45  47  48  49
  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67
  69  71  72  74  75  77  81  83  84  86  88  89  90  92  93  94  95  97
  98  99 100 101 103 105 106 108 110 111 113 114 119 120 121 122 123 125
 126 128 129 130 132 133 135 137 140 141 142 143 144 145 146 148 149 150
 152 153 154 156 157 160 161 162 164 165 166 167 168 169 170 172 173 174
 176 177 178 179 180 181 183 184 186 187 188 190 191 196 197 198 201 203
 204 205 206 207 210 212 213 214 215 216 219 220 221 222 224 226 230 231
 233 234 235 236 237 238 240 242 243 245 246 247 248 250 252 254 256 257
 258 259 263 265 266 267 268 269 270 271 272 273 275 276 277 278 282 283
 284]


In [73]:
# Convert to CSR once (it supports row indexing) and slice out the three row subsets.
df_csr = df.tocsr()
df_train = df_csr[mask, :]
df_validation = df_csr[mask1, :]
df_test = df_csr[mask2, :]

In [76]:
# Width (column count) of the stacked 2-D feature matrix.
NumFeatures = df.shape[-1]

In [77]:
def construct(x, layer_size=None, learning_rate=0.1):
    """Build a four-layer ReLU autoencoder graph on top of ``x``.

    The layer widths are NumFeatures -> layer_size[0] -> layer_size[1] ->
    layer_size[2] -> NumFeatures, and ``x`` itself is the reconstruction target.

    Args:
        x: input tensor; it is multiplied by a ``[NumFeatures, layer_size[0]]``
            weight matrix, so its last dimension must be ``NumFeatures``.
        layer_size: sequence whose first three entries are the hidden-layer
            widths. ``None`` (the default) means ``[10, 10, NumFeatures]``,
            resolved at call time so it always agrees with the current
            ``NumFeatures`` rather than the value at definition time.
        learning_rate: learning rate passed to the momentum optimizer.

    Returns:
        ``(init, optimizer)``: the global-variables initializer op and the
        training op minimizing the mean squared reconstruction error.
    """
    if layer_size is None:
        layer_size = [10, 10, NumFeatures]

    def dense_relu(inputs, n_in, n_out):
        # One fully connected layer with randomly initialized weights/bias and ReLU.
        weights = tf.Variable(tf.random_normal([n_in, n_out]))
        bias = tf.Variable(tf.random_normal([n_out]))
        return tf.nn.relu(tf.matmul(inputs, weights) + bias)

    # Two encoder layers followed by two decoder layers.
    widths = [NumFeatures, layer_size[0], layer_size[1], layer_size[2], NumFeatures]
    output = x
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        output = dense_relu(output, n_in, n_out)

    # Mean squared error between the input and its reconstruction.
    cost = tf.reduce_mean(tf.pow(x - output, 2))
    momentum = 0.5
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum).minimize(cost)

    init = tf.global_variables_initializer()

    return init, optimizer

In [None]:
# Train the autoencoder on random mini-batches drawn from the training rows.
with tf.Session() as sess:
    x = tf.placeholder(tf.float32, [None, NumFeatures])
    init, optimizer = construct(x)
    sess.run(init)
    numEpochs = 1000
    numBatches = 1000
    numTrainRows = df_train.shape[0]
    # 10% of the training rows per batch, but never an empty batch on tiny inputs.
    batchSize = max(1, int(round(0.1 * numTrainRows)))
    for epochIter in range(numEpochs):
        for batchItr in range(numBatches):
            # An int population is sampled as np.arange(numTrainRows), which avoids
            # rebuilding and converting a range on every step.
            indices = np.random.choice(numTrainRows, batchSize, replace=False)
            # Densify only the selected rows before feeding them to the graph.
            batch = df_train[indices, :].toarray()
            sess.run(optimizer, feed_dict = {x : batch})