In [128]:
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Normalization
import itertools
from sklearn.metrics import confusion_matrix

In [130]:
length = 40
human_ratio = 0.8
bot_ratio = 0.2
def data_handler_VAR(data, max_length = length):
    d = np.zeros((len(data), max_length, 2))
    for line in range(len(data)):
        tmp = data[line]
        l = len(tmp)
        if l < max_length:
            for i in range(max_length-l):
                tmp.append(data[line][-1])
        else:
            tmp = tmp[:max_length]
        d[line,::] = np.array(tmp)
    layer = Normalization(axis=1)
    layer.adapt(d)
    trains = layer(d)
    return d

def read_human(path='Mousecollector/records1.txt'):
    train = pd.read_csv(path, sep=' ', header=None, encoding='utf-8', names=['data'])
    data = train['data'].apply(lambda x: [list(map(float, point.split(','))) for point in x.split(';')])
    return data_handler_VAR(data[:int(len(data)*human_ratio)]), data_handler_VAR(list(data[int(len(data)*human_ratio):]))

def read_bot(path='data/gc2.csv', return_original = False):
    train = pd.read_csv(path, sep=' ', header=None, encoding='utf-8', names=['id','data','_','1','end'])
    data = train['data'].apply(lambda x: [list(map(float, point.split(';'))) for idx, point in enumerate(x.split(',')) if idx % 2 == 0][2:-2])
    return data_handler_VAR(data[:int(len(data)*bot_ratio)]), data_handler_VAR(list(data[int(len(data)*bot_ratio):]))

def read_data():
    a, c = read_human() 
    b, d = read_bot()
    return np.append(a, b, axis=0), a.shape[0], b.shape[0], np.append(c, d, axis=0), c.shape[0], d.shape[0]

In [131]:
trains, n_human1, n_bot1, tests, n_human2, n_bot2  = read_data()
train_labels = np.array([0]*n_human1 + [1]*n_bot1).reshape((n_human1+n_bot1, 1))
test_labels = np.array([0]*n_human2 + [1]*n_bot2).reshape((n_human2+n_bot2, 1))
test_labels.shape

(860, 1)

In [132]:
n_timesteps, n_features, n_outputs = trains.shape[1], trains.shape[2], train_labels.shape[0]

In [154]:
model = Sequential()
model.add(LSTM(200, input_shape=(n_timesteps, n_features)))
#model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_14 (LSTM)              (None, 200)               162400    
                                                                 
 dense_14 (Dense)            (None, 1)                 201       
                                                                 
Total params: 162,601
Trainable params: 162,601
Non-trainable params: 0
_________________________________________________________________


In [155]:
verbose, epochs, batch_size = 0, 32, 64
model.fit(trains, train_labels, epochs=epochs, batch_size=batch_size, verbose=verbose)
# evaluate model
_, accuracy = model.evaluate(trains, train_labels, batch_size=batch_size, verbose=0)
accuracy

1.0

In [156]:
_, accuracy = model.evaluate(tests, test_labels, batch_size=batch_size, verbose=0)
accuracy

1.0

In [157]:
# testing on the old 3000 data entries for bot 
def read_old_data(path='data/dsjtzs_txfz_training.txt'):
    train = pd.read_csv(path, sep=' ', header=None, encoding='utf-8', names=['id', 'data', 'target', 'label'])
    data = train['data'].apply(lambda x: [list(map(float, point.split(',')))[:-1] for point in x.split(';')[:-1]])
    label = list(train['label'])
    
    sep = label.index(0) + 200
    human = data[:sep]
    bot = list(data[sep:])
    return data_handler_VAR(bot),  np.array(label[sep:]).reshape((len(label[sep:]),1))

In [158]:
test2, test_label2 = read_old_data()
test_label2.shape

(200, 1)

In [159]:
_, accuracy = model.evaluate(test2, test_label2, batch_size=batch_size, verbose=0)
accuracy

0.0

Still fails to identify the old bot data

In [160]:
# training on the combined data
def read_train2(path='data/dsjtzs_txfz_training.txt'):
    train = pd.read_csv(path, sep=' ', header=None, encoding='utf-8', names=['id', 'data', 'target', 'label'])
    data = train['data'].apply(lambda x: [list(map(float, point.split(',')))[:-1] for point in x.split(';')[:-1]])
    label = list(train['label'])
    
    sep = label.index(0)
    human = list(data[:sep//2])
    bot = list(data[sep+200:])
    return np.append(data_handler_VAR(human), data_handler_VAR(bot), axis=0),  np.array(label[:sep//2]+label[sep+200:]).reshape((len(label)//2,1))

In [161]:
old_train, old_labels = read_train2()
new_trains, new_train_labels = np.append(trains, old_train, axis=0), np.append(train_labels, old_labels, axis=0), 

In [162]:
verbose, epochs, batch_size = 0, 32, 64
model.fit(new_trains, new_train_labels, epochs=epochs, batch_size=batch_size, verbose=verbose)
# evaluate model
_, accuracy = model.evaluate(new_trains, new_train_labels, batch_size=batch_size, verbose=0)
accuracy

0.9386597871780396

In [163]:
# testing on the old 3000 data entries for bot 
_, accuracy = model.evaluate(test2, test_label2, batch_size=batch_size, verbose=0)
accuracy

0.5

In [164]:
_, accuracy = model.evaluate(tests, test_labels, batch_size=batch_size, verbose=0)
accuracy

1.0