In [100]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf


In [46]:
data = pd.read_csv("~/data/uba.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145300 entries, 0 to 145299
Data columns (total 21 columns):
ActivityID                145300 non-null object
ProcessName               145300 non-null object
ApplicationName           139258 non-null object
TotalSeconds              145299 non-null object
LapsedSeconds             145295 non-null object
ApplFocusInterval         145291 non-null object
currentWindowsIdentity    145291 non-null object
IsUserPartOfAdminGroup    95 non-null object
IntegrityLevel            13 non-null object
Recordings                7 non-null object
mediaUpdate               145202 non-null object
updateDateTime            145286 non-null object
Session                   145293 non-null object
CreatedDateTime           145289 non-null object
CreatedDateTimeIST        145290 non-null object
Unnamed: 15               94 non-null object
Unnamed: 16               10 non-null object
Unnamed: 17               3 non-null object
Unnamed: 18               2 non-n

In [47]:
data = data.drop(data[(~data[["Unnamed: 15","Unnamed: 16","Unnamed: 17","Unnamed: 18","Unnamed: 19","Unnamed: 20"]].isna()).any(axis=1)].index)

In [48]:
data = data.drop(["Unnamed: 15","Unnamed: 16","Unnamed: 17","Unnamed: 18","Unnamed: 19","Unnamed: 20"],axis=1)

In [49]:
data = data.drop(145299)

In [50]:
data = data.drop(55044)

In [51]:
data = data.drop(74912)

In [52]:
data = data.drop(121144)

In [53]:
data = data.drop(123328)

## Cleaned Data

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145200 entries, 0 to 145298
Data columns (total 15 columns):
ActivityID                145200 non-null object
ProcessName               145200 non-null object
ApplicationName           139158 non-null object
TotalSeconds              145200 non-null object
LapsedSeconds             145196 non-null object
ApplFocusInterval         145196 non-null object
currentWindowsIdentity    145196 non-null object
IsUserPartOfAdminGroup    0 non-null object
IntegrityLevel            0 non-null object
Recordings                0 non-null object
mediaUpdate               145196 non-null object
updateDateTime            145196 non-null object
Session                   145196 non-null object
CreatedDateTime           145196 non-null object
CreatedDateTimeIST        145196 non-null object
dtypes: object(15)
memory usage: 17.7+ MB


data.ActivityID = pd.to_numeric(data.ActivityID, errors='coerce')

In [55]:
data.ActivityID = pd.to_numeric(data.ActivityID, errors='coerce')

In [56]:
data.TotalSeconds = pd.to_numeric(data.TotalSeconds, errors='coerce')

data.LapsedSeconds = pd.to_numeric(data.LapsedSeconds, errors='coerce')

data.ApplFocusInterval = pd.to_numeric(data.ApplFocusInterval, errors='coerce')

data.Session = pd.to_numeric(data.Session, errors='coerce')

data.updateDateTime = pd.to_datetime(data.updateDateTime, yearfirst=True)

data.CreatedDateTime = pd.to_datetime(data.CreatedDateTime, yearfirst=True)

In [57]:
data.TotalSeconds = data.TotalSeconds.replace({np.nan:0})

data.LapsedSeconds = data.LapsedSeconds.replace({np.nan:0})

data.Session = data.Session.replace({np.nan:0}).astype(np.int32)

data.ApplFocusInterval = data.ApplFocusInterval.replace({np.nan:0})

data.currentWindowsIdentity = data.currentWindowsIdentity.replace({np.nan:'Unknown'})

In [139]:
data = data.drop(data[data.updateDateTime.isna()].index)

In [59]:
data = data.drop(["IsUserPartOfAdminGroup","IntegrityLevel","Recordings"],axis=1)

In [60]:
def create_mapping(unique_vals):
    sort_data = np.sort(unique_vals)
    zipper = zip(sort_data,np.arange(sort_data.shape[0]))
    return dict(zipper)

In [61]:
def reverse_mapping(map_):
    rev_map = {}
    for k,v in map_.items():
        rev_map[v] = k
    return rev_map

In [62]:
ident_mapping = create_mapping(data.currentWindowsIdentity.unique())
data.currentWindowsIdentity = data.currentWindowsIdentity.replace(ident_mapping).astype(np.int32)
data.ApplicationName = data.ApplicationName.replace({np.nan:'Unknown'})

In [63]:
data.mediaUpdate = data.mediaUpdate.replace({np.nan:'Unknown'})
media_update_mapping = create_mapping(data.mediaUpdate.unique())
data.mediaUpdate = data.mediaUpdate.replace(media_update_mapping).astype(np.int32)

In [64]:
process_mapping = create_mapping(data.ProcessName.unique())
data.ProcessName = data.ProcessName.replace(process_mapping).astype(np.int32)

In [65]:
data["update_day"] = data.updateDateTime.dt.dayofweek
data["update_time"] = data.updateDateTime.dt.hour*60 + data.updateDateTime.dt.minute
data["update_hour"] = data.updateDateTime.dt.hour

In [66]:
data["update_week"] = data.updateDateTime.dt.weekofyear

In [67]:
data["created_hour"] = data.CreatedDateTime.dt.hour
data["created_hour_IST"] = data.CreatedDateTimeIST.dt.hour

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
data["created_day"] = data.CreatedDateTime.dt.dayofweek
data["created_day_IST"] = data.CreatedDateTimeIST.dt.dayofweek

In [None]:
data = data.drop(["updateDateTime","CreatedDateTime",'CreatedDateTimeIST'],axis=1)

In [None]:
normprocdf = data.drop(["ApplicationName"],axis=1)
normprocdf.head()
normprocdf.shape[0]

In [None]:
df = normprocdf.copy()

In [104]:
class predict_proc_NN:
    def __init__(self,dataframe):
        self.dataframe = dataframe
        self.total_process_count = self.dataframe.ProcessName.unique().shape
        self.create_one_hot_encoding(dataframe.shape[0])
        
    def create_one_hot_encoding(self, rows):
        self.encoding = np.zeros((rows,self.total_process_count[0]),dtype=np.int)
        print(rows)
        v = 0
        for i in self.dataframe.iterrows() :
            # print(i[0])
            self.encoding[v][int(i[1].ProcessName)]=1
            v+=1
        #print(self.encoding)
    
    def customize_wt(self):
        pass
    
    def train_nn(self):
        self.y = self.encoding
        np.random.seed(1)
        self.data_with_user = self.dataframe[['currentWindowsIdentity','LapsedSeconds','ApplFocusInterval','update_day','update_hour','update_week']]
        X_train, self.X_test, y_train, self.y_test = train_test_split(self.data_with_user, self.y, test_size=0.33, random_state=53)
        
        self.model = tf.keras.Sequential([
                tf.layers.Dense(150,input_shape=[6],activation=tf.nn.tanh),
                tf.layers.BatchNormalization(),
                tf.layers.Dense(320,activation=tf.nn.tanh),
                tf.layers.Dense(547,activation=tf.nn.softmax)
                ])
        self.model.compile(tf.train.GradientDescentOptimizer(0.2),tf.losses.softmax_cross_entropy)
        self.model.fit(X_train,y_train,epochs=50,verbose=1,batch_size=32)
        
    
    def predict_output(self):
        self.prediction = self.model.predict(self.X_test)
        
    def accuracy_predict(self):
        count = np.sum(np.argmax(self.prediction,axis=1) == np.argmax(self.y_test,axis=1))
        print("Accuracy:%0.4f"%(float(count)/15000))


In [106]:
nn = predict_proc_NN(df)


145196


In [110]:
nn.predict_output()
nn.accuracy_predict()

Accuracy:1.1673


In [111]:
nn.train_nn()

Epoch 1/50

KeyboardInterrupt: 

In [135]:
from numpy import random,dot,exp
class Layer():
    def __init__(self, no_neurons, inputs_per_neuron):
        self.weights = 2 * random.random((inputs_per_neuron, no_neurons)) - 1


class customLayer():
    def __init__(self, no_neurons, inputs_per_neuron):
        self.weights  = np.random.rand(inputs_per_neuron,no_neurons)
        for i in range(150):
            self.weights[0][i]=10**7
        
        
class NN():
    def __init__(self, layer1, layer2):
        self.layer1 = layer1
        self.layer2 = layer2

    def __sigmoid(self, x):
        return 1 / (1 + exp(-x))

    def __diff_sigmoid(self, x):
        return x * (1 - x)

    def train(self, training_ip, training_op, training_iterations):
        for iteration in range(training_iterations):
            print(iteration)
            output_from_layer_1, output_from_layer_2 = self.fwd(training_ip)

            layer2_error = training_op - output_from_layer_2
            layer2_delta = layer2_error * self.__diff_sigmoid(output_from_layer_2)

            layer1_error = layer2_delta.dot(self.layer2.weights.T)
            layer1_delta = layer1_error * self.__diff_sigmoid(output_from_layer_1)

            layer1_adjustment = training_ip.T.dot(layer1_delta)
            layer2_adjustment = output_from_layer_1.T.dot(layer2_delta)

            self.layer1.weights += 0.35*layer1_adjustment
            self.layer2.weights += 0.35*layer2_adjustment

    def fwd(self, inputs):
        
        output_from_layer1 = self.__sigmoid(dot(inputs, self.layer1.weights))
        output_from_layer2 = self.__sigmoid(dot(output_from_layer1, self.layer2.weights))
        #print(output_from_layer2)
        return output_from_layer1, output_from_layer2
    
    
class predict_proc_NN2:
    def __init__(self,dataframe):
        self.dataframe = dataframe
        self.total_process_count = self.dataframe.ProcessName.unique().shape
        self.create_one_hot_encoding(dataframe.shape[0])
        self.layer1  =  customLayer(150,6)
        self.layer2 = Layer(547,150)
        
    def create_one_hot_encoding(self, rows):
        self.encoding = np.zeros((rows,self.total_process_count[0]),dtype=np.int)
        print(rows)
        v = 0
        for i in self.dataframe.iterrows() :
            # print(i[0])
            self.encoding[v][int(i[1].ProcessName)]=1
            v+=1
        #print(self.encoding)
    
    def customize_wt(self):
        pass
    
    def train_nn(self):
        self.np_dataset = self.dataframe.values
        indices = np.random.choice(len(self.np_dataset), len(self.np_dataset), replace=False)
        X_values = self.np_dataset[indices]
        y_values = self.encoding[indices]

        test_size = int(self.dataframe.shape[0]*0.20)
        self.X_test = X_values[-test_size:]
        X_train = X_values[:-test_size]
        self.y_test = y_values[-test_size:]
        y_train = y_values[:-test_size]
        
        self.test_count = test_size
        neural_network = NN(self.layer1, self.layer2)
        neural_network.train(X_train, y_train, 1000)
    
    def predict_output(self):
        self.y_op1,self.yop_2 = neural_network.fwd(self.X_test)
    
    def calculate_accuracy(self):
        count =  np.sum(np.argmax(self.yop_2,axis=1) == np.argmax(self.y_test,axis=1))
        print("Accuaracy:%0.6f"%(float(count)/self.test_count))

        
    

In [137]:
nn2 = predict_proc_NN2(df)

145196


In [138]:
nn2.train_nn()

0


ValueError: shapes (116157,14) and (6,150) not aligned: 14 (dim 1) != 6 (dim 0)