In [8]:
import tensorflow as tf
from sklearn.metrics import log_loss, accuracy_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
df= pd.read_csv('input/bugzilla.csv')

In [10]:
def normalize(x):
  x = x.astype(float)
  min = np.min(x)
  max = np.max(x)
  return (x - min)/(max-min)

In [11]:
def view_values(X, y, example):
    label = y.loc[example]
    image = X.loc[example,:].values.reshape([-1,1])
    print(image)

In [12]:
print("Shape of dataframe: ", df.shape)

Shape of dataframe:  (4620, 17)


In [13]:
df.describe()

Unnamed: 0,transactionid,ns,nm,nf,entropy,la,ld,lt,fix,ndev,pd,npt,exp,rexp,sexp,bug
count,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0,4620.0
mean,5819.241558,1.169913,1.220346,2.287662,0.229153,0.07141,3.113758,591.379777,0.859957,16.424892,173.927922,0.952656,342.570563,253.033361,230.821429,0.3671
std,4223.181222,0.424315,0.569805,4.275243,0.371861,0.491756,198.6246,547.895977,0.34707,10.74343,646.256119,0.137676,392.273355,268.292945,269.729476,0.482066
min,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.037037,1.0,1.0,1.0,0.0
25%,2211.75,1.0,1.0,1.0,0.0,0.003093,0.001295,210.0,1.0,7.0,3.0,1.0,58.0,52.0,35.0,0.0
50%,5193.5,1.0,1.0,1.0,0.0,0.009909,0.005109,455.0,1.0,16.0,20.0,1.0,196.0,156.0,129.5,0.0
75%,8804.25,1.0,1.0,2.0,0.551098,0.035356,0.017502,799.25,1.0,24.0,119.0,1.0,489.0,352.0,341.25,1.0
max,20938.0,4.0,8.0,63.0,1.0,21.0,13495.0,2751.0,1.0,47.0,15836.0,1.0,1815.0,1042.75,1741.0,1.0


In [14]:
class RBM(object):
    
    def __init__(self, input_size, output_size, 
                 learning_rate, epochs, batchsize):

        self._input_size = input_size
        self._output_size = output_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batchsize = batchsize
        
        # Initialize weights and biases using zero matrices
        self.w = np.zeros([input_size, output_size], dtype=np.float32)
        self.hb = np.zeros([output_size], dtype=np.float32)
        self.vb = np.zeros([input_size], dtype=np.float32)
    # forward pass, where h is the hidden layer and v is the visible layer
    def prob_h_given_v(self, visible, w, hb):
        return tf.nn.sigmoid(tf.matmul(visible, w) + hb)
    # backward pass
    def prob_v_given_h(self, hidden, w, vb):
        return tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(w)) + vb)
    # sampling function
    def sample_prob(self, probs):
        return tf.nn.relu(tf.sign(probs - tf.random_uniform(tf.shape(probs))))

    def train(self, X):
        _w = tf.placeholder(tf.float32, [self._input_size, self._output_size])
        _hb = tf.placeholder(tf.float32, [self._output_size])
        _vb = tf.placeholder(tf.float32, [self._input_size])
        
        prv_w = np.zeros([self._input_size, self._output_size], dtype=np.float32)
        prv_hb = np.zeros([self._output_size], dtype=np.float32)
        prv_vb = np.zeros([self._input_size], dtype=np.float32)
        
        cur_w = np.zeros([self._input_size, self._output_size], dtype=np.float32)
        cur_hb = np.zeros([self._output_size], dtype=np.float32)
        cur_vb = np.zeros([self._input_size], dtype=np.float32)
        
        v0 = tf.placeholder(tf.float32, [None, self._input_size])
        h0 = self.sample_prob(self.prob_h_given_v(v0, _w, _hb))
        v1 = self.sample_prob(self.prob_v_given_h(h0, _w, _vb))
        h1 = self.prob_h_given_v(v1, _w, _hb)
        #To update the weights, we perform constrastive divergence.
        positive_grad = tf.matmul(tf.transpose(v0), h0)
        negative_grad = tf.matmul(tf.transpose(v1), h1)
        
        update_w = _w + self.learning_rate * (positive_grad - negative_grad) / tf.to_float(tf.shape(v0)[0])
        update_vb = _vb +  self.learning_rate * tf.reduce_mean(v0 - v1, 0)
        update_hb = _hb +  self.learning_rate * tf.reduce_mean(h0 - h1, 0)
        # We also define the error as the MSE
        err = tf.reduce_mean(tf.square(v0 - v1))
        
        error_list = []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            
            for epoch in range(self.epochs):
                for start, end in zip(range(0, len(X),  self.batchsize),range(self.batchsize,len(X), self.batchsize)):
                    batch = X[start:end]
                    cur_w = sess.run(update_w, feed_dict={v0: batch, _w: prv_w, _hb: prv_hb, _vb: prv_vb})
                    cur_hb = sess.run(update_hb, feed_dict={v0: batch,  _w: prv_w, _hb: prv_hb, _vb: prv_vb})
                    cur_vb = sess.run(update_vb, feed_dict={v0: batch, _w: prv_w, _hb: prv_hb, _vb: prv_vb})
                    prv_w = cur_w
                    prv_hb = cur_hb
                    prv_vb = cur_vb
                error = sess.run(err, feed_dict={v0: X, _w: cur_w, _vb: cur_vb, _hb: cur_hb})
                print ('Epoch: %d' % epoch,'reconstruction error: %f' % error)
                error_list.append(error)
            self.w = prv_w
            self.hb = prv_hb
            self.vb = prv_vb
            return error_list
    #function to generate new features from the generative model that the RBM has learned
    def rbm_output(self, X):
        
        input_X = tf.constant(X)
        _w = tf.constant(self.w)
        _hb = tf.constant(self.hb)
        _vb = tf.constant(self.vb)
        out = tf.nn.sigmoid(tf.matmul(input_X, _w) + _hb)
        hiddenGen = self.sample_prob(self.prob_h_given_v(input_X, _w, _hb))
        visibleGen = self.sample_prob(self.prob_v_given_h(hiddenGen, _w, _vb))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(out), sess.run(visibleGen), sess.run(hiddenGen)

In [15]:
df=df.drop(['commitdate','transactionid'], axis=1)

In [16]:
train_X = df.iloc[:,:-1].apply(func=normalize, axis=0)
train_Y = df.iloc[:,-1]

In [17]:
print(df.head())
df.shape

   ns  nm  nf   entropy        la        ld          lt  fix  ndev     pd  \
0   1   1   3  0.579380  0.093620  0.000000  480.666667    1    14    596   
1   1   1   1  0.000000  0.000000  0.000000  398.000000    1     1      0   
2   3   3  52  0.739279  0.183477  0.208913  283.519231    0    23  15836   
3   1   1   8  0.685328  0.016039  0.012880  514.375000    1    21   1281   
4   2   2  38  0.769776  0.091829  0.072746  366.815789    1    21   6565   

        npt  exp    rexp  sexp  bug  
0  0.666667  143  133.50   129    1  
1  1.000000  140  140.00   137    1  
2  0.750000  984  818.65   978    0  
3  1.000000  579  479.25   550    0  
4  0.763158  413  313.25   405    0  


(4620, 15)

In [18]:
inputX = df.iloc[:,:-1].apply(func=normalize, axis=0).values
inputY= df.iloc[:,-1].values
print(type(inputX))
inputX = inputX.astype(np.float32)

#List to hold RBMs
rbm_list = []

#define parameters of RBMs we will train
# 14-20-12-12-2

rbm_list.append(RBM(14, 20, 0.002, 200, 100))
rbm_list.append(RBM(20, 12, 0.002, 200, 100))
rbm_list.append(RBM(12, 12, 0.002, 200, 100))

<class 'numpy.ndarray'>


In [19]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

Instructions for updating:
non-resource variables are not supported in the long term


In [20]:
outputList = []
error_list = []

#For each RBM in out list
for i in range(0, len(rbm_list)):
    print('RBM', i+1)
    #Train new RBM
    rbm = rbm_list[i]
    err = rbm.train(inputX)
    error_list.append(err)

    #Return output layer
    #sess.run(out), sess.run(visibleGen), sess.run(hiddenGen)
    outputX, reconstructedX, hiddenX = rbm.rbm_output(inputX)
    outputList.append(outputX)
    inputX= hiddenX

RBM 1
Instructions for updating:
Use `tf.cast` instead.
Epoch: 0 reconstruction error: 0.405829
Epoch: 1 reconstruction error: 0.371015
Epoch: 2 reconstruction error: 0.342177
Epoch: 3 reconstruction error: 0.317912
Epoch: 4 reconstruction error: 0.294592
Epoch: 5 reconstruction error: 0.277995
Epoch: 6 reconstruction error: 0.262601
Epoch: 7 reconstruction error: 0.249755
Epoch: 8 reconstruction error: 0.236498
Epoch: 9 reconstruction error: 0.229906
Epoch: 10 reconstruction error: 0.219831
Epoch: 11 reconstruction error: 0.210117
Epoch: 12 reconstruction error: 0.205498
Epoch: 13 reconstruction error: 0.199783
Epoch: 14 reconstruction error: 0.193146
Epoch: 15 reconstruction error: 0.186822
Epoch: 16 reconstruction error: 0.182999
Epoch: 17 reconstruction error: 0.181354
Epoch: 18 reconstruction error: 0.175588
Epoch: 19 reconstruction error: 0.173575
Epoch: 20 reconstruction error: 0.171473
Epoch: 21 reconstruction error: 0.168823
Epoch: 22 reconstruction error: 0.166191
Epoch: 23 r

In [None]:
i = 1
for err in error_list:
    print("RBM",i)
    pd.Series(err).plot(logy=False)
    plt.xlabel("Epoch")
    plt.ylabel("Reconstruction Error")
    plt.show()
    i += 1

In [None]:
inputX = np.array(train_X)
inputX = inputX.astype(np.float32)
rbmOne = rbm_list[0]

In [None]:
print('RBM 1')
outputX_rbmOne, reconstructedX_rbmOne, hiddenX_rbmOne = rbmOne.rbm_output(inputX)
reconstructedX_rbmOne = pd.DataFrame(data=reconstructedX_rbmOne, index=train_X.index)
for j in range(0,1):
    example = j
    print("Data generated by First RBM Layer")
    view_values(reconstructedX_rbmOne, train_Y, example)
    print("Original Data")
    view_values(train_X, train_Y, example)

In [None]:
reconstructedX_rbmOne.shape

In [None]:
class DBN(object):
    def __init__(self, original_input_size, input_size, output_size, 
                 learning_rate, epochs, batchsize, rbmOne, rbmTwo, rbmThree):
        # Define hyperparameters
        self._original_input_size = original_input_size
        self._input_size = input_size
        self._output_size = output_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batchsize = batchsize
        self.rbmOne = rbmOne
        self.rbmTwo = rbmTwo
        self.rbmThree = rbmThree
    
        self.w = np.zeros([input_size, output_size], "float")
        self.hb = np.zeros([output_size], "float")
        self.vb = np.zeros([input_size], "float")
    
    def prob_h_given_v(self, visible, w, hb):
        return tf.nn.sigmoid(tf.matmul(visible, w) + hb)

    def prob_v_given_h(self, hidden, w, vb):
        return tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(w)) + vb)
    
    def sample_prob(self, probs):
        return tf.nn.relu(tf.sign(probs - tf.random_uniform(tf.shape(probs))))

    def train(self, X):
        _w = tf.placeholder("float", [self._input_size, self._output_size])
        _hb = tf.placeholder("float", [self._output_size])
        _vb = tf.placeholder("float", [self._input_size])
        
        prv_w = np.zeros([self._input_size, self._output_size], "float")
        prv_hb = np.zeros([self._output_size], "float")
        prv_vb = np.zeros([self._input_size], "float")
        
        cur_w = np.zeros([self._input_size, self._output_size], "float")
        cur_hb = np.zeros([self._output_size], "float")
        cur_vb = np.zeros([self._input_size], "float")
        
        v0 = tf.placeholder("float", [None, self._original_input_size])

        forwardOne = tf.nn.relu(tf.sign(tf.nn.sigmoid(tf.matmul(v0, self.rbmOne.w) + self.rbmOne.hb) - tf.random_uniform(tf.shape(tf.nn.sigmoid(tf.matmul(v0, self.rbmOne.w) + self.rbmOne.hb)))))
        forwardTwo = tf.nn.relu(tf.sign(tf.nn.sigmoid(tf.matmul(forwardOne, self.rbmTwo.w) + self.rbmTwo.hb) - tf.random_uniform(tf.shape(tf.nn.sigmoid(tf.matmul(forwardOne, self.rbmTwo.w) + self.rbmTwo.hb)))))
        forward = tf.nn.relu(tf.sign(tf.nn.sigmoid(tf.matmul(forwardTwo, self.rbmThree.w) + self.rbmThree.hb) - tf.random_uniform(tf.shape(tf.nn.sigmoid(tf.matmul( forwardTwo, self.rbmThree.w) + self.rbmThree.hb)))))
        h0 = self.sample_prob(self.prob_h_given_v(forward, _w, _hb))
        v1 = self.sample_prob(self.prob_v_given_h(h0, _w, _vb))
        h1 = self.prob_h_given_v(v1, _w, _hb)
        
        positive_grad = tf.matmul(tf.transpose(forward), h0)
        negative_grad = tf.matmul(tf.transpose(v1), h1)
        
        update_w = _w + self.learning_rate * (positive_grad - negative_grad) / tf.to_float(tf.shape(forward)[0])
        update_vb = _vb +  self.learning_rate * tf.reduce_mean(forward - v1, 0)
        update_hb = _hb +  self.learning_rate * tf.reduce_mean(h0 - h1, 0)
        
        backwardOne = tf.nn.relu(tf.sign(tf.nn.sigmoid(tf.matmul(v1, self.rbmThree.w.T) + self.rbmThree.vb) - tf.random_uniform(tf.shape(tf.nn.sigmoid(tf.matmul(v1, self.rbmThree.w.T) + self.rbmThree.vb)))))
        backwardTwo = tf.nn.relu(tf.sign(tf.nn.sigmoid(tf.matmul(backwardOne, self.rbmTwo.w.T) + self.rbmTwo.vb) - tf.random_uniform(tf.shape(tf.nn.sigmoid(tf.matmul(backwardOne, self.rbmTwo.w.T) + self.rbmTwo.vb)))))
        backward = tf.nn.relu(tf.sign(tf.nn.sigmoid(tf.matmul(backwardTwo, self.rbmOne.w.T) + self.rbmOne.vb) - tf.random_uniform(tf.shape(tf.nn.sigmoid(tf.matmul(backwardTwo, self.rbmOne.w.T) + self.rbmOne.vb)))))
        
        err = tf.reduce_mean(tf.square(v0 - backward))
        error_list = []
        
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for epoch in range(self.epochs):
                for start, end in zip(range(0, len(X), self.batchsize), range(self.batchsize,len(X), self.batchsize)):
                    batch = X[start:end]
                    cur_w = sess.run(update_w, feed_dict={v0: batch, _w: prv_w, _hb: prv_hb, _vb: prv_vb})
                    cur_hb = sess.run(update_hb, feed_dict={v0: batch, _w:  prv_w, _hb: prv_hb, _vb: prv_vb})
                    cur_vb = sess.run(update_vb, feed_dict={v0: batch, _w: prv_w, _hb: prv_hb, _vb: prv_vb})
                    prv_w = cur_w
                    prv_hb = cur_hb
                    prv_vb = cur_vb
                error = sess.run(err, feed_dict={v0: X, _w: cur_w, _vb: cur_vb, _hb: cur_hb})
                print ('Epoch: %d' % (epoch+1),'reconstruction error: %f' % error)
                error_list.append(error)
            self.w = prv_w
            self.hb = prv_hb
            self.vb = prv_vb
            return error_list
        
    def dbn_output(self, X):

        input_X = tf.constant(X)
        forwardOne = tf.nn.sigmoid(tf.matmul(input_X, self.rbmOne.w) + self.rbmOne.hb)
        forwardTwo = tf.nn.sigmoid(tf.matmul(forwardOne, self.rbmTwo.w) + self.rbmTwo.hb)
        forward = tf.nn.sigmoid(tf.matmul(forwardTwo, self.rbmThree.w) + self.rbmThree.hb)

        _w = tf.constant(self.w)
        _hb = tf.constant(self.hb)
        _vb = tf.constant(self.vb)

        out = tf.nn.sigmoid(tf.matmul(forward, _w) + _hb)
        hiddenGen = self.sample_prob(self.prob_h_given_v(forward, _w, _hb))
        visibleGen = self.sample_prob(self.prob_v_given_h(hiddenGen, _w, _vb))

        backwardTwo = tf.nn.sigmoid(tf.matmul(visibleGen, self.rbmThree.w.T) + self.rbmThree.vb)
        backwardOne = tf.nn.sigmoid(tf.matmul(backwardTwo, self.rbmTwo.w.T) + self.rbmTwo.vb)
        backward = tf.nn.sigmoid(tf.matmul(backwardOne, self.rbmOne.w.T) + self.rbmOne.vb)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(out), sess.run(backward)

In [None]:
dbn = DBN(14, 12, 12, 0.02, 50, 100, rbm_list[0], rbm_list[1], rbm_list[2])

In [None]:
inputX = np.array(inputX)
error_list = []
error_list = dbn.train(inputX)

In [None]:
print("DBN")
pd.Series(error_list).plot(logy=False)
plt.xlabel("Epoch")
plt.ylabel("Reconstruction Error")
plt.show()

In [None]:
train_X.shape
train_Y.head

In [None]:
outputX_dbn, reconstructedX_dbn = dbn.dbn_output(inputX)

In [None]:
def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def hypothesis(X, theta) :
    """ 
    X-entire array(m,n+1)
    n+1^ 1 dummy feature Xo

    theta- np.array(n+1,1)
    """
    return sigmoid(np.dot(X, theta))

def error(X,y,theta):
    """
    params:
    X:(m,n+1)
    y:(m,1)
    theta:(n+1,1)

    return:scale_value=loss
    """
    hi = hypothesis(X,theta)
    error= -1* np.mean ( y * np.log(hi) + ( ( 1 - y ) * (np.log( 1 - hi )) ) )

    return error

In [None]:
def gradient(X,y,theta):
    """
    params:
    X:(m,n+1)
    y:(m,1)

    theta:(n+1,1)
     
    return:gradient_vector: (n+1,1)
    """
    hi = hypothesis(X,theta)

    grad = np.dot(X.T,(y-hi))
    m=X.shape[0]

    return grad/m

def gradient_descent(X,y,lr=0.02,max_itr=500):
    
    n=X.shape[1] 
    theta = np.zeros((n,1))
    
    error_list= []
    
    for i in range(max_itr):
        err = error(X,y,theta)
        error_list.append(err)

        grad = gradient(X,y,theta)
        #update theta
        theta = theta + lr * grad
    return (theta, error_list)

In [None]:
ones = np.ones((outputX_dbn.shape[0],1))
X_New_Train = np.hstack((ones,outputX_dbn))
X_New_Train = X_New_Train[:4000,:]
Y_Train= Y_Train.reshape((-1,1))