In [18]:
%%bash
pip install kafka-python



In [19]:
from kafka import KafkaConsumer
import json
import re
from datetime import datetime
import pickle
import csv
import numpy as np

In [20]:
def _parse_span(raw_bytes):
    """Decode a raw Kafka message value and parse it as JSON.

    Bytes that are not valid ASCII are dropped ('ignore') before parsing.
    """
    return json.loads(raw_bytes.decode('ascii','ignore'))


# Consumer reading from the local broker, starting at the earliest offset
# when no committed offset exists; message values are parsed into Python objects.
consumer = KafkaConsumer(
    value_deserializer=_parse_span,
    auto_offset_reset='earliest',
    bootstrap_servers='localhost:9092',
)

In [21]:
# Subscribe to the topic carrying the Jaeger spans (subscribe takes a list of topics).
consumer.subscribe(['jaeger-spans'])

# Append mode keeps predictions logged by earlier runs. The header is written
# only when the file is still empty, so re-running this cell does not insert
# duplicate header rows in the middle of the log.
# newline='' lets the csv module control line endings itself.
with open('log_prediction_vr.csv', 'a', encoding='utf-8', newline='') as f:
    if f.tell() == 0:
        header_writer = csv.writer(f)
        header_writer.writerow(["Date","Time","Duration","Comment","PredictedLabel"])

In [22]:
# load svm model from folder
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
filename = 'svm_model_vr'
with open(filename, 'rb') as model_file:  # close the handle once loaded
    svm_model = pickle.load(model_file)

In [23]:
# load xgb model from folder
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
filename2 = 'xgb_model_vr'
with open(filename2, 'rb') as model_file:  # close the handle once loaded
    xgb_model = pickle.load(model_file)

In [24]:
# load logreg model from folder
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
filename3 = 'logreg_model_vr'
with open(filename3, 'rb') as model_file:  # close the handle once loaded
    logreg_model = pickle.load(model_file)

In [25]:
# load dt model from folder
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
filename4 = 'dt_model_vr'
with open(filename4, 'rb') as model_file:  # close the handle once loaded
    dt_model = pickle.load(model_file)

In [26]:
# load knn model from folder
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
filename5 = 'knn_model_vr'
with open(filename5, 'rb') as model_file:  # close the handle once loaded
    knn_model = pickle.load(model_file)

In [27]:
# load the fitted feature scaler
# NOTE: unpickling can execute arbitrary code; only load files you trust.
filename_scaler = 'scaler.sav'
with open(filename_scaler,'rb') as scaler_file:  # close the handle once loaded
    scaler = pickle.load(scaler_file)

In [28]:
#Custom vectorizer helper functions
def lowercase_convert(comment):
    """Return ``comment`` converted to lowercase via its ``lower()`` method."""
    lowered = comment.lower()
    return lowered

def count_equal(comment):
    """Return the fraction of characters in ``comment`` that are ``=``.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``=`` characters divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count("=") / total_chars

def count_openbracket(comment):
    """Return the fraction of characters in ``comment`` that are ``<``.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``<`` characters divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count("<") / total_chars

def count_closebracket(comment):
    """Return the fraction of characters in ``comment`` that are ``>``.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``>`` characters divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count(">") / total_chars

def count_openbracket2(comment):
    """Return the fraction of characters in ``comment`` that are ``(``.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``(`` characters divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count("(") / total_chars

def count_closebracket2(comment):
    """Return the fraction of characters in ``comment`` that are ``)``.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``)`` characters divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count(")") / total_chars

def count_int(comment):
    """Return the fraction of characters in ``comment`` that are digits.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of characters for which ``str.isdigit()`` is true divided
        by ``len(comment)``, or ``0.0`` for an empty comment (previously a
        ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    digit_count = sum(1 for ch in comment if ch.isdigit())
    return digit_count / total_chars

def count_div(comment):
    """Return how often the substring ``div`` occurs, relative to the length.

    The previous implementation compared each single character with the
    three-character string ``"div"``, which can never match, so it always
    returned 0. Occurrences are now counted as (non-overlapping) substrings.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``div`` occurrences divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count("div") / total_chars
            
def count_at(comment):
    """Return the fraction of characters in ``comment`` that are ``@``.

    Args:
        comment: The comment text to inspect.

    Returns:
        The number of ``@`` characters divided by ``len(comment)``, or
        ``0.0`` for an empty comment (previously a ZeroDivisionError).
    """
    total_chars = len(comment)
    if total_chars == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return comment.count("@") / total_chars

In [29]:
#Function to preprocess sample comments for each model 

def test_xgb(comment,duration,statuscode):
    """Predict the label of one comment with the XGBoost model.

    Builds the 11-column feature row, scales it with the module-level
    ``scaler`` and passes the three columns this model uses to
    ``xgb_model.predict``.

    Args:
        comment: Comment text to classify.
        duration: Span duration as a number or numeric string (the consumer
            loop passes the regex-extracted string).
        statuscode: HTTP status code of the request.

    Returns:
        The result of ``xgb_model.predict`` for the single-row input.

    Raises:
        TypeError, ValueError: If ``duration`` or ``statuscode`` cannot be
            converted to ``float``.
    """
    lowcomment = lowercase_convert(comment)

    # Column layout handed to the scaler:
    # 0 duration, 1 statuscode, 2 constant 0 (placeholder column; presumably
    # required by the fitted scaler - confirm), 3 equal, 4 openbracket,
    # 5 closebracket, 6 openbracket2, 7 closebracket2, 8 intcount, 9 div, 10 at
    features = [
        float(duration),
        float(statuscode),
        0.0,
        count_equal(lowcomment),
        count_openbracket(lowcomment),
        count_closebracket(lowcomment),
        count_openbracket2(lowcomment),
        count_closebracket2(lowcomment),
        count_int(lowcomment),
        count_div(lowcomment),
        count_at(lowcomment),
    ]
    # Explicit float dtype: a string duration would otherwise turn the whole
    # row into a string array and rely on implicit conversion in the scaler.
    data_input = np.array(features, dtype=float).reshape(1, 11)

    X = np.asarray(scaler.transform(data_input))
    # openbracket, openbracket2, equal
    final_input = X[:, [4, 6, 3]]

    return xgb_model.predict(final_input)

In [30]:
def test_svm(comment,duration,statuscode):
    """Predict the label of one comment with the SVM model.

    Builds the 11-column feature row, scales it with the module-level
    ``scaler`` and passes the three columns this model uses to
    ``svm_model.predict``.

    Args:
        comment: Comment text to classify.
        duration: Span duration as a number or numeric string (the consumer
            loop passes the regex-extracted string).
        statuscode: HTTP status code of the request.

    Returns:
        The result of ``svm_model.predict`` for the single-row input.

    Raises:
        TypeError, ValueError: If ``duration`` or ``statuscode`` cannot be
            converted to ``float``.
    """
    lowcomment = lowercase_convert(comment)

    # Column layout handed to the scaler:
    # 0 duration, 1 statuscode, 2 constant 0 (placeholder column; presumably
    # required by the fitted scaler - confirm), 3 equal, 4 openbracket,
    # 5 closebracket, 6 openbracket2, 7 closebracket2, 8 intcount, 9 div, 10 at
    features = [
        float(duration),
        float(statuscode),
        0.0,
        count_equal(lowcomment),
        count_openbracket(lowcomment),
        count_closebracket(lowcomment),
        count_openbracket2(lowcomment),
        count_closebracket2(lowcomment),
        count_int(lowcomment),
        count_div(lowcomment),
        count_at(lowcomment),
    ]
    # Explicit float dtype: a string duration would otherwise turn the whole
    # row into a string array and rely on implicit conversion in the scaler.
    data_input = np.array(features, dtype=float).reshape(1, 11)

    X = np.asarray(scaler.transform(data_input))
    # openbracket, at, equal
    final_input = X[:, [4, 10, 3]]

    return svm_model.predict(final_input)

In [31]:
def test_logreg(comment,duration,statuscode):
    """Predict the label of one comment with the logistic-regression model.

    Builds the 11-column feature row, scales it with the module-level
    ``scaler`` and passes the three columns this model uses to
    ``logreg_model.predict``.

    Args:
        comment: Comment text to classify.
        duration: Span duration as a number or numeric string (the consumer
            loop passes the regex-extracted string).
        statuscode: HTTP status code of the request.

    Returns:
        The result of ``logreg_model.predict`` for the single-row input.

    Raises:
        TypeError, ValueError: If ``duration`` or ``statuscode`` cannot be
            converted to ``float``.
    """
    lowcomment = lowercase_convert(comment)

    # Column layout handed to the scaler:
    # 0 duration, 1 statuscode, 2 constant 0 (placeholder column; presumably
    # required by the fitted scaler - confirm), 3 equal, 4 openbracket,
    # 5 closebracket, 6 openbracket2, 7 closebracket2, 8 intcount, 9 div, 10 at
    features = [
        float(duration),
        float(statuscode),
        0.0,
        count_equal(lowcomment),
        count_openbracket(lowcomment),
        count_closebracket(lowcomment),
        count_openbracket2(lowcomment),
        count_closebracket2(lowcomment),
        count_int(lowcomment),
        count_div(lowcomment),
        count_at(lowcomment),
    ]
    # Explicit float dtype: a string duration would otherwise turn the whole
    # row into a string array and rely on implicit conversion in the scaler.
    data_input = np.array(features, dtype=float).reshape(1, 11)

    X = np.asarray(scaler.transform(data_input))
    # closebracket2, openbracket2, equal
    final_input = X[:, [7, 6, 3]]

    return logreg_model.predict(final_input)

In [32]:
def test_dt(comment,duration,statuscode):
    """Predict the label of one comment with the decision-tree model.

    Builds the 11-column feature row, scales it with the module-level
    ``scaler`` and passes the three columns this model uses to
    ``dt_model.predict``.

    Args:
        comment: Comment text to classify.
        duration: Span duration as a number or numeric string (the consumer
            loop passes the regex-extracted string).
        statuscode: HTTP status code of the request.

    Returns:
        The result of ``dt_model.predict`` for the single-row input.

    Raises:
        TypeError, ValueError: If ``duration`` or ``statuscode`` cannot be
            converted to ``float``.
    """
    lowcomment = lowercase_convert(comment)

    # Column layout handed to the scaler:
    # 0 duration, 1 statuscode, 2 constant 0 (placeholder column; presumably
    # required by the fitted scaler - confirm), 3 equal, 4 openbracket,
    # 5 closebracket, 6 openbracket2, 7 closebracket2, 8 intcount, 9 div, 10 at
    features = [
        float(duration),
        float(statuscode),
        0.0,
        count_equal(lowcomment),
        count_openbracket(lowcomment),
        count_closebracket(lowcomment),
        count_openbracket2(lowcomment),
        count_closebracket2(lowcomment),
        count_int(lowcomment),
        count_div(lowcomment),
        count_at(lowcomment),
    ]
    # Explicit float dtype: a string duration would otherwise turn the whole
    # row into a string array and rely on implicit conversion in the scaler.
    data_input = np.array(features, dtype=float).reshape(1, 11)

    X = np.asarray(scaler.transform(data_input))
    # duration, closebracket2, equal
    final_input = X[:, [0, 7, 3]]

    return dt_model.predict(final_input)

In [33]:
def test_knn(comment,duration,statuscode):
    """Predict the label of one comment with the k-nearest-neighbours model.

    Builds the 11-column feature row, scales it with the module-level
    ``scaler`` and passes the three columns this model uses to
    ``knn_model.predict``.

    Args:
        comment: Comment text to classify.
        duration: Span duration as a number or numeric string (the consumer
            loop passes the regex-extracted string).
        statuscode: HTTP status code of the request.

    Returns:
        The result of ``knn_model.predict`` for the single-row input.

    Raises:
        TypeError, ValueError: If ``duration`` or ``statuscode`` cannot be
            converted to ``float``.
    """
    lowcomment = lowercase_convert(comment)

    # Column layout handed to the scaler:
    # 0 duration, 1 statuscode, 2 constant 0 (placeholder column; presumably
    # required by the fitted scaler - confirm), 3 equal, 4 openbracket,
    # 5 closebracket, 6 openbracket2, 7 closebracket2, 8 intcount, 9 div, 10 at
    features = [
        float(duration),
        float(statuscode),
        0.0,
        count_equal(lowcomment),
        count_openbracket(lowcomment),
        count_closebracket(lowcomment),
        count_openbracket2(lowcomment),
        count_closebracket2(lowcomment),
        count_int(lowcomment),
        count_div(lowcomment),
        count_at(lowcomment),
    ]
    # Explicit float dtype: a string duration would otherwise turn the whole
    # row into a string array and rely on implicit conversion in the scaler.
    data_input = np.array(features, dtype=float).reshape(1, 11)

    X = np.asarray(scaler.transform(data_input))
    # closebracket2, at, equal
    final_input = X[:, [7, 10, 3]]

    return knn_model.predict(final_input)

In [34]:
#Function to determine most common value in an array
def most_frequent(List):
    """Return the value that occurs most often in ``List``.

    When several values share the highest count, the one that appears
    first in ``List`` is returned. An empty ``List`` raises ``IndexError``.
    """
    # Touch the first element so an empty sequence fails with IndexError.
    first_item = List[0]

    # max() keeps the earliest element among equal counts.
    return max(List, key=List.count)

In [35]:
# Running number of the comments classified so far (used in the printed log).
counter = 1

# Consume spans, classify every comment posted through the frontend with all
# five models, take the majority vote and append the result to the CSV log.
for message in consumer:
    # message.value is already a parsed object here: the consumer's
    # value_deserializer decodes the raw bytes and parses the JSON.
    span = message.value

    if span['process']['serviceName'] != 'frontend':
        continue

    span_text = str(span)
    http_method = re.findall(r"{'key': 'http\.method', 'vStr': '(.+?)'}",span_text)
    http_target = re.findall(r"{'key': 'http\.target', 'vStr': '(.+?)'}",span_text)
    http_status_code = re.findall(r"{'key': 'http\.status_code', 'vType': '.*?', 'vInt64': '(\d+)'}",span_text)
    http_url = re.findall(r"{'key': 'http\.url', 'vStr': '(.+?)'}",span_text)
    duration = re.findall(r"(.+?)s",str(span['duration']))

    # findall returns lists; keep the first match of each. A field without a
    # match stays an empty list and fails the checks below.
    if len(http_method) > 0:
        http_method = http_method[0]
    if len(http_target) > 0:
        http_target = http_target[0]
    if len(http_url) > 0:
        http_url = http_url[0]
    if len(http_status_code) > 0:
        http_status_code = http_status_code[0]
    if len(duration) > 0:
        duration = duration[0]

    # Only POST requests whose URL contains "comment" carry a comment to classify.
    if http_method != 'POST' or "comment" not in http_url:
        continue

    if not isinstance(duration, str):
        # No duration could be extracted; the models need a numeric value.
        print("Skipping span: could not parse duration from", span['duration'])
        continue

    # Extract the comment text from the request body logged on the span.
    comment = ""
    logs = span.get('logs') or []
    if logs:
        fields = logs[0].get('fields', [])
        description1 = re.findall(r"{'key': 'body', 'vStr': '(.+?)'}",str(fields))
        if len(description1) > 0:
            temp = re.findall(r'"description":"(.+?)","_links":',str(description1[0]))
            if len(temp) > 0:
                comment = temp[0]

    if not comment:
        # Nothing to classify; the character-ratio features are undefined
        # for an empty comment.
        print("Skipping span: no comment text found in request body")
        continue

    comment_orig = comment

    # PREDICTION
    # The status code is fixed to 201 here, as in the original pipeline.
    data_predxgb = test_xgb(comment_orig,duration,201)
    data_predsvm = test_svm(comment_orig,duration,201)
    data_predlogreg = test_logreg(comment_orig,duration,201)
    data_preddt = test_dt(comment_orig,duration,201)
    data_predknn = test_knn(comment_orig,duration,201)

    # COMPARING PREDICTIONS
    all_predictions = [data_predxgb[0],data_predsvm[0],data_predlogreg[0],data_preddt[0],data_predknn[0]]
    print(all_predictions)
    resultvalue = most_frequent(all_predictions)

    # DISPLAY RESULTS
    if (resultvalue==0.0):
        result = " - Normal Comment"
    else:
        result = " - Anomalous Comment"

    # Take one timestamp so date and time cannot straddle midnight.
    now = datetime.now()
    date = now.strftime('%Y-%m-%d')
    time = now.strftime('%H:%M:%S')
    print(counter, date, time, duration, comment_orig, result)

    log_prediction = [date,time,duration,comment_orig,result]
    # Open the log only when there is a row to write, and close it right
    # away so every prediction is flushed to disk and no handle is leaked.
    with open('log_prediction_vr.csv', 'a', encoding='utf-8', newline='') as f1:
        writer = csv.writer(f1)
        writer.writerow(log_prediction)
    counter = counter + 1


[0.0, 0.0, 0.0, 0.0, 0.0]
1 2021-09-22 14:22:05 0.142700 I love this movie. I will watch it again later!  - Normal Comment
[1.0, 1.0, 0.0, 1.0, 1.0]
2 2021-09-22 14:22:35 0.038200100 <br>@keyframes x{}</br><title style=\\\\\\\\\\"\\"animation-name:y\\\\\\\\\\"\\" onanimationend=\\\\\\\\\\"\\"alert(111)\\\\\\\\\\"\\"></br>  - Anomalous Comment
