In [1]:
from kafka import KafkaConsumer, KafkaProducer
import json
import re
from datetime import datetime
import nltk
import pickle
from nltk.corpus import stopwords
import csv
import numpy as np

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def _decode_span(raw):
    """Deserialize one raw Kafka payload into a Python object.

    The bytes are decoded as ASCII with errors ignored, so any byte that is
    not valid ASCII is silently dropped before the JSON is parsed.
    """
    # NOTE(review): JSON payloads are normally UTF-8; dropping non-ASCII bytes
    # looks deliberate here, so it is preserved -- confirm before changing.
    text = raw.decode('ascii', 'ignore')
    return json.loads(text)


# Consumer for the local broker; reads from the earliest available offset
# when no committed offset exists.
consumer = KafkaConsumer(
    value_deserializer=_decode_span,
    auto_offset_reset='earliest',
    bootstrap_servers='localhost:9092',
)

In [3]:
# Attach the consumer to the topic that carries the Jaeger span messages.
consumer.subscribe(topics=['jaeger-spans'])

In [4]:
import json
from json import JSONEncoder
import numpy

class NumpyArrayEncoder(JSONEncoder):
    """JSON encoder that can serialize NumPy arrays and NumPy scalars.

    Use it through the ``cls`` argument, e.g.
    ``json.dumps(payload, cls=NumpyArrayEncoder)``.
    """

    def default(self, obj):
        """Convert NumPy values to plain Python objects.

        Args:
            obj: The object the standard encoder could not serialize.

        Returns:
            A nested list for ``numpy.ndarray`` input, or the equivalent
            built-in Python scalar for NumPy scalar input.

        Raises:
            TypeError: For any other unsupported type (raised by the base
                class implementation).
        """
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        # NumPy scalars (e.g. the np.int64 produced by np.argmax) are not
        # subclasses of the built-in numeric types, so json rejects them.
        if isinstance(obj, numpy.generic):
            return obj.item()
        return super().default(obj)


In [5]:
# Build the tokenizer vocabulary from the labelled training CSV.
#
# The cleaning steps below must stay identical to the ones applied to live
# comments in the streaming prediction cell, otherwise the token ids fed to
# the model will not line up with the fitted vocabulary.
nltk.download('stopwords')
stop = stopwords.words('english')
# Set copy for O(1) membership tests; `stop` itself is left as a list because
# the streaming prediction cell also reads it.
stop_lookup = frozenset(stop)

num_of_comments = 1000  # upper bound on the number of rows used
comments = []
durations = []
labels = []

# `with` closes the file even if a row fails to parse; newline='' is what the
# csv module expects so quoted fields with embedded newlines are read intact.
with open('final_log_data.csv', 'r', newline='') as f:
    csv_reader = csv.reader(f)
    next(csv_reader, None)  # skip the headers

    for row in csv_reader:
        if len(comments) >= num_of_comments:
            break

        # csv.reader yields [] for blank lines; skip anything too short to
        # hold the columns read below (indices 0, 2 and 3).
        if len(row) < 4:
            continue

        duration = row[0]
        comment = row[2]
        label = row[3]

        # Strip line breaks and HTML line-break tags.
        comment = re.sub(r'\n', '', comment)
        comment = re.sub(r'<br />', '', comment)

        # Converting to Lowercase
        comment = comment.lower()

        # Drop @mentions, non-alphanumeric characters, URLs and digits.
        comment = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?', '', comment)
        comment = re.sub(r'\d+', '', comment)

        # remove stop words
        comment = ' '.join([word for word in comment.split() if word not in stop_lookup])

        comments.append(comment)
        durations.append(duration)
        labels.append(label)

# Fit the vocabulary on the cleaned comments and build padded sequences.
token = Tokenizer()
token.fit_on_texts(np.array(comments))

comments_seq = token.texts_to_sequences(np.array(comments))
comments_pad_seq = pad_sequences(comments_seq, maxlen=300)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vmadmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Consume Jaeger spans, pick out comment POSTs made through the frontend
# service, classify the comment text with the LSTM model and append the
# verdict to log_prediction.csv.
counter = 1

classifier = keras.models.load_model('lstm_model.h5')


def _first_match(pattern, text, default=''):
    """Return the first captured group of `pattern` in `text`, or `default`."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def _clean_comment(text):
    """Normalise a raw comment with the same steps as the training-data cell."""
    text = re.sub(r'\n', '', text)
    text = re.sub(r'<br />', '', text)

    # Converting to Lowercase
    text = text.lower()

    text = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?', '', text)
    text = re.sub(r'\d+', '', text)

    # remove stop words
    return ' '.join([word for word in text.split() if word not in (stop)])


for message in consumer:
    span = message.value

    if span['process']['serviceName'] != 'frontend':
        continue

    # The tag values are pulled out of the span's repr with regexes, so the
    # patterns depend on the exact key order of the deserialized dicts.
    span_text = str(span)
    http_method = _first_match(r"{'key': 'http\.method', 'vStr': '(.+?)'}", span_text)
    http_target = _first_match(r"{'key': 'http\.target', 'vStr': '(.+?)'}", span_text)
    http_status_code = _first_match(r"{'key': 'http\.status_code', 'vType': '.*?', 'vInt64': '(\d+)'}", span_text)
    http_url = _first_match(r"{'key': 'http\.url', 'vStr': '(.+?)'}", span_text)
    # Everything before the first "s" of the duration value (e.g. "0.4037s").
    duration = _first_match(r"(.+?)s", str(span['duration']))

    if http_method != 'POST' or "comment" not in http_url:
        continue

    # Reset per message so a span without a parsable body can neither raise
    # NameError nor reuse the ids of the previous comment.
    comment_orig = ""
    user_id = None
    movie_id = None

    # Spans may carry no logs at all; treat that as "no comment body".
    logs = span.get('logs') or []
    if logs:
        fields = logs[0].get('fields')
        body = _first_match(r"{'key': 'body', 'vStr': '(.+?)'}", str(fields), default=None)
        if body is not None:
            comment_orig = _first_match(r'"description":"(.+?)","_links":', body)
            user_id = _first_match(r'"userId":(.+?),"movieId":', body, default=None)
            movie_id = _first_match(r'"movieId":(.+?),"description":', body, default=None)

    # PREDICTION
    comment = _clean_comment(comment_orig)
    comment_seq = token.texts_to_sequences(np.array(comment).reshape(-1))
    comment_pad_seq = pad_sequences(comment_seq, maxlen=300)

    # Only the padded token sequence is fed to the model; the duration is
    # logged but not used as a feature.
    data = comment_pad_seq
    data_prob = classifier.predict(data)
    data_pred = np.argmax(data_prob, axis=1)
    result = data_pred[0]

    # Take one timestamp so date and time cannot straddle a boundary.
    now = datetime.now()
    date = now.strftime('%Y-%m-%d')
    time = now.strftime('%H:%M:%S')

    # NOTE(review): this payload is assembled but never sent anywhere in this
    # cell -- presumably meant for a producer; confirm or remove.
    numpyData = {"array": data,"comment_date": date,'comment':comment,'spanId':span['spanId'],'userId':user_id,'movieId':movie_id}

    print(counter, date, time, duration, comment_orig, result)

    if result == 0:
        print("This looks safe")
    elif result == 1:
        print("This can be Anomalous")

    # Open, write and close per prediction so every row is flushed to disk and
    # no file handle is leaked; newline='' avoids blank rows on Windows.
    with open('log_prediction.csv', 'a', encoding='utf-8', newline='') as f1:
        csv.writer(f1).writerow([date, time, duration, comment_orig, result])

    counter = counter + 1


1 2022-02-02 18:08:27 0.888699900 In my opinion this is Marvel\'s most epic and darkest entry in it\'s long line of superhero movies. From its brilliantly choreographed, heart-pounding action scenes to it\'s hilarious character interactions Avengers infinity war will satisfy any superhero enthusiast. The part that really stood out for mw is how Russo Brothers portrayed Thanos - a menacing but genius villian with movie that eaves us philosophying about our existense. A must watch! 0
This looks safe
2 2022-02-02 18:08:27 0.403700 select * from movie_db; 1
This can be Anomalous
