# Import all Libraries

In [None]:
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

# Connect to Elasticsearch

In [None]:
# Create the Elasticsearch client instance used by the rest of the notebook.
# It is pointed at localhost:9200; no other connection options are passed.
# NOTE(review): the `host=`/`port=` keyword form is presumably tied to the
# 7.x client API — confirm against the installed elasticsearch-py version.
es = Elasticsearch(host='localhost', port=9200)

# Function to fetch the IDs and their respective messages

In [None]:
def get_data_from_elastic():
    """Fetch all documents from the indices matching ``*.27``.

    Runs the ``scan`` helper with the module-level client ``es`` and an
    empty query body, then splits every hit into its ``_id`` and its
    ``_source`` payload.

    Returns:
        tuple: ``(ids, df)`` where ``ids`` is the list of ``_id`` values and
        ``df`` is a ``pandas.DataFrame`` with one row per hit built from the
        ``_source`` dicts, in the same order as ``ids``.
    """
    # query: The elasticsearch query. It is left empty, so no filter is
    # applied by this function.
    query = {}

    # Scan function to get all the data.
    hits = scan(
        client=es,
        query=query,
        scroll='2m',
        index='*.27',
        raise_on_error=True,
        preserve_order=False,
        clear_scroll=True,
    )

    # Consume the hits in a single pass. Only '_id' and '_source' are kept,
    # so the full hit objects never have to be held in memory together.
    ids = []
    sources = []
    for hit in hits:
        ids.append(hit["_id"])
        sources.append(hit["_source"])

    # Create a dataframe from the '_source' dicts, which have all the fields
    # required.
    return (ids, pd.DataFrame(sources))

In [None]:
# Run the fetch once and split the returned pair into ids and DataFrame.
lol = get_data_from_elastic()
ids, df = lol

In [None]:
# Print every fetched document id on a single line, separated by spaces.
print(*ids)

In [None]:
# Preview up to the first 1000 rows of the DataFrame.
df.head(1000)

In [None]:
# Message example: the entry labelled 1 in the 'message' column.
df['message'][1]

In [None]:
# Pull the 'message' column out of the DataFrame as a plain list for text
# cleaning, tokenization, etc., and echo each message.
msg_list = list(df['message'])
for message in msg_list:
    print(message)

In [None]:
# Replace the '\220' character with '>' in every message.
# str.replace returns a new string and leaves the original untouched, so the
# results have to be stored back; calling it and discarding the return value
# changes nothing.
msg_list = [msg.replace('\220', '>') for msg in msg_list]

# Text Cleaning

In [None]:
import re
import html

# Regular expressions used to clean the message data. They are compiled once
# here instead of on every iteration and are applied in the listed order;
# each match is replaced by the empty string.
cleaning_patterns = [
    # Hyperlinks.
    re.compile(r'https?:\/\/.\S+'),
    # Date and time stamps.
    re.compile(r'(\S\S \S\S\S )?\S\S\S( )? (\d)?\d \d\d:\d\d:\d\d (\d\d\d\d)?'),
    # Email ids anywhere in the message. The pattern must not be anchored
    # with ^...$: anchored, it only matches when the whole message is a
    # single email address.
    re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'),
    # Runs of three or more dots ("..."). Every dot is matched literally;
    # an unescaped '.' matches any character and would swallow the rest of
    # the line after the first dot.
    re.compile(r'\.{3,}'),
    # The � character, together with the non-space character before it.
    re.compile(r'\S\�+'),
    # Substrings of the %abc42 type.
    re.compile(r'%\w+'),
    # The word combo.
    re.compile('combo'),
    # Blank lines.
    re.compile(r'\n\s*\r '),
    # Punctuation.
    re.compile(r'[^\w\s]'),
    # Digits.
    re.compile(r'\d+'),
]

clean_msg_list = []
for msg in msg_list:
    # Decode HTML entities (e.g. &amp;) first: once punctuation has been
    # stripped the '&' and ';' are gone and the entity can no longer be
    # recognised.
    msg = html.unescape(msg)
    for pattern in cleaning_patterns:
        msg = pattern.sub('', msg)
    clean_msg_list.append(msg)

In [None]:
# Display the cleaned messages.
clean_msg_list

# Tokenization

In [None]:
#Word tokenization
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize


# NLTK's English stop words, extended with tokens specific to these logs.
stop_words = list(set(stopwords.words('english')))
stop_words = stop_words+['logname', 'uid','eth','euid','PCI','node','Privacy','IRQ','ttyNODEVssh', 'ruser','rhostjwhp5','IPv','tables','Microcode', 'Driver','Hash', 'interface','family','httpd','info', 'mice','check','pass','cache','spamd','bytes','syslog','klogd','BIOS','protocol','NET','md','arrays','filesystem','CPU','user', 'cyrus','dev', 'type', 'ext3', 'uses','others','internal', 'service','already','floppy','syslogd','ip_tables','startup','use','NET', 'Registered', 'Revision','CDROM', 'drive','hda']

# Testing membership against the list scans it for every token; a frozenset
# gives the same answer with a constant-time lookup.
stop_word_set = frozenset(stop_words)

# Tokenize each cleaned message and drop the stop words. The comparison is
# case-sensitive: tokens are not lower-cased before the lookup.
new_Doc = [
    [token for token in word_tokenize(msg) if token not in stop_word_set]
    for msg in clean_msg_list
]
print(new_Doc)

# More research on domain stop words.

# Lemmatization

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer as wl
wordnet = wl()

# Lemmatize every token while keeping the one-list-per-message grouping.
doc = [[wordnet.lemmatize(token) for token in tokens] for tokens in new_Doc]
print(doc)

# Re-Formation/DeTokenization

In [None]:
# Install the sacremoses package imported by the detokenization cell.
# NOTE(review): this line is not Python syntax; it presumably relies on the
# notebook front end treating `pip` as a magic command — confirm it runs in
# the target environment.
pip install sacremoses

In [None]:
from sacremoses import MosesDetokenizer

# Build the detokenizer once and reuse it: constructing a new instance for
# every message repeats the same setup work without changing the output.
detokenizer = MosesDetokenizer()

# Join each token list back into a single string.
parsed_logs = [detokenizer.detokenize(words, return_str=True) for words in doc]

In [None]:
# Display the detokenized messages.
parsed_logs

In [None]:
# Print the Python type of the entry labelled 1 in the 'agent' column.
print(type(df['agent'][1]))

In [None]:
import csv

# Export id, cleaned message, @version and @timestamp as comma-separated rows.
# - Mode "w" creates the file when it does not exist and truncates any old
#   content; "r+" fails on a missing file and leaves stale bytes behind
#   whenever the new content is shorter than the old one.
# - The with-block closes the file even if a row fails to write.
# - csv.writer quotes fields that contain commas or quotes and converts
#   non-string values, so a row can no longer be corrupted or raise a
#   TypeError during string concatenation.
# - lineterminator="\n" in text mode keeps the same line endings the plain
#   "\n" writes produced before.
with open(r"C:/Users/akesar01/Documents/Logs/ParsedLogs.txt", "w", encoding="utf-8") as textfile:
    writer = csv.writer(textfile, lineterminator="\n")
    writer.writerow(['id', 'message', '@version', '@timestamp'])
    writer.writerows(zip(ids, parsed_logs, df['@version'], df['@timestamp']))

In [None]:
# Ask the client for the server's info response and display it.
es.info()

In [None]:
# Fetch the document whose id is the first fetched id from the hard-coded
# index "filebeat-7.16.3-2022.03.10" and print its '_source'.
# NOTE(review): assumes ids[0] belongs to that index — confirm.
resp = es.get(index="filebeat-7.16.3-2022.03.10", id=ids[0])
print(resp['_source'])

In [None]:
# Display the first detokenized message.
parsed_logs[0]

In [None]:
# Fetch one document by a hard-coded id from the "gamma_2" index and print
# its '_source'.
resp = es.get(index="gamma_2", id= 'Vy7ajH8BLOYn9AgzScoh')
print(resp['_source'])

# TFIDF Differentiation


### The purpose of TF-IDF is to highlight words that are frequent within a document but rare across documents

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The detokenized messages are the input documents for the vectorizer.
text = parsed_logs

In [None]:
# Create the TF-IDF vectorizer with its default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the documents and transform them in one step.
result = vectorizer.fit_transform(text)

In [None]:
# Print the fitted vectorizer's idf_ attribute.
print(vectorizer.idf_)

In [None]:
# Print the fitted vectorizer's vocabulary_ mapping.
print(vectorizer.vocabulary_)

In [None]:
# Turn the vocabulary mapping into a list of (key, value) pairs.
lol = list(vectorizer.vocabulary_.items())

In [None]:
# Display the (key, value) pairs ordered by their second element.
sorted(lol,key = lambda x: x[1])

In [None]:
# Convert the result to a dense array and print it row by row, with every
# value followed by a single space.
arr = result.toarray()
for row in arr:
    for value in row:
        print(value, end=" ")
    print()

In [None]:
# Print the dense form of the transformed documents in one call.
print(result.toarray())

In [None]:
# summarize encoded vector
# No `vector` variable is defined anywhere in this notebook, so referring to
# it raises NameError; the encoded documents produced by fit_transform are
# held in `result`.
# NOTE(review): if a single-document encoding was intended here, define it
# explicitly before printing.
print(result.toarray())

In [None]:
print('\nidf values:')
# get_feature_names() is deprecated and no longer exists in recent
# scikit-learn releases; prefer get_feature_names_out() and fall back to the
# old name only on versions that do not have it yet.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# Pair each term with its idf weight and list them from lowest to highest.
idf_by_term = zip(get_names(), vectorizer.idf_)
for term, idf in sorted(idf_by_term, key=lambda pair: pair[1]):
    print(term, ':', idf)