In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  7 11:01:20 2022

@author: d-carter.fornwalt
"""

import pandas as pd
from pandas import json_normalize
import json
import os
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure


        
def gen_wordcloud(sub_compile, f):
    '''
    param:
        sub_compile - dataframe exposing a ``log`` column; every value is
                      converted to text and joined into one corpus
        f - name used for the output folder and as the file name prefix
    return: None

    Builds a word cloud (at most 20 words, default STOPWORDS removed) from the
    joined log text, saves it to images/<f>/<f>_wordcloud.png and then shows
    the figure.
    '''
    text = " ".join(sub_compile.log.astype(str))
    # generate word cloud
    wordcloud = WordCloud(
        stopwords=STOPWORDS,
        collocations=True,
        max_font_size=50,
        max_words=20,
        background_color="white",
    ).generate(text)
    # use wordcloud.words_ to inspect relative word frequencies
    plt.imshow(wordcloud)
    plt.axis("off")

    out_path = 'images/'+f+'/'+f+'_wordcloud.png'
    # savefig does not create missing folders, so make sure the target exists
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # save before show: some backends discard the canvas once it is displayed
    plt.savefig(out_path, format="png")
    plt.show()

# Create breakdown of log counts by container and error type
def gen_log_counts(fin, log_types):
    '''
    param:
        fin - dataframe with 'container_name' and 'log_class' columns
        log_types - sequence of log class labels to count for every container
    return:
        dataframe with one row per (container, log class) pair and the columns
        container, container_total, log_class, log_class_freq

    Containers appear in order of first occurrence in fin, and within each
    container the classes follow the order of log_types. A class that never
    occurs for a container still gets a row with a frequency of 0.
    '''
    columns = ['container', 'container_total', 'log_class', 'log_class_freq']
    names = fin['container_name']

    rows = []
    for container in names.unique():
        in_container = names == container
        total = int(in_container.sum())
        classes = fin.loc[in_container, 'log_class']
        for log_type in log_types:
            freq = int((classes == log_type).sum())
            rows.append((container, total, log_type, freq))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    # Every row keeps index label 0, exactly as the former append-in-a-loop
    # produced, so per-class slices of the result still share identical
    # indexes and can be added element by element.
    return pd.DataFrame(rows, columns=columns, index=[0] * len(rows))


def plot_log_counts(nums, f):
    '''
    param:
        nums - dataframe with the columns container, container_total,
               log_class and log_class_freq (one row per container/class pair)
        f - name used for the output folder and file prefix; the chart title
            shows f without its last 11 characters
    return: the string 'PNG of Chart Saved'

    Draws one stacked bar per container, stacked in the order Error, Other,
    Health Check, Infrastructure, Failure, saves the chart to
    images/<f>/<f>_barchart.png and then shows it.

    NOTE(review): every one of the five classes is expected to have exactly
    one row per container in nums; otherwise the bar heights will not line up
    with the container list.
    '''
    containers = nums['container'].unique()
    # bottom-to-top stacking order; the same labels are used for the legend
    stack_order = ["Error", "Other", "Health Check", "Infrastructure", "Failure"]

    # Define width of stacked chart and dimensions
    bar_width = 0.6
    fig1 = figure(figsize=(9, 6))

    # Plot stacked bar chart. Plain arrays are used so the running bottom is
    # summed position by position instead of depending on index alignment.
    bottom = np.zeros(len(containers))
    for log_class in stack_order:
        heights = nums.loc[nums['log_class'] == log_class, 'log_class_freq'].to_numpy(dtype=float)
        plt.bar(containers, heights, bar_width, bottom=bottom)
        bottom = bottom + heights

    # Display
    plt.xlabel("Container Names")
    title = 'Log Classification Spread for Logs Dated: '+f[:-11]
    plt.title(title)
    plt.ylabel("Count of Unique Logs")
    plt.legend(stack_order)
    # leave 10% headroom above the tallest container
    tallest = nums['container_total'].max()
    plt.ylim(0, tallest+(tallest*0.1))

    out_path = 'images/'+f+'/'+f+'_barchart.png'
    # savefig does not create missing folders, so make sure the target exists
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # save before show so the file is written regardless of what the backend
    # does with the figure once it has been displayed
    fig1.savefig(out_path)
    plt.show()

    return('PNG of Chart Saved')


def label_log(fin, f):
    '''
    param:
        fin - dataframe with a 'log' column holding the log messages
        f - source file name; the output folder is 'analyzed/' plus f without
            its last 5 characters
    return:
        copy of fin with an added 'log_class' column

    Each message is lower-cased and matched against keywords in priority
    order: 'error' -> Error, 'fail' -> Failure, 'health' -> Health Check,
    'datacenter' -> Infrastructure, anything else -> Other. The labelled
    frame is also written to <output folder>/<f>_analyzed.csv.
    '''
    path = 'analyzed/'+f[:-5]
    os.makedirs(path, exist_ok=True)

    # first matching keyword wins, so the order here is the priority order
    rules = (
        ('error', 'Error'),
        ('fail', 'Failure'),
        ('health', 'Health Check'),
        ('datacenter', 'Infrastructure'),
    )

    def classify(message):
        # Work on the cell value itself. Stringifying a whole pandas row or
        # Series yields its printed form, which shortens long values, so
        # keywords late in a long message would be missed.
        text = str(message).lower()
        for keyword, label in rules:
            if keyword in text:
                return label
        return 'Other'

    # an empty frame gets an empty label column without touching 'log'
    labels = [classify(message) for message in fin['log']] if len(fin) else []
    clone = fin.copy()
    clone['log_class'] = labels

    clone.to_csv(path+'/'+f+'_analyzed.csv')
    return(clone)

def _pop_nested_section(record, section, log_id, cores, subs):
    '''
    Helper for convertJsontoDataframe.

    If record has a `section` key, remove it from record, tag a copy of that
    object with log_id and append it to cores. Every dict found directly
    inside the section is copied, tagged with log_id as well, and appended to
    subs. Copies are used so the caller's original json objects stay intact.
    '''
    if section not in record:
        return
    core = dict(record.pop(section))
    core['_id'] = log_id
    # look for additional json objects one level down
    for key, value in list(core.items()):
        if isinstance(value, dict):
            nested = dict(value)
            nested['_id'] = log_id
            core[key] = nested
            subs.append(nested)
    cores.append(core)


def convertJsontoDataframe(df):
    '''
    param: df - dataframe of detail columns and one large complex json
    return: dataframe with all nested json objects translated to columns

    Processes the input dataframe, which consists of a few fields and one
    large dictionary per row ('_source', paired with '_id'; when there is no
    '_source' column both are read from df['hits'] instead). The 'docker' and
    'kubernetes' objects and the dicts nested directly inside them are split
    out and handed, together with the remaining fields, to compileDataframes.
    This was written using the available data, and some hardcoded fields may
    need to be changed in the future.

    The json objects stored in df are not modified. If anything goes wrong,
    'gave up' is printed and the original exception is re-raised.
    '''
    try:
        print(len(df))
        print(df.columns)
        # skeletons
        docks = []
        dock_sub = []
        kubes = []
        kube_sub = []
        remainder = []
        if '_source' in df.columns:
            source = zip(list(df['_id']), list(df['_source']))
        else:
            source = zip(list(df['hits']['_id']), list(df['hits']['_source']))

        for log_id, raw in source:
            # work on a copy so popping sections does not alter the input
            record = dict(raw)
            record['_id'] = log_id
            _pop_nested_section(record, 'docker', log_id, docks, dock_sub)
            _pop_nested_section(record, 'kubernetes', log_id, kubes, kube_sub)
            # whatever is left are the non-docker / non-kubernetes fields
            remainder.append(record)
        # run all lists through compiler
        final = compileDataframes(df, remainder, kubes, kube_sub, docks, dock_sub)
    except Exception:
        print('gave up')
        # surface the real failure instead of an unbound-variable error
        raise
    return(final)
    
    
def compileDataframes(df, remainder, kubes, kube_sub, docks, dock_sub):
    '''
    param:
        df - original dataframe object including complex json column
        remainder - non-json objects from complex json as a list
        kubes - core kubernetes json object as list
        kube_sub - nested objects from kubernetes object, processed separately
        docks - core docker json object as list
        dock_sub - nested objects from docker object, processed separately
    return:
        frankenstein - final dataframe with entire complex json organized as columns

    This is a supplementary function to convertJsontoDataframe, meant to combine
    all of the resulting dictionary to list translations. This converts lists
    consisting of dictionaries to dataframes and inner-merges them onto df one
    after another on the unique log identifier '_id'. The '_source', 'labels'
    and 'namespace_labels' columns are dropped from the result when present.
    '''
    remainder = pd.DataFrame(remainder)
    kubes = pd.DataFrame(kubes)
    kube_sub = pd.DataFrame(kube_sub)
    docks = pd.DataFrame(docks)
    dock_sub = pd.DataFrame(dock_sub)

    if len(kube_sub) != 0:
        # Several nested objects can share one '_id'; collapse them into a
        # single row holding the last non-null value of every column. This is
        # the same result as forward-filling inside each group and keeping its
        # last row, without relying on groupby.apply's version-dependent
        # output shape.
        kube_sub = kube_sub.groupby('_id', as_index=False, sort=False).last()
    # NOTE(review): dock_sub is not collapsed the same way, so several nested
    # docker objects for one '_id' would repeat that log's row - confirm
    # whether that is intended.

    frankenstein = df.merge(remainder, on='_id')
    # merge order matters for pandas' _x/_y suffixes on clashing column names
    for part in (kubes, docks, dock_sub, kube_sub):
        if len(part) != 0:
            frankenstein = frankenstein.merge(part, on='_id')

    leftovers = [
        column
        for column in ('_source', 'labels', 'namespace_labels')
        if column in frankenstein.columns
    ]
    frankenstein = frankenstein.drop(columns=leftovers)

    return(frankenstein)




