# Parse manager.log file
This Jupyter Notebook allows parsing log files, looking for similar log messages over time and generating a plot of key name/labels that vary per log entry.  This generates a scatter plot.

Hopefully this can be used as a template for further log file visualization.


In [6]:
!which python3.8
!python3.8 -m pip install pandas
!python3.8 -m pip install plotly

/usr/bin/python3.8
Collecting plotly
  Downloading plotly-4.12.0-py2.py3-none-any.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 11.1 MB/s 
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11430 sha256=5a0eeecfbc2d3d58722c7863960355ca1c85dbad35d1d2cb39ef16c510d4b54d
  Stored in directory: /home/alpiepho/.cache/pip/wheels/c4/a7/48/0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.12.0 retrying-1.3.3


In [7]:
# convert log file to pandas dataframe
import pandas as pd
from io import StringIO

filename = 'manager.log'

# 2020-11-03 21:15:03,856 - WARNING - 43883 - Stats Poll: MRUTDNTLDNS002: A failure in statistics polling resulted in NO DATA being returned for:
# 0          1              3                             9
pattern = 'A failure in statistics polling resulted in NO DATA'
pattern_filter_pre = 'Stats Poll: '
pattern_filter_post = ': A failure in statistics polling resulted in NO DATA being returned for:'
# use python loop to filter matching lines
text = []
with open(filename) as f:
    for line in f:
        if pattern in line:            
            text.append(line)
text = "".join(text)

# use pandas to parse matching lines
df = pd.read_csv(StringIO(text), sep=" - ", names=['Time','Level','PID', 'Text'], engine='python')
df.Time = pd.to_datetime(df.Time, format='%Y-%m-%d %H:%M:%S,%f')
df.Text = df.Text.str.replace(pattern_filter_pre, '')
df.Text = df.Text.str.replace(pattern_filter_post, '')
#DEBUG
# print(len(text))
# print (df.dtypes)
# df.info()

In [8]:
# find unique list of name/labels
all_labels = df.Text.to_list()
all_labels = list(set(labels))
all_labels.sort()
for label in all_labels:
    print(label)
print(len(all_labels))

NameError: name 'labels' is not defined

In [9]:
# adjust labels to view
select_labels = all_labels
# select_labels = { \
#     'AGRALHMDDNS004', \
#     'AGRALHMDDNS005', \
#     'AGRALHMDDNS006', \
#     'AMBAPMNRDNS007', \
#     'AMBAPMNRDNS009', \
#     'AMBDAMDBDNS007', \
#     'AMBDAMDBDNS008', \
#     'AMBDAMDBDNS009', \
#     'AMBDAMDBDNS010', \
#     'AMBDJSBWDNS001', \
#     'AMBDJSBWDNS002', \
#     'AMBDJSBWDNS003',
# }
for label in select_labels:
    print(label)
print(len(all_labels))

NVMBRCPGDNS025
CHNNTFKRDNS007
AMBDAMDBDNS009
CHNNTFKRDNS007
BHPLSVDHDNS001
LDHNCHPTDNS002
MRUTDNTLDNS002
CHNNTFKRDNS007
NOIDNDFDDNS018
CHNNTFKRDNS007
MRUTDNTLDNS002
MRUTDNTLDNS002
CHNNTFKRDNS007
PTNARKPRDNS005
KNPRCKDTDNS001
LDHNCHPTDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
MDURPDNNDNS001
KNPRCKDTDNS003
CHNNTFKRDNS007
PTNARJPLDNS001
CHNNTFKRDNS007
MRUTDNTLDNS003
MRUTDNTLDNS003
CHNNTFKRDNS007
BHPLSVDHDNS002
MRUTDNTLDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
AMBDJSBWDNS003
MRUTDNTLDNS003
MDURPDNNDNS002
CHNNTFKRDNS007
PTNARKPRDNS007
PTNARJPLDNS004
NGPRSNGNDNS008
BHPLSVDHDNS001
MRUTDNTLDNS003
CHNNTFKRDNS007
BBSRBDGLDNS005
MOUDRAHDDNS003
MRUTDNTLDNS003
CHNNTFKRDNS007
DLHIVKDEDNS008
MRUTDNTLDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
DLHIVKDEDNS009
AMBDAMDBDNS008
MRUTDNTLDNS003
KNPRCKDTDNS002
CHNNTFKRDNS007
AMBDAMDBDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
MRUTDNTLDNS003
CHNNTFKRDNS007
PTNARKPRDN

In [10]:
# filter only labels
df2 = df[df['Text'].isin(select_labels)]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 336 entries, 0 to 335
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Time    336 non-null    datetime64[ns]
 1   Level   336 non-null    object        
 2   PID     336 non-null    int64         
 3   Text    336 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 13.1+ KB


In [11]:
# set plot x and y series
xdata = df2['Time']
ydata = df2['Text']


In [70]:
# plot the data
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=xdata,
    y=ydata,
    marker=dict(color="crimson", size=5),
    mode="markers",
    name="Server",
))

fig.update_layout(title="Log Pattern\n '%s'" % pattern,
                  xaxis_title="Date",
                  yaxis_title="Server",
                  width=1000,
                  height=800)

fig.show()