In [None]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#%matplotlib notebook
from matplotlib import font_manager, rc
import scipy.cluster.hierarchy as spc
# Use the NanumGothic font so that Korean labels render in matplotlib figures.
# NOTE(review): the absolute font path is machine-specific; presumably this line
# fails where the file is absent — confirm before running elsewhere.
font_name = font_manager.FontProperties(fname="/usr/share/fonts/nanum/nanumGothic.ttf").get_name()
plt.rc('font', family=font_name)
# NOTE(review): called without keyword settings, so no rc parameter is changed here.
plt.rc('style')
import warnings
# Silence every warning for the rest of the session (pandas deprecation notices included).
warnings.filterwarnings(action='ignore') 
# Show floats with two decimals and allow very long text cells in DataFrame output.
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth=3000

In [None]:
from IPython.utils.path import get_ipython_dir
# Print the IPython directory path reported for the current environment.
print(get_ipython_dir())

In [None]:
import os

import plotly.plotly as py
# NOTE(review): the wildcard import is kept because other cells may rely on the
# names it exports; prefer `import plotly.graph_objs as go` in new code.
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Credentials are read from the environment so that no API key is stored in the
# notebook. Export PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel.
# The key that used to be hard-coded here must be treated as leaked and rotated.
_plotly_user = os.environ.get('PLOTLY_USERNAME')
_plotly_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_user and _plotly_key:
    py.sign_in(_plotly_user, _plotly_key)
else:
    print('PLOTLY_USERNAME / PLOTLY_API_KEY are not set; skipping plotly sign-in '
          '(py.iplot calls will need it).')
init_notebook_mode(connected=True)

In [None]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in one notebook output.

    Args:
        *args: DataFrames (any object exposing ``to_html()``), shown left to right.
    """
    # Only opening <table> tags receive the inline style. A blanket replace of the
    # word "table" would also rewrite closing tags and any cell text containing it.
    html_str = ''.join(
        df.to_html().replace('<table', '<table style="display:inline"')
        for df in args
    )
    display_html(html_str, raw=True)

In [None]:
import os
# Sorted so the files are always loaded (and later concatenated) in the same order;
# os.listdir returns directory entries in arbitrary order.
file_list = sorted(os.listdir('../busan/DATA'))

## 로드 데이터

In [None]:
# Read every export file into its own DataFrame; they are concatenated afterwards.
# pandas renamed the "skip malformed rows" option in 1.3, so pick the keyword that
# the installed version understands. 'warn' matches the old behaviour of
# error_bad_lines=False (bad rows are skipped and reported).
_pandas_version = tuple(int(part) for part in pd.__version__.split('.')[:2])
if _pandas_version >= (1, 3):
    _bad_line_options = {'on_bad_lines': 'warn'}
else:
    _bad_line_options = {'error_bad_lines': False}

data_list = []
for i in file_list:
    print(i)
    data = pd.read_csv(
        os.path.join('../busan/DATA', i),
        encoding='UTF-8',
        sep=r"\|\|",      # fields are delimited by a literal "||"; raw string avoids invalid-escape warnings
        engine='python',  # regex separators are handled by the python engine; stated explicitly
        header=None,
        **_bad_line_options
    )
    data_list.append(data)


In [None]:
# ignore_index gives the combined frame a unique 0..n-1 index. Every file starts at
# row 0, so keeping the per-file labels would leave duplicates that can break later
# index-aligned column assignments.
data = pd.concat(data_list, axis=0, ignore_index=True)

## 1. 알람 데이터 로드 및 전처리

In [None]:
# Label the 15 positional columns (presumably the field names of the raw alarm export — confirm).
data.columns = ["ALARM_LEVEL","ALARM_NAME","CTIME","DTIME","HOSTNAME","PATH1","PATH2","PATH3","PATH4","PATH5","PATH6","PATH7","NAME","CONDITIONLOGTEXT","CURRENTALARMSTATUS"]

In [None]:
# Rename the columns to the short lower-case names used in the rest of the notebook.
data.columns = ['degree', 'alarm_name','time','clear_time','hostname','group1','group2','group3','group4','group5','group6','group7','resource_name','condition','status']

In [None]:
# Derive the duration and calendar columns from the alarm timestamps.
data['degree'] = data['degree'].astype(str)
data['time'] = pd.to_datetime(data['time'], format='%Y-%m-%d %H:%M:%S')
data['clear_time'] = pd.to_datetime(data['clear_time'], format='%Y-%m-%d %H:%M:%S')
# Alarm duration in seconds. total_seconds() keeps the day part of the interval;
# `.dt.seconds` only returns the seconds-within-a-day component, which truncated
# alarms lasting longer than 24 hours. Alarms without a clear time get 0.
data['last_time'] = (data['clear_time'] - data['time']).dt.total_seconds().fillna(0)
data['time_month'] = data['time'].dt.month
data['time_day'] = data['time'].dt.day
data['time_hour'] = data['time'].dt.hour
data['time_date'] = data['time'].dt.strftime('%Y-%m-%d')
data['time_date_hour'] = data['time'].dt.strftime('%Y-%m-%d %H')
data['time_weekday'] = data['time'].dt.weekday
# Series.dt.week was removed in pandas 2.0; isocalendar() (pandas >= 1.1) replaces it.
if hasattr(data['time'].dt, 'isocalendar'):
    data['time_week'] = data['time'].dt.isocalendar().week
else:
    data['time_week'] = data['time'].dt.week

In [None]:
# Join the seven group levels into one '>'-separated path; missing levels become ''.
group_path = data['group1'].fillna('')
for level in range(2, 8):
    group_path = group_path + '>' + data['group{}'.format(level)].fillna('')
data['group'] = group_path

In [None]:
# Drop the empty path levels left behind by the join. Removing every ">>" pair
# already eliminates all longer runs, so the former ">>>" / ">>>>" passes could never
# match. What can remain is a single dangling ">" at the end of an odd-length run,
# which is stripped here so the result no longer depends on the run's parity.
# NOTE(review): an empty level in the middle of a path still fuses its neighbours
# ("a>>c" -> "ac"), exactly as before — confirm whether such rows exist.
data['group'] = data['group'].str.replace('>>', '').str.rstrip('>')

In [None]:
# Remove the "부산은행" node from the group path: first where it sits between two
# separators, then any remaining occurrence that is followed by a separator.
data['group'] = data.group.str.replace('>부산은행>','')
data['group'] = data.group.str.replace('부산은행>','')

In [None]:
# Re-derive the level columns from the cleaned path. The split is computed once, and
# reindexing to six columns makes levels that are missing from every row come back
# as NaN instead of raising a KeyError.
_group_levels = data['group'].str.split('>', expand=True).reindex(columns=range(6))
for _level in range(6):
    data['group{}'.format(_level + 1)] = _group_levels[_level]

In [None]:
# Keep only the three known severity codes. The explicit copy makes `data` an
# independent frame, so column updates that follow do not act on a slice.
data = data[data['degree'].isin(['1', '2', '3'])].copy()

In [None]:
# Translate the severity codes into their Korean labels (1 = caution, 2 = warning, 3 = critical).
severity_labels = {"1": "주의", "2": "경고", "3": "심각"}
data['degree'] = data['degree'].replace(severity_labels)

### 2. 알람 발생 현황 분석

#### (1) 전체 알람 데이터 현황

In [None]:
# Split the alarms into the two comparison periods. time_date holds 'YYYY-MM-DD'
# strings, so plain string comparison orders them chronologically.
# .copy() makes both halves independent frames because later cells add or overwrite
# columns on them.
period_cutoff = '2018-04-01'
data_after = data[data['time_date'] > period_cutoff].copy()
data_before = data[data['time_date'] <= period_cutoff].copy()
data_after_label = '2018년 4월 2일 ~ 2019년 4월 18일'
data_before_label = '2017년 4월 1일 ~ 2018년 4월 1일'
display(
    '총 데이터: {}건'.format(len(data)),
    '{}: {}건'.format(data_after_label, len(data_after)),
    '{}: {}건'.format(data_before_label, len(data_before)),
)

In [None]:
# Persist the later period for reuse. `encoding` must be a codec name; the former
# value False is not one.
# NOTE(review): the file name ends in "_csv" rather than ".csv". It is kept so the
# output path does not change — confirm whether that is intended.
data_after.to_csv('2018_alarm_total_csv', encoding='utf-8')

In [None]:
# Replace missing second-level group names with '' in both period frames.
for period_frame in (data_before, data_after):
    period_frame['group2'] = period_frame['group2'].fillna('')

In [None]:
# First period: rows whose condition text contains '이벤트 탐지' form the log set and
# rows that explicitly do not contain it form the performance set. Rows with a
# missing condition satisfy neither test and end up in neither frame.
is_log_before = data_before['condition'].str.contains('이벤트 탐지')
data_log_before = data_before[is_log_before == True]
data_perf_before = data_before[is_log_before == False]
display(
    '2017년 4월 1일 ~ 2018년 4월 1일',
    '성능 데이터: {}건'.format(len(data_perf_before)),
    '로그 데이터: {}건'.format(len(data_log_before)),
)

In [None]:
# Second period: same log / performance split as for the first period. The mask is
# computed once. Rows with a missing condition satisfy neither test and are dropped.
# .copy() is taken because later cells add columns to data_perf (e.g. time_min,
# host_alarm), which should not be written to a slice of data_after.
is_log_after = data_after['condition'].str.contains('이벤트 탐지')
data_log = data_after[is_log_after == True].copy()
data_perf = data_after[is_log_after == False].copy()
display(
    '2018년 4월 2일 ~ 2019년 4월 18일',
    '성능 데이터: {}건'.format(len(data_perf)),
    '로그 데이터: {}건'.format(len(data_log)),
)

In [None]:
def _count_alarms_by_host(frame, group, alarm_name, degree):
    """Count the alarms per hostname in `frame` that match the given filters.

    `group`, `alarm_name` and `degree` are matched with ``str.contains`` against the
    ``group2``, ``alarm_name`` and ``degree`` columns. The degree filter is skipped
    when `degree` is '전체'. Rows with a missing value never match.
    """
    mask = (
        frame['group2'].str.contains(group, na=False)
        & frame['alarm_name'].str.contains(alarm_name, na=False)
    )
    if degree != '전체':
        mask = mask & frame['degree'].str.contains(degree, na=False)
    return (
        frame[mask]
        .groupby(['hostname'])['time']
        .count()
        .reset_index()
        .sort_values('time', ascending=False)
    )


def diff_data(group,alarm_name,degree):
    """Compare per-host alarm counts between the two analysis periods.

    Args:
        group: Pattern matched against the ``group2`` column.
        alarm_name: Pattern matched against the ``alarm_name`` column.
        degree: Severity label to match, or '전체' to include every severity.

    Returns:
        DataFrame with the columns ``hostname``, ``2018_<degree>``,
        ``2017_<degree>`` and ``diff`` (2018 count minus 2017 count), sorted by
        ``diff`` in descending order. Hosts present in only one period count as 0
        in the other.
    """
    # The same filter is applied to both periods; only the source frame differs.
    data_2017 = _count_alarms_by_host(data_perf_before, group, alarm_name, degree)
    data_2018 = _count_alarms_by_host(data_perf, group, alarm_name, degree)
    # The outer merge keeps hosts that alarmed in only one period; the merge names
    # the two count columns time_x (2018) and time_y (2017).
    total_data = pd.merge(data_2018, data_2017, how='outer', on='hostname').fillna(0)
    total_data['diff'] = total_data['time_x'] - total_data['time_y']
    total_data.columns = ['hostname', '2018_{}'.format(degree), '2017_{}'.format(degree), 'diff']
    return total_data.sort_values('diff', ascending=False)

### '외환','Disk I/O 처리율','전체'

In [None]:
# Per-host change in 'Disk I/O 처리율' alarm counts for the '외환' group, all severities.
diff_data('외환','Disk I/O 처리율','전체')

### '외환','Disk I/O 처리율','등급별'

In [None]:
# One comparison table per severity for '외환' / 'Disk I/O 처리율', shown side by side.
severity_tables = [diff_data('외환', 'Disk I/O 처리율', severity) for severity in ('심각', '경고', '주의')]
display_side_by_side(*severity_tables)

### '외환','Memory 사용률','전체'

In [None]:
# Per-host change in 'Memory 사용률' alarm counts for the '외환' group, all severities.
diff_data('외환','Memory 사용률','전체')

### '외환','Memory 사용률','등급별'

In [None]:
# One comparison table per severity for '외환' / 'Memory 사용률', shown side by side.
severity_tables = [diff_data('외환', 'Memory 사용률', severity) for severity in ('심각', '경고', '주의')]
display_side_by_side(*severity_tables)

### 'CRM','Memory 사용률','전체'

In [None]:
# Per-host change in 'Memory 사용률' alarm counts for the 'CRM' group, all severities.
# (A notebook URL had been pasted into the middle of the alarm-name literal, which
# made this cell a syntax error.)
diff_data('CRM','Memory 사용률','전체')

### 'CRM','Memory 사용률','등급별'

In [None]:
# One comparison table per severity for 'CRM' / 'Memory 사용률', shown side by side.
severity_tables = [diff_data('CRM', 'Memory 사용률', severity) for severity in ('심각', '경고', '주의')]
display_side_by_side(*severity_tables)

In [None]:
# Hostnames of the first five rows, i.e. the largest 2018-minus-2017 differences.
diff_data('CRM','Memory 사용률','전체').head().hostname

In [None]:
# Inspect the values reported in the 'Disk I/O 처리율' alarms of the selected hosts,
# one column per severity plus a combined column.
#clusters_new = list(diff_data('외환','Disk I/O 처리율','전체').head().hostname)
clusters_new = ['SCFEM002R']
# .copy() so the helper columns below are added to an independent frame rather than
# to a slice of data_perf.
df = data_perf[(data_perf.alarm_name == 'Disk I/O 처리율') & (data_perf['hostname'].isin(clusters_new))].copy()
# The condition text is split on spaces; token 6 is read as the threshold and
# token 3 as the current value.
condition = df.condition.str.split(' ', expand=True)
print(condition)
# Raw strings keep the regex escapes intact; expand=False returns a Series.
df['threshold'] = condition[6].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)
df['current'] = condition[3].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)
print(df.threshold.unique())
df = df.groupby(['time','hostname','degree'])['current'].mean().reset_index()
for node in clusters_new:
    print(node)
    df_2 = df[df['hostname'] == node]
    df_2 = df_2.pivot_table(values='current', index='time', columns='degree', aggfunc='first', fill_value=np.nan).reset_index()
    # Combine the severity columns of *this* pivot into one series. The sum used to
    # be taken over `df`, whose rows are (time, host, severity) groups and therefore
    # do not line up with the pivot's one-row-per-time index.
    degree_columns = [column for column in df_2.columns if column != 'time']
    df_2['merge'] = df_2[degree_columns].sum(axis=1)
    #df_2 = df_2[['time','주의','경고','심각','merge']]
    display(df_2)
    #df_2.to_excel('{}.xlsx'.format(node),encoding='UTF-8')
    #df_2 = pd.melt(df_2,id_vars=['time'],value_vars=['주의','경고','심각'])
    #display(df_2)

In [None]:
def make_class(hostname):
    """Print how the gaps between consecutive 'Disk I/O 처리율' alarms of a host are distributed.

    The gap in minutes from each alarm to the next one is bucketed into 60-minute
    classes and the number of gaps per class is printed.

    Args:
        hostname: Host whose alarms in ``data_perf`` are analysed.
    """
    # Select and sort the host's alarm times once instead of repeating the filter.
    alarm_times = data_perf[
        (data_perf.alarm_name == 'Disk I/O 처리율') & (data_perf['hostname'].isin([hostname]))
    ].sort_values('time', ascending=True).time
    # The last alarm has no successor. pd.Timedelta(0) keeps the series
    # timedelta-typed; an integer 0 fill is not accepted by every pandas version.
    gaps = (alarm_times.shift(-1) - alarm_times).fillna(pd.Timedelta(0))
    df = pd.DataFrame(data=gaps.dt.total_seconds() / 60)
    longest_gap = df['time'].max()
    # At least two bin edges are needed. This is also False for NaN (no alarms).
    if not longest_gap > 60:
        print('no intervals to classify for {}'.format(hostname))
        return
    bin_range = np.arange(0, longest_gap, 60)
    # The largest gap itself is excluded; classes are left-closed: [0, 60), [60, 120), ...
    out, bins = pd.cut(df[df['time'] < longest_gap].time, bins=bin_range, include_lowest=True, right=False, retbins=True)
    #out.value_counts(sort=False).plot.bar()
    print(out.value_counts(sort=False))

In [None]:
def make_class_every_10(hostname):
    """Print the distribution of gaps between consecutive 'Disk I/O 처리율' alarms in 10-minute classes.

    Args:
        hostname: Host whose alarms in ``data_perf`` are analysed.
    """
    # Select and sort the host's alarm times once instead of repeating the filter.
    alarm_times = data_perf[
        (data_perf.alarm_name == 'Disk I/O 처리율') & (data_perf['hostname'].isin([hostname]))
    ].sort_values('time', ascending=True).time
    # The last alarm has no successor. pd.Timedelta(0) keeps the series
    # timedelta-typed; an integer 0 fill is not accepted by every pandas version.
    gaps = (alarm_times.shift(-1) - alarm_times).fillna(pd.Timedelta(0))
    df = pd.DataFrame(data=gaps.dt.total_seconds() / 60)
    longest_gap = df['time'].max()
    # At least two bin edges are needed. This is also False for NaN (no alarms).
    if not longest_gap > 10:
        print('no intervals to classify for {}'.format(hostname))
        return
    bin_range = np.arange(0, longest_gap, 10)
    # The largest gap itself is excluded; classes are left-closed: [0, 10), [10, 20), ...
    out, bins = pd.cut(df[df['time'] < longest_gap].time, bins=bin_range, include_lowest=True, right=False, retbins=True)
    #out.value_counts(sort=False).plot.bar()
    print(out.value_counts(sort=False))

In [None]:
def make_class_2(hostname):
    """Print the distribution of alarm gaps shorter than 130 minutes in 10-minute classes.

    Args:
        hostname: Host whose 'Disk I/O 처리율' alarms in ``data_perf`` are analysed.
    """
    # Select and sort the host's alarm times once instead of repeating the filter.
    alarm_times = data_perf[
        (data_perf.alarm_name == 'Disk I/O 처리율') & (data_perf['hostname'].isin([hostname]))
    ].sort_values('time', ascending=True).time
    # The last alarm has no successor. pd.Timedelta(0) keeps the series
    # timedelta-typed; an integer 0 fill is not accepted by every pandas version.
    gaps = (alarm_times.shift(-1) - alarm_times).fillna(pd.Timedelta(0))
    df = pd.DataFrame(data=gaps.dt.total_seconds() / 60)
    upper_limit = 130
    # np.arange excludes its stop value, so the stop is one step past the limit to
    # get the edges 0, 10, ..., 130. With a stop of 130 the last edge was 120 and
    # gaps in [120, 130) passed the filter below but fell into no class.
    bin_range = np.arange(0, upper_limit + 10, 10)
    out, bins = pd.cut(df[df['time'] < upper_limit].time, bins=bin_range, include_lowest=True, right=False, retbins=True)
    #out.value_counts(sort=False).plot.bar()
    print(out.value_counts(sort=False))

In [None]:
def make_class_lasttime(hostname):
    """Print the distribution of 'Disk I/O 처리율' alarm durations of a host in 5-minute classes.

    Durations come from the ``last_time`` column of ``data_perf`` divided by 60.
    The longest duration itself is left out of the histogram.

    Args:
        hostname: Host whose alarms are analysed.
    """
    host_mask = (data_perf.alarm_name == 'Disk I/O 처리율') & (data_perf['hostname'].isin([hostname]))
    minutes = pd.DataFrame(data_perf[host_mask].last_time / 60)
    longest = minutes['last_time'].max()
    edges = np.arange(0, longest, 5)
    below_longest = minutes[minutes['last_time'] < longest].last_time
    out, bins = pd.cut(below_longest, bins=edges, include_lowest=True, right=False, retbins=True)
    print(out.value_counts(sort=False))

In [None]:
def make_class_pef(hostname):
    """Print the distribution of the values reported in a host's 'Disk I/O 처리율' alarms.

    The value is read from token 3 of the space-separated condition text, averaged
    per (time, severity), summed per time, and bucketed into classes of width 1.

    This function used to ignore `hostname` and read the notebook-level `df_2`
    frame, so it reported whatever host (or unrelated table) a previously run cell
    had left there. The series is now computed for the requested host.

    Args:
        hostname: Host whose alarms in ``data_perf`` are analysed.
    """
    alarms = data_perf[(data_perf.alarm_name == 'Disk I/O 처리율') & (data_perf['hostname'].isin([hostname]))]
    if alarms.empty:
        print('no performance values to classify for {}'.format(hostname))
        return
    # Token 3 of the condition text is taken as the current value.
    current = (
        alarms['condition'].str.split(' ', expand=True)[3]
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )
    # Plain arrays are used so the grouping does not depend on data_perf's index labels.
    values = pd.DataFrame({
        'time': alarms['time'].values,
        'degree': alarms['degree'].values,
        'current': current.values,
    })
    per_degree = values.groupby(['time', 'degree'])['current'].mean()
    # min_count=1 keeps timestamps without any parsable value as NaN so they can be dropped.
    merged = per_degree.groupby(level='time').sum(min_count=1).dropna()
    if merged.empty:
        print('no performance values to classify for {}'.format(hostname))
        return
    bin_range = np.arange(merged.min(), merged.max() + 1, 1)
    out, bins = pd.cut(merged[merged < merged.max() + 1], bins=bin_range, include_lowest=True, right=False, retbins=True)
    #out.value_counts(sort=False).plot.bar()
    print(out.value_counts(sort=False))

In [None]:
def make_class_100_pef(hostname):
    """Print the duration distribution (10-minute classes) of a host's 'Disk I/O 처리율' alarms whose condition mentions '100'.

    Args:
        hostname: Host whose alarms in ``data_perf`` are analysed.
    """
    # na=False: rows without condition text count as "no match" instead of putting
    # NaN into the boolean mask.
    # NOTE(review): '100' is matched anywhere in the text, so other numbers that
    # contain "100" would match as well — confirm this is acceptable.
    selection = (
        (data_perf.alarm_name == 'Disk I/O 처리율')
        & (data_perf['hostname'].isin([hostname]))
        & (data_perf['condition'].str.contains('100', na=False))
    )
    df = pd.DataFrame(data_perf[selection].last_time / 60)
    longest = df['last_time'].max()
    # At least two bin edges are needed. This is also False for NaN (no matching alarms).
    if not longest > 10:
        print('no durations to classify for {}'.format(hostname))
        return
    bin_range = np.arange(0, longest, 10)
    out, bins = pd.cut(df[df['last_time'] < longest].last_time, bins=bin_range, include_lowest=True, right=False, retbins=True)
    #out.value_counts(sort=False).plot.bar()
    print(out.value_counts(sort=False))

In [None]:
def make_class_all(hostname):
    """Run every class-interval report for one host, each printed under its own heading.

    Args:
        hostname: Host passed on to each report function.
    """
    reports = (
        ('## 계급구간 (발생주기)', make_class),
        ('## 계급구간 (발생주기 10분단위)', make_class_every_10),
        ('## 지속시간 기준', make_class_lasttime),
        ('## 성능데이터 기준', make_class_pef),
        ('## 디스크 I/O 100% 지속시간', make_class_100_pef),
    )
    for heading, report in reports:
        print(heading)
        report(hostname)

In [None]:
# Gaps between consecutive alarms of SCFEM002R, in 10-minute classes.
make_class_2('SCFEM002R')

In [None]:
# Full set of class-interval reports for host SCFEM002R.
make_class_all('SCFEM002R')

In [None]:
# Full set of class-interval reports for host SCFEM431R.
make_class_all('SCFEM431R')

In [None]:
# Full set of class-interval reports for host SCFEM001R.
make_class_all('SCFEM001R')

In [None]:
# Full set of class-interval reports for host SCFEM421R.
make_class_all('SCFEM421R')

In [None]:
# Minute-resolution timestamp string ('YYYY-MM-DD HH:MM') for each performance alarm.
data_perf['time_min'] = data_perf['time'].dt.strftime('%Y-%m-%d %H:%M')

In [None]:
# Correlate alarm types by co-occurrence: count the alarms per (minute, alarm name)
# on hosts whose name contains 'SCF', pivot to one column per alarm name (0 where an
# alarm type did not fire in that minute) and take the pairwise correlations.
# na=False keeps rows without a hostname from breaking the boolean indexing.
scf_alarms = data_perf[data_perf['hostname'].str.contains('SCF', na=False)]
per_minute_counts = scf_alarms.groupby(['time_min', 'alarm_name'])['condition'].count().reset_index()
corrmat = per_minute_counts.pivot_table(
    values='condition', index='time_min', columns='alarm_name', aggfunc='first', fill_value=0
).corr()

In [None]:
# For every column of the correlation matrix, collect the labels whose correlation
# with that column reaches the threshold.
correlation_threshold = 0.2
clusters = [corrmat[corrmat[node] >= correlation_threshold].index for node in corrmat.columns]
# A cluster needs at least two members. Members are stored as a sorted tuple so that
# the same membership always yields the same tuple. tuple(set(...)) guarantees no
# order, which made both the de-duplication and the run-to-run output unreliable.
clusters_new = sorted({tuple(sorted(set(cluster))) for cluster in clusters if len(cluster) > 1})

# Name the clusters cluster0, cluster1, ... in their (now deterministic) order.
dictt = {'cluster{}'.format(i): list(members) for i, members in enumerate(clusters_new)}

#data_df_date = data_df_date.reset_index()
#pd.concat([pd.Series(clusters_new[0],name='cluster0'),pd.Series(clusters_new[1],name='cluster1')], axis=1)
#for i in range(2,len(clusters_new)):
#    df = pd.concat([df,pd.Series(clusters_new[i],name='cluster{}'.format(i))], axis=1)
#df.fillna('',inplace=True)

In [None]:
# Show the de-duplicated cluster membership tuples.
clusters_new

In [None]:
# Number of performance alarms per calendar day (columns: time_date, time).
daily_counts = data_perf.groupby(['time_date'])['time'].count()
test = daily_counts.reset_index()

In [None]:
from datetime import datetime, timedelta
# Dates three days before each day that has more than 2000 performance alarms.
pd.to_datetime(test[test['time'] > 2000].time_date, format='%Y-%m-%d') - timedelta(3)

In [None]:
# For every day with more than 2000 performance alarms, compare the per-alarm-name
# counts of that day with the counts three days earlier.
date = pd.to_datetime(test[test['time'] > 2000].time_date, format='%Y-%m-%d')
for spike_day in date:
    before = (spike_day - timedelta(3)).strftime('%Y-%m-%d')
    day_label = spike_day.strftime('%Y-%m-%d')
    print(day_label)
    # Loop-local names are used so this cell does not overwrite the notebook-level
    # df / df_2 / df_3 frames that other cells and functions read.
    counts_now = (
        data_perf[data_perf['time_date'] == day_label]
        .groupby(['alarm_name'])['time'].count()
        .reset_index()
        .sort_values('time', ascending=False)
    )
    counts_now.columns = ['alarm_name', 'now']
    counts_before = (
        data_perf[data_perf['time_date'] == before]
        .groupby(['alarm_name'])['time'].count()
        .reset_index()
        .sort_values('time', ascending=False)
    )
    counts_before.columns = ['alarm_name', 'before']
    # Alarm types seen on only one of the two days count as 0 on the other day.
    # Without the fill their difference was NaN and they sank to the bottom of the
    # table. This matches how diff_data treats hosts missing from one period.
    comparison = pd.merge(counts_now, counts_before, how='outer', on='alarm_name').fillna(0)
    comparison['diff'] = comparison['now'] - comparison['before']
    display(comparison.sort_values('diff', ascending=False))
    

In [None]:
# Alarm counts of the days that exceed 2000 performance alarms.
test[test['time'] > 2000]['time']

In [None]:
# Export the 'Memory 사용률' alarms of host SCATD002 to an Excel workbook.
# NOTE(review): the `encoding` argument of to_excel may not be accepted by newer pandas releases — confirm.
data_perf[(data_perf.hostname=='SCATD002')&(data_perf.alarm_name=='Memory 사용률')].to_excel('20190618.xlsx',encoding='UTF-8')

In [None]:
def make_scatter(df,name):
    """Plot, per host, the mean against the standard deviation of the alarm counts for one alarm type.

    Alarms named `name` are counted per (time, hostname) and pivoted to one column
    per host (0 where a host had no alarm at that time). Each host becomes one
    marker at (mean, std) of its column.

    Args:
        df: Alarm frame to analyse. ``None`` falls back to ``data_perf``. The
            argument used to be overwritten with ``data_perf`` unconditionally, so
            passing another frame had no effect.
        name: Value compared for equality with the ``alarm_name`` column.
    """
    import plotly.graph_objs as go
    source = data_perf if df is None else df
    counts = source[source.alarm_name == name].groupby(['time', 'hostname'])['condition'].count().reset_index()
    # The timestamps stay in the index so describe() summarises only the host
    # columns, and the statistics are looked up by label instead of by position.
    stats = counts.pivot_table(values='condition', index='time', columns='hostname', aggfunc='first', fill_value=0).describe()
    trace = go.Scatter(
        x=stats.loc['mean'].values,
        y=stats.loc['std'].values,
        mode='markers',
        text=stats.columns,
        textposition='top right',
    )

    # Plot and embed in ipython notebook!
    py.iplot([trace])

In [None]:
def _alarm_count_stats(frame, column_key):
    """Return describe() statistics of the alarm counts per timestamp, one column per `column_key` value.

    Alarms are counted per (time, column_key) and pivoted to one row per timestamp
    with 0 where a key had no alarm at that time.
    """
    counts = frame.groupby(['time', column_key])['condition'].count().reset_index()
    pivot = counts.pivot_table(values='condition', index='time', columns=column_key, aggfunc='first', fill_value=0)
    # The timestamps stay in the index so describe() summarises only the count
    # columns, independent of how the pandas version treats datetime columns.
    return pivot.describe()


# Statistics per alarm name, per host, and per day for each severity label.
df = _alarm_count_stats(data_after, 'alarm_name')
df_2 = _alarm_count_stats(data_after, 'hostname')
df_3 = _alarm_count_stats(data_after[data_after.degree == "주의"], 'time_date')
df_4 = _alarm_count_stats(data_after[data_after.degree == "경고"], 'time_date')
df_5 = _alarm_count_stats(data_after[data_after.degree == "심각"], 'time_date')

In [None]:
import plotly.graph_objs as go


def _mean_std_trace(label, stats):
    """Build a labelled scatter trace of mean (x) against standard deviation (y) for each column of `stats`.

    `stats` is a describe() result. The rows are looked up by label so the trace
    does not depend on the row order of describe().
    """
    return go.Scatter(
        name=label,
        x=stats.loc['mean'].values,
        y=stats.loc['std'].values,
        mode='markers+text',
        text=stats.columns,
        textposition='top left',
    )


trace5 = _mean_std_trace("심각", df_5)
trace4 = _mean_std_trace("경고", df_4)
trace3 = _mean_std_trace("주의", df_3)
trace2 = _mean_std_trace("system", df_2)
trace = _mean_std_trace("alarm", df)

plot = [trace,trace2,trace3,trace4,trace5]

# Plot and embed in ipython notebook!
# NOTE(review): py.iplot is the online plotly API and needs a signed-in session;
# the offline `iplot` imported earlier would render without one.
py.iplot(plot)

In [None]:
# Average over all columns of the second describe() row of df_2
# (presumably the 'mean' row — confirm against the describe() output).
df_2.iloc[1].values.mean()

In [None]:
# Average over all columns of the third describe() row of df_2
# (presumably the 'std' row — confirm against the describe() output).
df_2.iloc[2].values.mean()

In [None]:
# Composite key '<hostname>_<alarm_name>' for each performance alarm row.
data_perf['host_alarm'] = data_perf.hostname+'_'+data_perf.alarm_name

In [None]:
# Hourly alarm counts per host_alarm key (0 where a key had no alarm in that hour),
# summarised with describe().
hourly_counts = data_perf.groupby(['time_date_hour', 'host_alarm'])['condition'].count().reset_index()
hourly_pivot = hourly_counts.pivot_table(
    values='condition', index='time_date_hour', columns='host_alarm', aggfunc='first', fill_value=0
)
df_3 = hourly_pivot.reset_index().describe()

In [None]:
# One marker per df_3 column, positioned by the second (x) and third (y) describe() row.
scatter_options = dict(
    name="tt",
    x=df_3.iloc[1].values,
    y=df_3.iloc[2].values,
    mode='markers+text',
    text=df_3.columns,
    textposition='top left',
)
trace = go.Scatter(**scatter_options)
plot = [trace]

# Plot and embed in ipython notebook!
py.iplot(plot)

In [None]:
# Hourly alarm counts per host_alarm key as a wide table: one row per hour, one
# column per key, 0 where a key had no alarm in that hour.
hourly_counts = data_perf.groupby(['time_date_hour', 'host_alarm'])['condition'].count().reset_index()
hourly_pivot = hourly_counts.pivot_table(
    values='condition', index='time_date_hour', columns='host_alarm', aggfunc='first', fill_value=0
)
df_3 = hourly_pivot.reset_index()

In [None]:
# Pairwise correlation between the numeric columns of df_3. The non-numeric columns
# (such as the hour label added by reset_index) are dropped explicitly, because
# corr() on newer pandas versions raises on them instead of skipping them.
corrmat = df_3.select_dtypes(include='number').corr()

In [None]:
# For every column of the correlation matrix, collect the labels whose correlation
# with that column reaches the threshold.
correlation_threshold = 0.9
clusters = [corrmat[corrmat[node] >= correlation_threshold].index for node in corrmat.columns]
# A cluster needs at least two members. Members are stored as a sorted tuple so that
# the same membership always yields the same tuple. tuple(set(...)) guarantees no
# order, which made both the de-duplication and the run-to-run output unreliable.
clusters_new = sorted({tuple(sorted(set(cluster))) for cluster in clusters if len(cluster) > 1})

# Name the clusters cluster0, cluster1, ... in their (now deterministic) order.
dictt = {'cluster{}'.format(i): list(members) for i, members in enumerate(clusters_new)}

#data_df_date = data_df_date.reset_index()
#pd.concat([pd.Series(clusters_new[0],name='cluster0'),pd.Series(clusters_new[1],name='cluster1')], axis=1)
#for i in range(2,len(clusters_new)):
#    df = pd.concat([df,pd.Series(clusters_new[i],name='cluster{}'.format(i))], axis=1)
#df.fillna('',inplace=True)

In [None]:
# Number of distinct clusters found.
len(clusters_new)

In [None]:
# Show the de-duplicated cluster membership tuples.
clusters_new