In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import altair as alt
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
import matplotlib.pyplot as plt

## Load data

In [22]:

# Location of the cleaned BGL dataset (CSV), relative to the notebook.
DATA_PATH = '../data/BGL_cleaned.csv'

# Read the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [23]:
# Preview the first five rows to check the columns and their contents.
df.head(n=5)

Unnamed: 0,id,time,unknown1,exact_time,unknown2,info,label
0,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.363779,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
1,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.527847,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
2,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.675872,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
3,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.823719,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
4,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.982731,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal


In [24]:
# Summarize the frame: index range, column names with dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4747963 entries, 0 to 4747962
Data columns (total 7 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   id          int64 
 1   time        object
 2   unknown1    object
 3   exact_time  object
 4   unknown2    object
 5   info        object
 6   label       object
dtypes: int64(1), object(6)
memory usage: 253.6+ MB


## Class proportion

In [25]:
# Fraction of rows that belong to each class in the label column.
class_share = df["label"].value_counts(normalize=True)
class_share

Normal     0.926609
Anomaly    0.073391
Name: label, dtype: float64

In [26]:
# Row count for each label.
# Labels and counts are both read from the same value_counts() result, so
# every label is guaranteed to be paired with its own count. (Pairing
# value_counts() with unique() is unsafe: the former is ordered by frequency,
# the latter by first appearance in the data.)
label_counts = df['label'].value_counts()
labels = label_counts.index.tolist()
count = label_counts.tolist()
count_labels = pd.DataFrame({'labels': labels, 'count': count})

# Horizontal bar chart: one bar per label, bar length = number of rows.
bars = alt.Chart(count_labels).mark_bar().encode(
    x='count:Q',
    y="labels:O"
)

# Print the exact count next to each bar.
text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='count:Q'
)

(bars + text).properties(height=200)

## Bag of Words (BOW)

In [27]:
# Size cap for the bag-of-words vocabulary.
MAX_VOCAB_SIZE = 200

# Learn the vocabulary from the 'info' messages and count term occurrences
# per row in a single pass.
vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE)
X_counts = vectorizer.fit_transform(df['info'])

In [28]:
# Bag-of-words table: one row per row of df (indexed by its label) and one
# column per vocabulary term.
# NOTE(review): toarray() materializes the complete rows x vocabulary matrix;
# with the ~4.7M rows reported by df.info() and 200 terms this is on the
# order of gigabytes, so run this cell only with enough free memory.
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(
    data=X_counts.toarray(),
    index=df["label"],
    columns=feature_names,
)

In [None]:
# Total occurrences of every vocabulary term within each class.
# Grouping directly on the 'label' index level skips the intermediate frame
# that reset_index() would build, and it cannot fail when the vocabulary
# itself contains a term named "label" (reset_index() would refuse to insert
# a column that already exists).
count_df = bow_df.groupby(level='label').sum()
count_df.head()

In [None]:
# Number of top-ranked words shown for each class.
TOP_N_WORDS = 40

# One row per vocabulary word with its total count per class.
# reset_index() exposes the words as a column named 'index'.
word_totals = count_df.T.reset_index()
plot_anomaly = word_totals.sort_values("Anomaly", ascending=False).iloc[:TOP_N_WORDS]
plot_normal = word_totals.sort_values("Normal", ascending=False).iloc[:TOP_N_WORDS]

# Use one y-scale for both panels so bar heights are directly comparable.
# The upper bound comes from the data instead of a hard-coded constant, so it
# stays correct when the dataset or the vocabulary changes. int() converts the
# numpy scalar into a plain number for the chart specification.
y_upper = int(max(plot_anomaly["Anomaly"].max(), plot_normal["Normal"].max()))
shared_y = alt.Scale(domain=[0, y_upper], nice=True)

a = alt.Chart(plot_anomaly).mark_bar().encode(
    x=alt.X('index:N', sort='-y', title='key words'),
    y=alt.Y('Anomaly:Q', scale=shared_y)
).properties(width=600, height=300, title='Anomaly')
n = alt.Chart(plot_normal).mark_bar().encode(
    x=alt.X('index:N', sort='-y', title='key words'),
    y=alt.Y('Normal:Q', scale=shared_y)
).properties(width=600, height=300, title='Normal')

a | n

In [None]:
# Split the rows by class so each class can be visualized on its own.
anomaly = df.loc[df['label'].eq('Anomaly')]
normal = df.loc[df['label'].eq('Normal')]

In [None]:
def _info_wordcloud(messages, random_state=42):
    """Build a word cloud from a Series of messages.

    Parameters
    ----------
    messages : pandas.Series
        Messages to visualize. Missing entries are dropped so they do not
        show up in the cloud as the literal word "nan".
    random_state : int, default 42
        Seed passed to WordCloud so the layout is identical on every run.

    Returns
    -------
    WordCloud
        The generated cloud, ready to be passed to ``imshow``.
    """
    corpus = " ".join(messages.dropna().astype(str))
    return WordCloud(
        background_color="white", random_state=random_state
    ).generate(corpus)


# One panel per class, side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,15))
panels = [("Anomaly", anomaly), ("Normal", normal)]
for ax, (panel_title, subset) in zip(axes, panels):
    ax.imshow(_info_wordcloud(subset['info']), interpolation='bilinear')
    ax.axis("off")
    ax.set_title(panel_title)
fig.tight_layout()