In [3]:
# Mount Google Drive into the Colab filesystem so the project files under
# /content/gdrive/ become readable by the cells below.
from google.colab import drive 
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [4]:
# Install the extra packages this notebook imports (--quiet hides most of pip's log).
!pip install pydot --quiet
# gensim stays pinned to 3.8.3: the import cell notes that 4.x does not work with this notebook yet.
!pip install gensim==3.8.3 --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text==2.8.2 --quiet
# NOTE(review): pydot, tensorflow-datasets and transformers are unpinned, so a
# fresh run may resolve different versions than the ones used originally.
!pip install transformers --quiet

[K     |████████████████████████████████| 24.2 MB 1.1 MB/s 
[K     |████████████████████████████████| 4.9 MB 15.3 MB/s 
[K     |████████████████████████████████| 497.9 MB 28 kB/s 
[K     |████████████████████████████████| 1.4 MB 52.4 MB/s 
[K     |████████████████████████████████| 5.8 MB 75.3 MB/s 
[K     |████████████████████████████████| 462 kB 83.4 MB/s 
[K     |████████████████████████████████| 5.5 MB 15.2 MB/s 
[K     |████████████████████████████████| 163 kB 66.6 MB/s 
[K     |████████████████████████████████| 7.6 MB 56.9 MB/s 
[?25h

In [5]:
import pandas as pd
import numpy as np
import sys
import os
import re

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text

from transformers import BertTokenizer, TFBertModel


import sklearn as sk

import nltk
from nltk.data import find

import matplotlib.pyplot as plt



#This continues to work with gensim 3.8.3.  It doesn't yet work with 4.x.  
#Make sure your pip install command specifies gensim==3.8.3
import gensim

In [6]:
# Switch the working directory to the project folder on the mounted Drive so
# that the relative file names used below resolve there.
# NOTE(review): absolute, account-specific path - adjust when running elsewhere.
os.chdir('/content/gdrive/MyDrive/W266_Final_Project/Tbird parsed/')

In [7]:
# Load the cleaned log table from the current working directory.
# NOTE(review): the preview below shows 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# presumably index columns written out by earlier to_csv calls - confirm.
df=pd.read_csv('clean_log.csv')

In [8]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,Content1,Label1,EventId
0,0,in.tftpd[] tftp client does not accept options,0,ba463c69
1,1,postfix postdrop[] warning unable to look up p...,0,85f57867
2,2,postfix postdrop[] warning unable to look up p...,0,85f57867
3,3,postfix postdrop[] warning unable to look up p...,0,85f57867
4,4,postfix postdrop[] warning unable to look up p...,0,85f57867


In [9]:
# Fetch NLTK's 'word2vec_sample' package (a no-op if it is already present locally).
nltk.download('word2vec_sample')

# Resolve the on-disk path of the pruned word2vec text file inside the NLTK data dir.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Load the vectors as gensim KeyedVectors; binary=False because the file is in text format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [10]:
# Pull the message text ("Content1") and the label ("Label1") columns out of the
# frame as plain Python lists; both lists follow the row order of df.
whole_data_emb = df["Content1"].values.tolist()
whole_data_label=df["Label1"].values.tolist()

In [11]:
# Number of log messages available before splitting.
len(whole_data_emb)

5000000

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the full dataset for validation (75% train); random_state=1
# keeps the split reproducible across runs.
# NOTE(review): a two-stage "0.25 x 0.8 = 0.2" calculation does not apply here -
# the split is taken directly on the whole dataset, so the validation share is 0.25.
X_train, X_val, y_train, y_val = train_test_split(whole_data_emb, whole_data_label, test_size=0.25, random_state=1)

In [13]:
#@title Embedding Matrix Creation

# Vector width, measured on a sample word (300 according to the original note).
EMBEDDING_DIM = len(model['university'])

# Word-to-id map, plus a zero-filled matrix with one row per vocabulary word
# and one spare row at the end.
vocab_dict = {}
embedding_matrix = np.zeros((len(model.vocab.keys()) + 1, EMBEDDING_DIM))

# Walk the vocabulary once: row i receives the vector of the i-th word and
# that same position becomes the word's id.
for i, word in enumerate(model.vocab.keys()):
    embedding_vector = model[word]
    if embedding_vector is None:
        # No vector available: the row stays all-zero and the word gets no id.
        continue
    embedding_matrix[i] = embedding_vector
    vocab_dict[word] = i

# Unknown tokens take the next free id, i.e. the current size of the map
# (the spare last row when every word received an id).
vocab_dict['[UNK]'] = len(vocab_dict)

In [14]:
# Fixed number of token ids per example; docs_to_vocab_ids truncates and pads to this length.
SEQUENCE_LENGTH = 100

In [42]:
def docs_to_vocab_ids(tokenized_texts_list, vocab=None, max_len=None):
    """
    Convert tokenized documents into fixed-length rows of vocabulary ids.

    Each token is decoded from UTF-8 bytes and looked up in the vocabulary;
    tokens that are not present map to the '[UNK]' id. Every row is truncated
    to ``max_len`` ids and right-padded with the '[UNK]' id up to ``max_len``.

    Args:
        tokenized_texts_list: iterable of per-document token containers. Each
            item must expose ``.numpy()`` returning a sequence of ``bytes``
            tokens (presumably the rows produced by the whitespace tokenizer -
            confirm against the caller).
        vocab: optional token -> id mapping that contains a '[UNK]' key.
            Defaults to the notebook-level ``vocab_dict``.
        max_len: optional row length. Defaults to the notebook-level
            ``SEQUENCE_LENGTH``.

    Returns:
        Tuple ``(ids, valid_example_list)``. ``ids`` is a NumPy array with one
        row of ``max_len`` ids per document (shape ``(0, max_len)`` for empty
        input); ``valid_example_list`` holds the positional index of every
        converted document.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not supply its own values.
    if vocab is None:
        vocab = vocab_dict
    if max_len is None:
        max_len = SEQUENCE_LENGTH
    unk_id = vocab['[UNK]']

    texts_vocab_ids = []
    valid_example_list = []
    for i, token_list in enumerate(tokenized_texts_list):

        # Only the first max_len tokens can survive truncation, so skip
        # decoding the rest. Unknown tokens fall back to the [UNK] id.
        vocab_ids = [
            vocab.get(token.decode('utf-8', errors='ignore'), unk_id)
            for token in list(token_list.numpy())[:max_len]
        ]

        # For simplicity in this model, padding reuses the unknown-token id.
        vocab_ids += [unk_id] * (max_len - len(vocab_ids))

        valid_example_list.append(i)
        texts_vocab_ids.append(vocab_ids)

        if i % 5000 == 0:
            print('Examples processed: ', i)

    # Report the true count (the loop index is one less than the number of
    # examples and is undefined when the input is empty).
    print('Total examples: ', len(texts_vocab_ids))

    if not texts_vocab_ids:
        # Keep a 2-D shape so downstream code sees (n_examples, max_len) even for n = 0.
        return (np.empty((0, max(max_len, 0)), dtype=int), valid_example_list)
    return (np.array(texts_vocab_ids), valid_example_list)

In [37]:
# Split every training message on whitespace with TensorFlow Text.
# The whole X_train list is tokenized in one call.
tokenizer = tf_text.WhitespaceTokenizer()
train_tokens = tokenizer.tokenize(X_train)
                            

In [None]:
# Convert the whitespace tokens into fixed-length rows of vocabulary ids.
# NOTE(review): docs_to_vocab_ids returns a 2-tuple (id array, list of example
# indices), so this name is bound to the whole tuple rather than only the
# index list its name suggests.
train_valid_example_list = docs_to_vocab_ids(train_tokens)

In [47]:
# Peek at the first three raw training messages.
X_train[0:3]

['ib sm.x[] [INFO] Configuration caused by some ports in INIT state',
 'kernel [KERNEL IB][ib mad dispatch][ mnt projects sysapps src ib topspin topspin-src-..- ib ts api ng mad obj host amd custom rhel ts ib mad mad filter.c]mad process failed () for InfiniHost port  QPN  (class  aid )',
 'sshd(pam unix)[] session opened for user root by (uid)']

In [15]:
# Load the pretrained 'bert-base-cased' tokenizer (files are downloaded on first use).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
# Cap on how many training messages are tokenized below.
num_train_examples = 250000
# NOTE(review): num_test_examples is not used anywhere in this cell.
num_test_examples = 50000

# Token length every example is truncated / padded to.
# NOTE(review): same value as SEQUENCE_LENGTH defined earlier - keep them in sync.
max_length = 100


# Tokenize the first num_train_examples messages with the BERT tokenizer,
# truncating and padding each one to max_length and returning TensorFlow tensors.
# Note that x_train (lower-case) is the tokenizer output, while X_train is the raw text list.
x_train = bert_tokenizer(X_train[:num_train_examples], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
# Trim the labels to the same leading slice so they stay aligned with x_train.
y_train = y_train[:num_train_examples]



def select_min_length_examples(x_data, y_data):
  """
  Keep only the examples whose final attention-mask position equals 1.

  x_data is indexed with the keys 'input_ids' and 'attention_mask'; the two
  sequences are walked in lockstep with y_data. An example survives when the
  last entry of its mask is 1 (presumably meaning the text fills the whole
  padded length - confirm against the tokenizer settings).

  Returns a pair of NumPy arrays: the surviving input-id rows and their labels.
  """
  triples = zip(x_data['input_ids'], x_data['attention_mask'], y_data)
  survivors = [(ids, target) for ids, mask, target in triples if mask[-1] == 1]

  kept_ids = [ids for ids, _ in survivors]
  kept_labels = [target for _, target in survivors]
  return np.array(kept_ids), np.array(kept_labels)


In [18]:
# Display the tokenizer output (input_ids, token_type_ids and attention_mask tensors).
x_train

{'input_ids': <tf.Tensor: shape=(250000, 100), dtype=int32, numpy=
array([[  101,   178,  1830, ...,     0,     0,     0],
       [  101, 18670,   164, ...,     0,     0,     0],
       [  101,   188,  2737, ...,     0,     0,     0],
       ...,
       [  101, 18950, 19483, ...,     0,     0,     0],
       [  101, 18670,   164, ..., 11239,   142,   102],
       [  101, 23875,   193, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(250000, 100), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(250000, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0]], dt