<a href="https://colab.research.google.com/github/ayushmothiya/la-la-la/blob/main/The_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [3]:
train_data = pd.read_csv('/content/drive/MyDrive/dataset/new_data.csv')
train_data.columns = ['index','input','output']
train_data.head()

Unnamed: 0,index,input,output
0,0,RAS KERNEL INFO instruction cache parity error...,normal
1,1,RAS KERNEL INFO instruction cache parity error...,normal
2,2,RAS KERNEL INFO instruction cache parity error...,normal
3,3,RAS KERNEL INFO instruction cache parity error...,normal
4,4,RAS KERNEL INFO instruction cache parity error...,normal


In [6]:
train_data['text_length'] = train_data['input'].apply(len)

train_data['msg_type'] = train_data['output'].map({'anomaly':0, 'normal':1})
msg_label = train_data['msg_type'].values
train_data.head()

Unnamed: 0,index,input,output,text_length,msg_type
0,0,RAS KERNEL INFO instruction cache parity error...,normal,57,1
1,1,RAS KERNEL INFO instruction cache parity error...,normal,57,1
2,2,RAS KERNEL INFO instruction cache parity error...,normal,57,1
3,3,RAS KERNEL INFO instruction cache parity error...,normal,57,1
4,4,RAS KERNEL INFO instruction cache parity error...,normal,57,1


In [7]:
train_data = train_data.drop_duplicates(subset=['input','msg_type'])

In [8]:
train_data['msg_type'].value_counts()

1    268768
0      2171
Name: msg_type, dtype: int64

In [9]:
train_data.shape

(270939, 5)

In [10]:
train_data = train_data.sample(frac = 1)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(train_data['input'], train_data['msg_type'], test_size=0.2, random_state=434)

In [12]:
y_test.value_counts()

1    53771
0      417
Name: msg_type, dtype: int64

In [13]:
# Defining pre-processing parameters
max_len = 50 
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # out of vocabulary token
vocab_size = 500

In [14]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size, 
                      char_level = False,
                      oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

In [15]:
# Get the word_index
word_index = tokenizer.word_index
total_words = len(word_index)
total_words

189451

In [16]:
training_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = tf.keras.utils.pad_sequences(training_sequences,
                                maxlen = max_len,
                                padding = padding_type,
                                truncating = trunc_type)

In [17]:
testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = tf.keras.utils.pad_sequences(testing_sequences,
                               maxlen = max_len,
                               padding = padding_type,
                               truncating = trunc_type)

In [18]:
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)

Shape of training tensor:  (216751, 50)
Shape of testing tensor:  (54188, 50)


tpu

In [None]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except:
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

AUTOTUNE = tf.data.experimental.AUTOTUNE
    
print(tf.__version__)

model 1

In [37]:
# Define parameter
vocab_size = 500 
embedding_dim = 16
drop_value = 0.2
n_dense = 24
# Define Dense Model Architecture
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size,
                    embedding_dim,
                    input_length = max_len))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dropout(drop_value))
model.add(tf.keras.layers.Dense(1, activation='softmax'))

In [38]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 16)            8000      
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 24)                408       
                                                                 
 dropout_1 (Dropout)         (None, 24)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 8,433
Trainable params: 8,433
Non-trainable params: 0
____________________________________________________

In [39]:
from tensorflow.keras import backend as K
def custom_f1(y_true, y_pred):
    def recall_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Positives = K.sum(K.round(K.clip(y_true, 0, 1)))

        recall = TP / (Positives+K.epsilon())
        return recall


    def precision_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Pred_Positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

        precision = TP / (Pred_Positives+K.epsilon())
        return precision

    precision, recall = precision_m(y_true, y_pred), recall_m(y_true, y_pred)

    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [40]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])

In [41]:
with tf.device('/device:GPU:0'):
  num_epochs = 3
  early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
  history = model.fit(training_padded,
                    y_train,
                    epochs=num_epochs, 
                    validation_data=(testing_padded, y_test),
                    callbacks =[early_stop], 
                    verbose=2)

Epoch 1/3
6774/6774 - 27s - loss: 0.0132 - accuracy: 0.9919 - val_loss: 2.6940e-06 - val_accuracy: 0.9923 - 27s/epoch - 4ms/step
Epoch 2/3
6774/6774 - 24s - loss: 3.4420e-04 - accuracy: 0.9919 - val_loss: 6.4927e-07 - val_accuracy: 0.9923 - 24s/epoch - 4ms/step
Epoch 3/3
6774/6774 - 24s - loss: 2.8553e-04 - accuracy: 0.9919 - val_loss: 4.3211e-06 - val_accuracy: 0.9923 - 24s/epoch - 4ms/step


******

model 2

In [57]:
# Define parameter
vocab_size = 500 
embedding_dim = 16
drop_value = 0.2
n_dense = 24
n_lstm = 128
drop_lstm = 0.2
# Define LSTM Model 
model1 = tf.keras.Sequential()
model1.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model1.add(tf.keras.layers.SpatialDropout1D(drop_lstm))
model1.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(n_lstm, return_sequences=False)))
model1.add(tf.keras.layers.Dropout(drop_lstm))
model1.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model1.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 16)            8000      
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 50, 16)           0         
 lDropout1D)                                                     
                                                                 
 bidirectional (Bidirectiona  (None, 256)              148480    
 l)                                                              
                                                                 
 dropout_4 (Dropout)         (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 1)                 257       
                                                                 
Total params: 156,737
Trainable params: 156,737
Non-tr

In [58]:
model1.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics = ['accuracy'])

In [59]:
with tf.device('/device:GPU:0'):
  num_epochs = 3
  early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
  history = model1.fit(training_padded,
                     y_train,
                     epochs=num_epochs, 
                     validation_data=(testing_padded, y_test),
                     callbacks =[early_stop],
                     verbose=2)

Epoch 1/3
6774/6774 - 87s - loss: 0.0047 - accuracy: 0.9991 - val_loss: 4.6227e-06 - val_accuracy: 1.0000 - 87s/epoch - 13ms/step
Epoch 2/3
6774/6774 - 80s - loss: 3.3947e-04 - accuracy: 1.0000 - val_loss: 1.1431e-05 - val_accuracy: 1.0000 - 80s/epoch - 12ms/step
Epoch 3/3
6774/6774 - 79s - loss: 2.5337e-04 - accuracy: 0.9999 - val_loss: 2.9368e-06 - val_accuracy: 1.0000 - 79s/epoch - 12ms/step


******

In [None]:
yyy = model.predict(testing_padded)

In [None]:
yyy = pd.DataFrame(yyy)
yyy = yyy.round()
yyy.value_counts()

In [23]:
test_data = pd.read_csv('/content/drive/MyDrive/dataset/test.csv')
test_data.head()

Unnamed: 0,ID,Log
0,0,1124336301 2005.08.17 R13-M1-N8-C:J12-U01 200...
1,1,1118553175 2005.06.11 R30-M0-N9-C:J16-U01 200...
2,2,1118536033 2005.06.11 R30-M0-N9-C:J16-U01 200...
3,3,1117992566 2005.06.05 R30-M1-N6-C:J03-U11 200...
4,4,1118538965 2005.06.11 R30-M0-N9-C:J16-U01 200...


In [24]:
test_data_1 = test_data[' Log'].str.lstrip(' ')
r = test_data_1.str.partition(' ',expand=True)
test_data['Seconds'] = r[0]
s = r[2].str.partition(' ',expand=True)

t = s[2].str.partition(' ',expand=True)
test_data['weird_code'] = t[0]
u = t[2].str.partition(' ',expand=True)
test_data['time_code'] = u[0]
v = u[2].str.partition(' ',expand=True)
test_data['extra'] = v[2]
test_data['extra'] = test_data['extra'].str.rstrip('\n')
test_data.head()

Unnamed: 0,ID,Log,Seconds,weird_code,time_code,extra
0,0,1124336301 2005.08.17 R13-M1-N8-C:J12-U01 200...,1124336301,R13-M1-N8-C:J12-U01,2005-08-17-20.38.21.466368,RAS KERNEL FATAL rts: kernel terminated for re...
1,1,1118553175 2005.06.11 R30-M0-N9-C:J16-U01 200...,1118553175,R30-M0-N9-C:J16-U01,2005-06-11-22.12.55.707149,RAS KERNEL FATAL data TLB error interrupt
2,2,1118536033 2005.06.11 R30-M0-N9-C:J16-U01 200...,1118536033,R30-M0-N9-C:J16-U01,2005-06-11-17.27.13.042387,RAS KERNEL FATAL data TLB error interrupt
3,3,1117992566 2005.06.05 R30-M1-N6-C:J03-U11 200...,1117992566,R30-M1-N6-C:J03-U11,2005-06-05-10.29.26.943462,RAS KERNEL INFO generating core.6463
4,4,1118538965 2005.06.11 R30-M0-N9-C:J16-U01 200...,1118538965,R30-M0-N9-C:J16-U01,2005-06-11-18.16.05.049256,RAS KERNEL FATAL data TLB error interrupt


In [25]:
Test_data = pd.DataFrame()
Test_data['input'] = test_data['extra']
Test_data.head()

Unnamed: 0,input
0,RAS KERNEL FATAL rts: kernel terminated for re...
1,RAS KERNEL FATAL data TLB error interrupt
2,RAS KERNEL FATAL data TLB error interrupt
3,RAS KERNEL INFO generating core.6463
4,RAS KERNEL FATAL data TLB error interrupt


In [26]:
Testing_sequences = tokenizer.texts_to_sequences(Test_data['input'])
Testing_padded = tf.keras.utils.pad_sequences(Testing_sequences,
                               maxlen = max_len,
                               padding = padding_type,
                               truncating = trunc_type)
print('Shape of testing tensor: ', Testing_padded.shape)

Shape of testing tensor:  (595300, 50)


In [60]:
Y_pred = model1.predict(Testing_padded)
FT = pd.DataFrame(Y_pred)
FT.head()



Unnamed: 0,0
0,0.999943
1,0.99938
2,0.99938
3,1.0
4,0.99938


In [73]:
FT.shape

(595300, 2)

In [80]:
FT[2] = FT[0].apply('int64')

In [30]:
FT['status'] = FT[1].map({0:'abnormal',1:'normal'})

In [81]:
FT.head()

Unnamed: 0,0,1,2
0,0.999943,0,0
1,0.99938,0,0
2,0.99938,0,0
3,1.0,0,0
4,0.99938,0,0


In [52]:
FT[1].value_counts()

1.0    595300
Name: 1, dtype: int64

In [35]:
submission = pd.DataFrame()
submission['ID'] = test_data['ID']
submission[' Label'] = FT['status']
submission.head()

Unnamed: 0,ID,Label
0,0,abnormal
1,1,abnormal
2,2,abnormal
3,3,normal
4,4,abnormal


In [36]:
submission.to_csv('submission.csv',index=False)