<a href="https://colab.research.google.com/github/ayushmothiya/la-la-la/blob/main/ConvolveEpoch1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files can be read from the Drive
# folder. This call only works inside a Google Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [None]:
# Folder on the mounted Google Drive that holds the input CSV files.
ROOTPATH = "/content/drive/MyDrive/"
# Labelled training data and unlabelled test data, both located in ROOTPATH.
TRAINPATH = f"{ROOTPATH}refined_data.csv"
TESTPATH = f"{ROOTPATH}test.csv"

In [None]:
def load_data(filepath):
    """Read the labelled log CSV and clean it for embedding.

    The frame read from ``filepath`` is processed in four steps:

    1. the ``time_code`` and ``log`` columns are dropped;
    2. the ``Unnamed: 0`` column is renamed to ``ID``;
    3. only the first row of every distinct ``(extra, status)`` pair is kept;
    4. every ``extra`` message that contains one of a fixed list of FATAL
       message fragments is replaced by that fragment alone, which collapses
       messages that differ only in their variable tail.

    Args:
        filepath: Path (or any other source accepted by ``pd.read_csv``) of
            the CSV. It must provide the columns ``time_code``, ``log``,
            ``extra`` and ``status``.

    Returns:
        pandas.DataFrame: the cleaned frame.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    df = pd.read_csv(filepath)

    # Data cleaning
    df = df.drop(["time_code", "log"], axis=1)
    df = df.rename(columns={"Unnamed: 0": "ID"})
    df = df.drop_duplicates(subset=["extra", "status"])

    extra_string_array = ["RAS KERNEL FATAL rts tree/torus link training failed: wanted:",
                          "RAS APP FATAL ciod: failed to read message prefix on control stream",
                          "RAS KERNEL FATAL data address:",
                          "RAS KERNEL FATAL 12:28244842 13:1eeeeeee 14:ffffffff",
                          "RAS KERNEL FATAL instruction address:",
                          "RAS KERNEL FATAL fpr"]
    for fragment in extra_string_array:
        # regex=False: the fragments are literal text, not patterns.
        # na=False: a row with a missing message counts as "no match";
        # otherwise the mask contains NaN and the .loc assignment raises.
        matches = df["extra"].str.contains(fragment, regex=False, na=False)
        df.loc[matches, "extra"] = fragment

    return df

In [None]:
# Read and clean the training CSV located at TRAINPATH.
df_train = load_data(TRAINPATH)

In [None]:
# Sanity check: row/column count, then the first rows of the cleaned frame.
print(df_train.shape)
df_train.head()

(270939, 7)


Unnamed: 0,ID,status,date,time,Seconds,weird_code,extra
0,0,1,2005-06-03,15.42.50,1117838570,R02-M1-N0-C:J12-U11,RAS KERNEL INFO instruction cache parity error...
1476,1476,1,2005-06-03,15.47.20,1117838840,R27-M1-L3-U18-C,RAS LINKCARD INFO MidplaneSwitchController per...
2734,2734,1,2005-06-03,15.51.25,1117839085,R20-M1-N5-C:J17-U01,RAS KERNEL INFO generating core.304\n
2735,2735,1,2005-06-03,15.51.25,1117839085,R20-M1-NF-C:J13-U01,RAS KERNEL INFO generating core.17\n
2737,2737,1,2005-06-03,15.51.25,1117839085,R20-M1-N9-C:J17-U01,RAS KERNEL INFO generating core.784\n


In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence encoder used to turn log messages into embedding vectors.
# NOTE(review): the generic name `model` is easy to confuse with the
# classifiers trained later in this notebook.
model = SentenceTransformer('all-distilroberta-v1')

In [None]:
# Embed every training message in the `extra` column with the sentence encoder.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so this
# step appears to be slow — consider caching the result to disk.
embeddings_extra = model.encode(df_train.extra.values.tolist())

KeyboardInterrupt: ignored

In [None]:
# Expect (number of training rows, embedding size).
embeddings_extra.shape

(270939, 768)

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature matrix: one embedding per training row. Despite the `df_` prefix
# these are NumPy arrays, not DataFrames.
df_train_X = np.asarray(embeddings_extra)
# Target vector: the `status` column of the cleaned training frame.
df_train_y = df_train.status.values

# X_train, X_test, y_train, y_test = train_test_split(df_train_X, df_train_y, test_size=0.33, random_state=42)

from sklearn.svm import SVC
# clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
# clf.fit(X_train, y_train)

In [None]:
# from sklearn.metrics import f1_score
# print(f1_score(clf.predict(X_test), y_test, average='macro'))

In [None]:
# print(f1_score(clf.predict(X_train), y_train, average='macro'))

In [None]:
# SVM baseline: standardise every embedding dimension, then fit an SVC.
# It is fitted on the whole training set, so no rows are held out for validation.
clf_whole = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_whole.fit(df_train_X,df_train_y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [None]:
import xgboost as xgb

# Gradient-boosted baseline with a binary logistic objective, fitted on the
# same full training set as the SVM; random_state pins the seed.
xgb_model = xgb.XGBClassifier(objective='binary:logistic',random_state=42)
xgb_model.fit(df_train_X, df_train_y)

XGBClassifier(random_state=42)

In [None]:
!pip -q install torchmetrics

[?25l[K     |▋                               | 10 kB 29.7 MB/s eta 0:00:01[K     |█▎                              | 20 kB 36.7 MB/s eta 0:00:01[K     |██                              | 30 kB 45.2 MB/s eta 0:00:01[K     |██▋                             | 40 kB 43.5 MB/s eta 0:00:01[K     |███▏                            | 51 kB 47.2 MB/s eta 0:00:01[K     |███▉                            | 61 kB 52.3 MB/s eta 0:00:01[K     |████▌                           | 71 kB 33.3 MB/s eta 0:00:01[K     |█████▏                          | 81 kB 34.7 MB/s eta 0:00:01[K     |█████▊                          | 92 kB 37.0 MB/s eta 0:00:01[K     |██████▍                         | 102 kB 38.8 MB/s eta 0:00:01[K     |███████                         | 112 kB 38.8 MB/s eta 0:00:01[K     |███████▊                        | 122 kB 38.8 MB/s eta 0:00:01[K     |████████▎                       | 133 kB 38.8 MB/s eta 0:00:01[K     |█████████                       | 143 kB 38.8 MB/s eta 0:

In [None]:
import torch
from torch import nn
from torchmetrics import Accuracy

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Model(nn.Module):
    """Fully connected classifier head for fixed-size embedding vectors.

    Layer widths are ``in_features -> 1024 -> 256 -> 32 -> out_features``.
    Each hidden linear layer is followed by a ReLU and the last linear layer
    by a Sigmoid, so every output value lies between 0 and 1.

    Args:
        in_features: size of each input vector (default 768).
        out_features: number of output values per input vector (default 1).
    """

    def __init__(self,in_features=768,out_features=1):
        super().__init__()

        widths = [in_features, 1024, 256, 32]
        layers = []
        # One Linear + ReLU pair per consecutive pair of hidden widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output layer, squashed to the [0, 1] range.
        layers.append(nn.Linear(widths[-1], out_features))
        layers.append(nn.Sigmoid())

        self.blackbox = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.blackbox(x)

# Train the feed-forward classifier on the sentence embeddings with full-batch
# gradient descent and report hold-out metrics every 100 epochs.
blackboxmodel = Model().to(device)

loss_fn = nn.BCELoss()
# Optimise the classifier's own weights. `model` is the SentenceTransformer
# encoder; handing its parameters to the optimiser would leave the classifier
# untrained.
optimizer = torch.optim.SGD(params=blackboxmodel.parameters(), lr=0.1)
acc_fn = Accuracy(task="binary").to(device)

X_train, X_test, y_train, y_test = train_test_split(df_train_X, df_train_y, test_size=0.2, random_state=42)

# train_test_split returns NumPy arrays, which have no `.to()` method, so they
# are converted to tensors on the target device. BCELoss needs floating-point
# targets, hence float32 for the labels as well.
X_train = torch.as_tensor(X_train, dtype=torch.float32, device=device)
X_test = torch.as_tensor(X_test, dtype=torch.float32, device=device)
y_train = torch.as_tensor(y_train, dtype=torch.float32, device=device)
y_test = torch.as_tensor(y_test, dtype=torch.float32, device=device)

epochs = 3000

for epoch in range(epochs):
    blackboxmodel.train()
    # squeeze() drops the trailing size-1 output dimension so that the
    # predictions have the same shape as the label vector.
    y_prob_pred = blackboxmodel(X_train).squeeze()
    y_pred = torch.round(y_prob_pred)

    loss = loss_fn(y_prob_pred, y_train)
    acc = acc_fn(y_pred, y_train.int())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        blackboxmodel.eval()
        with torch.inference_mode():
            test_prob_pred = blackboxmodel(X_test).squeeze()
            test_pred = torch.round(test_prob_pred)

            test_loss = loss_fn(test_prob_pred, y_test)
            test_acc = acc_fn(test_pred, y_test.int())

        # The metric returns a fraction in [0, 1]; scale it to match the "%" label.
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc * 100:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc * 100:.2f}%")

AttributeError: ignored

In [None]:
# Read the test CSV and extract the free-text message from the raw ' Log' line.
# TESTPATH points at the same file the path literal used to spell out.
test_data = pd.read_csv(TESTPATH)

# Each ' Log' line is a run of space-separated header fields followed by the
# message. Peel the fields off one at a time: partition() yields
# (head, separator, rest), so column 0 is the next field and column 2 the rest.
test_data_1 = test_data[' Log'].str.lstrip(' ')
r = test_data_1.str.partition(' ',expand=True)
test_data['Seconds'] = r[0]              # 1st field
s = r[2].str.partition(' ',expand=True)  # 2nd field (s[0]) is not kept

t = s[2].str.partition(' ',expand=True)
test_data['weird_code'] = t[0]           # 3rd field
u = t[2].str.partition(' ',expand=True)
test_data['time_code'] = u[0]            # 4th field
v = u[2].str.partition(' ',expand=True)  # 5th field (v[0]) is not kept
test_data['extra'] = v[2]                # everything after the 5th field
test_data['extra'] = test_data['extra'].str.rstrip('\n')

# rename() skips keys that are not column names, so only 'extra' -> 'input'
# takes effect on this frame.
renaming_dictionary = {"Unnamed: 0": "index", "extra": "input", "status": "msg_type"}

# Same fragment list as used for the training data: a message containing one
# of these fragments is replaced by the fragment alone.
extra_string_array = ["RAS KERNEL FATAL rts tree/torus link training failed: wanted:",
                        "RAS APP FATAL ciod: failed to read message prefix on control stream",
                        "RAS KERNEL FATAL data address:",
                        "RAS KERNEL FATAL 12:28244842 13:1eeeeeee 14:ffffffff",
                        "RAS KERNEL FATAL instruction address:",
                        "RAS KERNEL FATAL fpr"]
for i in extra_string_array:
    # regex=False: the fragments are literal text, not patterns.
    # na=False: rows without a message count as "no match" instead of putting
    # NaN into the mask, which would make the .loc assignment raise.
    test_data.loc[test_data["extra"].str.contains(i, regex=False, na=False), "extra"] = i


# Only the message text (and whatever columns the CSV had besides ' Log') is
# needed from here on.
test_data = test_data.drop(["time_code", " Log","Seconds","weird_code"], axis=1)

test_data = test_data.rename(columns=renaming_dictionary)
test_data.head()

Unnamed: 0,ID,input
0,0,RAS KERNEL FATAL rts: kernel terminated for re...
1,1,RAS KERNEL FATAL data TLB error interrupt
2,2,RAS KERNEL FATAL data TLB error interrupt
3,3,RAS KERNEL INFO generating core.6463
4,4,RAS KERNEL FATAL data TLB error interrupt


In [None]:
# Embed the test messages with the same sentence encoder used for training.
test_embeddings = model.encode(test_data.input.values.tolist())
# Despite the `df_` prefix this is a NumPy array, not a DataFrame.
df_test_X = np.asarray(test_embeddings)

In [None]:
# y_pred1 = clf.predict(df_test_X)
# y_pred2 = clf_whole.predict(df_test_X)
# y_pred3 = xgb_model.predict(df_test_X)

In [None]:
# Predict the test labels with both fitted baselines (SVM pipeline and XGBoost).
y_pred_svm = clf_whole.predict(df_test_X) 
y_pred_xgb = xgb_model.predict(df_test_X)

In [None]:
# Collect the test IDs together with the predictions of both models.
# The leading space in the label column names is intentional and preserved.
submission = pd.DataFrame({
    'ID': test_data['ID'],
    ' Label_svm': y_pred_svm,
    ' Label_xgb': y_pred_xgb,
})

In [None]:
# Save the frame to models.csv in the working directory, without the index.
submission.to_csv('models.csv',index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# The frame only holds the per-model columns ' Label_svm' and ' Label_xgb', so
# the final ' Label' column is derived from the XGBoost predictions (matching
# the output file name) rather than read from a column that does not exist.
submission[' Label'] = submission[' Label_xgb'].map({0:'abnormal',1:'normal'})
# Write only the ID and the final label; the raw per-model columns are kept in
# the frame but left out of the submission file.
submission[['ID', ' Label']].to_csv('xgboost_submission.csv',index=False)

In [None]:
# Preview the first rows of the submission frame after the label mapping.
submission.head()

In [None]:
# Sum of the predicted labels and the shape of the prediction array. Uses the
# XGBoost predictions: `y_pred1` is only assigned in a commented-out cell, so
# it is undefined on a fresh kernel.
y_pred_xgb.sum(), y_pred_xgb.shape

In [None]:
# Compare this notebook's labels with another submission file.
# NOTE(review): the path is absolute and lies outside the mounted Drive folder,
# so the file has to be uploaded to the Colab runtime first.
df_ayush = pd.read_csv("/content/submission.csv")

# Both frames must provide a ' Label' column (note the leading space).
my_preds = submission[' Label'].values
ayush_preds = df_ayush[' Label'].values

# Number of positions at which the two label arrays are equal.
# presumably both files list the rows in the same order — verify before relying on this count
print(sum(my_preds == ayush_preds))

In [None]:
# Show the test rows whose message text contains 'RAS KERNEL FATAL'.
test_data[test_data.input.str.contains('RAS KERNEL FATAL')]

In [None]:
# Row and column count of the prepared test frame.
test_data.shape