# importing the libraries

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# importing the training file

In [3]:
# Read the training logs (one JSON entry per log line), turn the JSON keys
# into an ordinary column and give both columns readable names.
df = (
    pd.read_json("/kaggle/input/convolve-epoch1/train.json", orient="index")
    .reset_index()
    .rename(columns={"index": " Log", 0: "label"})
)

df

Unnamed: 0,Log,label
0,1117838570 2005.06.03 R02-M1-N0-C:J12-U11 200...,normal
1,1117838570 2005.06.03 R02-M1-N0-C:J12-U11 200...,normal
2,1117838570 2005.06.03 R02-M1-N0-C:J12-U11 200...,normal
3,1117838570 2005.06.03 R02-M1-N0-C:J12-U11 200...,normal
4,1117838570 2005.06.03 R02-M1-N0-C:J12-U11 200...,normal
...,...,...
4152654,1118545530 2005.06.11 R30-M0-N9-C:J16-U01 2005...,abnormal
4152655,1118545530 2005.06.11 R30-M0-N9-C:J16-U01 2005...,abnormal
4152656,1118545530 2005.06.11 R30-M0-N9-C:J16-U01 2005...,abnormal
4152657,1118545530 2005.06.11 R30-M0-N9-C:J16-U01 2005...,abnormal


In [4]:
# Under-sampling the data because the vast majority of the labels are "normal"

In [5]:
# Display one raw log line to see which whitespace-separated fields it contains.
df.loc[1,' Log']

' 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.527847 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n'

# Preprocessing

In [6]:
# Split every raw log line into its four leading whitespace-separated fields
# (columns 0-3) and the untouched remainder of the line (column 9).
col = [0, 1, 2, 3, 9]
df[col] = df[" Log"].str.split(n=4, expand=True)
# Unless field 2 is "-", the remainder starts with one extra token (in the
# sample row displayed earlier it repeats field 2); drop it by splitting on the
# first single space. A comprehension over the two columns produces the same
# values as a row-wise DataFrame.apply without building a Series per row,
# which matters with millions of rows.
df[9] = [
    rest.split(" ", 1)[1] if field != "-" else rest
    for field, rest in zip(df[2], df[9])
]
# Drop the next leading token too; what remains is the message text.
df[9] = df[9].str.split(n=1, expand=True)[1]

df = df.rename(columns={9: "Details"})
# Binary target: 1 for "abnormal", 0 for every other label.
df["Abnormal"] = (df["label"] == "abnormal").astype("int64")
df = df[["Details", "Abnormal"]]
print("preprocess done")
df

preprocess done


Unnamed: 0,Details,Abnormal
0,KERNEL INFO instruction cache parity error cor...,0
1,KERNEL INFO instruction cache parity error cor...,0
2,KERNEL INFO instruction cache parity error cor...,0
3,KERNEL INFO instruction cache parity error cor...,0
4,KERNEL INFO instruction cache parity error cor...,0
...,...,...
4152654,KERNEL FATAL data TLB error interrupt,1
4152655,KERNEL FATAL data TLB error interrupt,1
4152656,KERNEL FATAL data TLB error interrupt,1
4152657,KERNEL FATAL data TLB error interrupt,1


In [7]:
# Rows labelled abnormal (Abnormal == 1).
df2 = df[df["Abnormal"] == 1]

In [8]:
# Rows labelled normal (Abnormal == 0).
df3 = df[df["Abnormal"] == 0]

In [9]:
# Row count, dtypes and memory footprint of the normal-only subset.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4082967 entries, 0 to 4082966
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   Details   object
 1   Abnormal  int64 
dtypes: int64(1), object(1)
memory usage: 93.5+ MB


In [10]:
# Shuffle the abnormal rows (every row is kept, only the order changes).
# A fixed seed makes the order reproducible across kernel restarts.
df2 = df2.sample(frac=1, random_state=1)

In [11]:
# Peek at the first shuffled abnormal rows.
df2.head()

Unnamed: 0,Details,Abnormal
4149160,KERNEL FATAL data TLB error interrupt,1
4102290,KERNEL FATAL data TLB error interrupt,1
4122456,KERNEL FATAL data TLB error interrupt,1
4136051,KERNEL FATAL data TLB error interrupt,1
4097986,KERNEL FATAL data TLB error interrupt,1


In [12]:
# Stack the shuffled abnormal rows on top of the normal rows.
# ignore_index=True renumbers the rows 0..n-1 so that index labels follow the
# new row order; otherwise label-based lookups such as df.loc[i, ...] return a
# different row than positional views such as df["Abnormal"].values, and
# features built one way no longer line up with labels built the other way.
df = pd.concat([df2, df3], ignore_index=True)

In [13]:
from nltk.stem.porter import PorterStemmer as port_stm
import re
from nltk.corpus import stopwords as st
def normalise_txt(df):
    """Turn the ``Details`` column of ``df`` into a list of cleaned, stemmed strings.

    Every message has its non-letter characters replaced by spaces, is
    lower-cased and split on whitespace. English stop words (except "not",
    "t", "no" and "nor", which are kept) and the word "error" are dropped; the
    remaining words are Porter-stemmed and re-joined with single spaces.

    Rows are visited in positional order, so ``corpus[k]`` always belongs to
    the k-th row of ``df`` whatever its index labels are. That keeps the corpus
    aligned with positional views such as ``df["Abnormal"].values`` even when
    ``df`` was shuffled or concatenated without resetting its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Details`` column of log messages.

    Returns
    -------
    list of str
        One normalised document per row of ``df``, in row order.
    """
    # A set gives constant-time membership tests in the per-word filter, and
    # set difference does not fail if one of the kept words is absent from the
    # stop-word list.
    allwords = set(st.words("english")) - {"not", "t", "no", "nor"}
    ps = port_stm()
    corpus = []
    for text in df["Details"]:
        # A missing message (None/NaN) becomes an empty document instead of
        # making re.sub raise.
        if not isinstance(text, str):
            text = ""
        detes = re.sub("[^a-zA-Z]", " ", text).lower().split()
        detes = [ps.stem(word) for word in detes if word not in allwords and word != "error"]
        corpus.append(" ".join(detes))
    print("normalised")
    return corpus
    
# Build the cleaned text corpus for the training frame.
corpus = normalise_txt(df)

normalised


# Preparing the dataset for prediction

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
def extract_features(df, corpus, refit=False):
    """Convert a normalised corpus into a dense bag-of-words count matrix.

    The first call fits a ``CountVectorizer`` capped at 200 features and
    remembers it on the function object as ``extract_features.vectorizer``.
    Later calls reuse that fitted vocabulary and only transform, so a matrix
    built for new data has the same columns, in the same order, as the matrix
    the vocabulary was learned from. Fitting a fresh vectorizer on each corpus
    would give every matrix its own vocabulary, and a model trained on one
    could not be applied meaningfully to the other.

    Re-running the cell that defines this function discards the remembered
    vectorizer, so the next call fits again.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; retained so existing calls keep working.
    corpus : iterable of str
        Normalised documents, one per row.
    refit : bool, default False
        Learn a new vocabulary from ``corpus`` even if one is already
        remembered.

    Returns
    -------
    array
        Dense term-count matrix with one row per document.
    """
    cv = None if refit else getattr(extract_features, "vectorizer", None)
    if cv is None:
        cv = CountVectorizer(max_features=200)
        x = cv.fit_transform(corpus).toarray()
        # Keep the fitted vocabulary for subsequent (e.g. test-set) calls.
        extract_features.vectorizer = cv
    else:
        x = cv.transform(corpus).toarray()
    print("extraction")
    return x

# Feature matrix from the cleaned corpus and the 0/1 target vector.
# NOTE(review): y is taken in the frame's row order; corpus must have been
# built in that same order for x and y to line up — confirm.
x = extract_features(df, corpus)
y = df["Abnormal"].values

extraction


In [15]:
# Sanity check: (number of rows, number of features).
x.shape

(4152659, 200)

In [16]:
# Sanity check: one target value per row of x.
y.shape

(4152659,)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [19]:
# Display the training feature matrix.
xtrain

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Model Selection

In [None]:
import tensorflow as tf

In [None]:
# np.int was only an alias of the built-in int and no longer exists in current
# NumPy releases (AttributeError); the built-in yields the same integer dtype.
x = np.asarray(x).astype(int)

y = np.asarray(y).astype(int)

In [None]:
# Disabled experiment: a small Keras dense network, kept as an inert string
# literal so it does not execute.
# NOTE(review): the output layer applies softmax to a single unit, which always
# yields 1.0; a sigmoid would be needed for a one-unit binary output before
# re-enabling this.
"""model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128,activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1,activation=tf.nn.softmax))

model.compile(optimizer='adam',loss=tf.keras.losses.BinaryCrossentropy(),
             metrics=[tf.keras.metrics.BinaryAccuracy()])

model.fit(x, y, epochs=1)"""

In [None]:
# Disabled experiment (Gaussian naive Bayes), kept as an inert string literal.
# The literal has to be closed: an unterminated triple-quoted string makes the
# whole cell a SyntaxError and stops "Run All".
"""from sklearn.naive_bayes import GaussianNB
clsfr = GaussianNB()
clsfr.fit(xtrain, ytrain)"""

In [None]:

# Model-selection scratch cell. Only the XGBoost section below is live code;
# every triple-quoted block is an inert string literal holding an alternative
# classifier that was tried and then disabled.

# Disabled: k-nearest neighbours and SVM.
"""from sklearn.neighbors import KNeighborsClassifier
clsfr = KNeighborsClassifier(n_neighbors=10)
clsfr.fit(xtrain, ytrain)

from sklearn.svm import  SVC
clsfr = SVC()
clsfr.fit(xtrain, ytrain)"""


# Active model: XGBoost classifier fitted on the training split. The fitted
# estimator is bound to `clsfr`, which the evaluation and prediction cells use.
# NOTE(review): base_score=0.54 looks hand-tuned — confirm why this value.
from xgboost import XGBClassifier as xgbc
clsfr = xgbc(base_score=0.54, grow_policy="lossguide")
clsfr.fit(xtrain, ytrain)

# Disabled: random forest and a grid-searched decision tree.
"""from sklearn.ensemble import RandomForestClassifier
clsfr = RandomForestClassifier(n_estimators=10)
clsfr.fit(xtrain, ytrain)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
clsfr = GridSearchCV(DecisionTreeClassifier(),
             {
                 "max_depth": [2, 3, 4, None],
                 "criterion": ["gini", "entropy"],
                 "min_samples_split": [2, 50, 20],
                 "max_leaf_nodes": [None, 1, 2]
             },
            scoring="f1_micro",
            verbose=4)
clsfr.fit(xtrain, ytrain)
"""

# Disabled: MLP, Gaussian/multinomial naive Bayes, logistic regression and a
# second random forest.
"""from sklearn.neural_network import MLPClassifier
clsfr = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,20))
clsfr.fit(xtrain, ytrain)

from sklearn.naive_bayes import GaussianNB
clsfr = GaussianNB()
clsfr.fit(xtrain, ytrain)

from sklearn.linear_model import LogisticRegression
clsfr = LogisticRegression(penalty="l1", solver="saga")
clsfr.fit(xtrain, ytrain)

from sklearn.naive_bayes import MultinomialNB
clsfr = MultinomialNB(fit_prior=True)
clsfr.fit(xtrain, ytrain)

from sklearn.ensemble import RandomForestClassifier
clsfr = RandomForestClassifier(n_estimators=15)
clsfr.fit(xtrain, ytrain)

"""
# Presumably here so the cell's last expression is not the string literal
# above (which would otherwise be echoed as the cell output).
print()

# Evaluating a model

In [1]:
# Predict the held-out split with the fitted classifier.
# NOTE(review): the saved output of this cell is a NameError for `clsfr`, and
# its execution count is 1 — it was apparently run on a fresh kernel before the
# model cell. It needs `clsfr` and `xtest` from the cells above.
pred = clsfr.predict(xtest)

from sklearn.metrics import confusion_matrix
# Counts of each (true label, predicted label) combination on the held-out split.
print(confusion_matrix(ytest, pred))

NameError: name 'clsfr' is not defined

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Print each headline metric for the held-out predictions, one per line.
reports = (
    ("Accuracy =", accuracy_score),
    ("Precision =", precision_score),
    ("Recall = ", recall_score),
    ("F-score = ", f1_score),
)
for caption, scorer in reports:
    print(caption, scorer(ytest, pred))

In [None]:
# K-fold Cross Validation
# Disabled: kept as an inert string literal, so nothing below executes.
# When enabled it would score `clsfr` on the training split with 10 folds and
# summarise the per-fold scores.
"""
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=10)
scores = pd.Series(cross_val_score(clsfr, xtrain, ytrain, cv=kf))

print(scores)
scores.describe()
"""

# Working on the test data

## Importing and Preprocessing

In [None]:
test = pd.read_csv("/kaggle/input/convolve-epoch1/test.csv")

# Split every raw log line into its four leading whitespace-separated fields
# (columns 0-3) and the untouched remainder of the line (column 9).
columns = [0, 1, 2, 3, 9]
test[columns] = test[" Log"].str.split(n=4, expand=True)
# Unless field 2 is "-", drop the extra leading token of the remainder by
# splitting on the first single space. A comprehension over the two columns
# gives the same values as a row-wise DataFrame.apply without building a
# Series for every row.
test[9] = [
    rest.split(" ", 1)[1] if field != "-" else rest
    for field, rest in zip(test[2], test[9])
]
# Drop the next leading token too; what remains is the message text.
test[9] = test[9].str.split(n=1, expand=True)[1]
test = test.rename(columns={9: "Details"})

In [None]:
# Clean the test messages and turn them into a feature matrix.
# NOTE(review): these features are fed to the classifier trained above, so they
# must use the vocabulary learned from the training corpus — verify that
# extract_features does not learn a new vocabulary from the test corpus here.
# This also rebinds `corpus` and `x`, replacing the training versions.
corpus = normalise_txt(test)
x = extract_features(test, corpus)

## Predicting the test dataset

In [None]:
# Predict a 0/1 label for every test row (rebinds `pred` from the evaluation step).
pred = clsfr.predict(x)

In [None]:
def _label_name(flag):
    """Translate a numeric prediction into the submission label text."""
    if flag == 0:
        return "normal"
    return "abnormal"


# Submission frame: the test IDs next to their predicted label names.
result = pd.DataFrame({
    "ID": test["ID"],
    " Label": pd.Series(pred),
})
result[" Label"] = result[" Label"].map(_label_name)
result

## Storing the results in submission.csv

In [None]:
# Write the predictions to submission.csv in the working directory, without the
# DataFrame index column.
result.to_csv("submission.csv", index=False)