In [None]:
# Reading the data
# Prompt for a file upload through Colab's `files` helper and keep the call's
# result in `uploaded`. The load cell later reads 'RCA_V3.csv' by name, so the
# uploaded file needs to have exactly that name.
from google.colab import files
uploaded = files.upload()


Saving RCA_V3.csv to RCA_V3.csv


In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

try:
    from keras.preprocessing.text import Tokenizer
except ModuleNotFoundError:
    # Newer Keras releases no longer provide `keras.preprocessing.text`
    # (this is the ModuleNotFoundError this cell used to raise); the
    # TensorFlow-bundled path still exposes the legacy Tokenizer.
    from tensorflow.keras.preprocessing.text import Tokenizer



# Step 2: Load only the two columns we need and rename them to 'text' / 'label'.
# read_csv ignores the order of `usecols` and returns the columns in file
# order, so the rename is done by column name (not by position). That way the
# root-cause column always becomes 'text' and the defect-domain column always
# becomes 'label', whatever their order in the CSV.
column_renames = {'RCA_Main_Root_Cause': 'text', 'RCA Defect Domain': 'label'}
data = (
    pd.read_csv('RCA_V3.csv', sep=None, engine='python', usecols=list(column_renames))
    .dropna(how='all')                # drop rows where both columns are empty
    .rename(columns=column_renames)
    .loc[:, ['text', 'label']]        # fixed column order for the later steps
)

# Preview the first rows instead of printing the whole frame.
print(data.head())

# Step 3: Count the missing entries in every column
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)

# Step 4: Report the (rows, columns) size of the frame
print("Data shape:", data.shape)

# Step 5: Bar chart of how many rows carry each label
label_counts = data['label'].value_counts()
label_counts.plot.bar()
plt.title('Target Balance')
plt.show()

# Step 6: Replace missing text with an empty string.
# The result is assigned back rather than calling fillna(inplace=True) on the
# data['text'] selection: the in-place form operates on an intermediate object,
# triggers pandas' chained-assignment warning, and leaves `data` unchanged when
# copy-on-write is active.
data['text'] = data['text'].fillna('')

# Rows that have text but no label survive dropna(how='all'); they cannot be
# used for supervised training and would otherwise be encoded as a class of
# their own, so remove them here.
data = data.dropna(subset=['label'])

# Step 7: Create Feature and Label sets
# (No second fillna on X is needed: 'text' has no missing values after Step 6.)
X = data['text']
y = data['label']

# Step 9: Map every class name to an integer id
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Step 10: Hold out 33% of the rows for testing (seed fixed at 42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.33,
    random_state=42,
)

# Step 11: Bag-of-words counts; the vocabulary is learned from the training text only
vectorizer = CountVectorizer().fit(X_train)
X_train_bow, X_test_bow = (vectorizer.transform(part) for part in (X_train, X_test))

# Step 12: Decode the integer ids back into the original class names
original_labels = label_encoder.inverse_transform(y_encoded)

# Step 13: Tokenization and padding
# The tokenizer is fitted on the training text only; both splits are then
# converted to integer sequences and padded to the longest training sequence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_len = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Step 14: Convert labels to one-hot encoding with to_categorical.
# num_classes is passed explicitly: without it the width is inferred from the
# largest id present in each split, so a class missing from one split would
# give train/test targets of different widths that no longer match the
# len(label_encoder.classes_) units of the output layer.
num_classes = len(label_encoder.classes_)
y_train_oh = to_categorical(y_train, num_classes=num_classes)
y_test_oh = to_categorical(y_test, num_classes=num_classes)

# Step 15: Define RNN (embedding -> LSTM -> softmax over the encoded classes)
# One more than the number of indexed words; presumably the extra slot is for
# index 0 used as padding - confirm against the tokenizer/padding settings.
vocab_size = 1 + len(tokenizer.word_index)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Fit on the padded training sequences, holding back 20% of them for validation
history = model.fit(
    X_train_pad,
    y_train_oh,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(X_test_pad, y_test_oh)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

ModuleNotFoundError: No module named 'keras.preprocessing.text'

In [None]:
pip install pandas scikit-learn shap tensorflow matplotlib


Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Confusion matrix of the trained classifier on the test split.
# Requires `model`, `X_test_pad`, `y_test` and `label_encoder` from the
# training cell above.

# Predicted class id = index of the largest softmax output for each test row.
y_pred = np.argmax(model.predict(X_test_pad), axis=1)

# Pass every class id explicitly so the matrix is always n_classes x n_classes,
# even when a class does not occur in the test split.
class_names = label_encoder.classes_
class_ids = np.arange(len(class_names))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(10, 6))
heatmap = ax.imshow(cm, cmap='Blues')
fig.colorbar(heatmap, ax=ax)

# Write the count into every cell (rows = true class, columns = predicted class).
for true_id in class_ids:
    for pred_id in class_ids:
        ax.text(pred_id, true_id, str(cm[true_id, pred_id]), ha='center', va='center')

ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_xticks(class_ids)
ax.set_xticklabels(class_names, rotation=45, ha='right')
ax.set_yticks(class_ids)
ax.set_yticklabels(class_names)
fig.tight_layout()
plt.show()


NameError: name 'data' is not defined

In [None]:
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
import shap
import tensorflow as tf

import numpy as np


def shap_values_for_class(values, class_index):
    """Return the (n_samples, n_features) SHAP matrix for one output class.

    Handles both layouts a multi-output explainer may return: a list holding
    one array per class, or a single array with the class on the last axis.
    A 2-D array (single-output model) is returned unchanged.
    """
    if isinstance(values, list):
        return np.asarray(values[class_index])
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, class_index]
    return values


# Initialize the JS visualization code for SHAP
shap.initjs()

class_index = 0  # output class whose attributions are plotted below

# Use DeepExplainer to explain predictions
# NOTE(review): DeepExplainer's support for recurrent layers depends on the
# installed TensorFlow / SHAP versions - if it fails here, shap.GradientExplainer
# or shap.KernelExplainer(model.predict, ...) are alternatives to verify.
background = X_train_pad[:100]  # Using a subset of training data for background
explained_samples = X_test_pad[:10]  # Explain predictions for a subset of test data
explainer = shap.DeepExplainer(model, background)
shap_values = explainer.shap_values(explained_samples)

class_shap = shap_values_for_class(shap_values, class_index)
base_value = float(np.asarray(explainer.expected_value).reshape(-1)[class_index])

# Each column of the padded input is a position in the sequence, not a
# vocabulary word, so the features are named by position. The words actually
# present in a sample are looked up from its token ids (ids without a word,
# i.e. padding, are shown as '<pad>').
position_names = [f'position_{i}' for i in range(explained_samples.shape[1])]
first_sample_words = np.array(
    [tokenizer.index_word.get(int(token_id), '<pad>') for token_id in explained_samples[0]]
)

# Plot the SHAP values for the first instance.
# force_plot returns a visualization object; it is not the last expression of
# the cell, so it has to be passed to the notebook's display() to be rendered.
display(
    shap.force_plot(
        base_value,
        class_shap[0],
        first_sample_words,
        feature_names=position_names,
    )
)

# Summary plot for a larger overview of the feature importances
shap.summary_plot(class_shap, explained_samples, feature_names=position_names)

# Dependence plot for a specific feature
chosen_feature_index = 0  # Index of the feature you are interested in
shap.dependence_plot(
    chosen_feature_index,
    class_shap,
    explained_samples,
    feature_names=position_names,
)