In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Load and preprocess the data
file_path = "C:\\college\\RCA_project\\cleaned\\jira_priority_cleaned.csv"
df = pd.read_csv(file_path, encoding='latin1')

# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['Summary'] = df['Summary'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Tokenization and Embedding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tokenizer(df['Summary'].tolist(), padding=True, truncation=True, return_tensors="tf")['input_ids']

# Get BERT embeddings
bert_embeddings = bert_model(input_ids)[0]

# RNN-LSTM Model
def create_model(input_shape):
    # Input for the BERT embeddings
    bert_input = Input(shape=(input_shape[0], input_shape[1]), dtype=tf.float32)
    
    # Additional GloVe embedding layer
    glove_input = Input(shape=(input_shape[0],), dtype=tf.int32)
    glove_embedding_layer = Embedding(input_dim=50000, output_dim=300, input_length=input_shape[0])(glove_input)
    
    # Combine BERT and GloVe embeddings
    combined = Concatenate()([bert_input, glove_embedding_layer])
    lstm_out = Bidirectional(LSTM(64, return_sequences=True))(combined)
    pooling = GlobalMaxPooling1D()(lstm_out)
    dense = Dense(64, activation='relu')(pooling)
    outputs = Dense(4, activation='softmax')(dense)
    
    model = Model(inputs=[bert_input, glove_input], outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Stratified k-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y = pd.get_dummies(df['Priority']).values

accuracies = []
for train_index, test_index in skf.split(input_ids, df['Priority']):
    X_train_bert, X_test_bert = tf.gather(bert_embeddings, train_index), tf.gather(bert_embeddings, test_index)
    X_train_glove, X_test_glove = tf.gather(input_ids, train_index), tf.gather(input_ids, test_index)
    y_train, y_test = tf.gather(y, train_index), tf.gather(y, test_index)
    
    model = create_model((X_train_bert.shape[1], X_train_bert.shape[2]))
    model.fit([X_train_bert, X_train_glove], y_train, epochs=100, batch_size=32, validation_data=([X_test_bert, X_test_glove], y_test))
    _, accuracy = model.evaluate([X_test_bert, X_test_glove], y_test)
    accuracies.append(accuracy)

    # Calculate accuracy using sklearn's accuracy_score
    y_pred = np.argmax(model.predict([X_test_bert, X_test_glove]), axis=1)
    y_true = np.argmax(y_test, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

average_accuracy = np.mean(accuracies)
print(f'Average accuracy: {average_accuracy:.2f}')

# Predictions and Output
predictions = model.predict([bert_embeddings, input_ids])
predicted_labels = np.argmax(predictions, axis=1)

# Convert predicted_labels to a Pandas Series
predicted_labels_series = pd.Series(predicted_labels)

# Map numeric predictions to labels
label_mapping = {0: 'blocker', 1: 'critical', 2: 'major', 3: 'minor'}
df['Predicted Priority'] = predicted_labels_series.map(label_mapping)

# Displaying results
priority_counts = df['Predicted Priority'].value_counts()
print(priority_counts)

# Saving results to CSV for search and sort functionality
df.to_csv('predicted_priorities_final.csv', index=False)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91882\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initializ

Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 558ms/step - accuracy: 0.6214 - loss: 1.1098 - val_accuracy: 0.6481 - val_loss: 1.0171
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 759ms/step - accuracy: 0.6405 - loss: 1.0084 - val_accuracy: 0.6481 - val_loss: 1.0035
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 865ms/step - accuracy: 0.6321 - loss: 0.9565 - val_accuracy: 0.6481 - val_loss: 1.0000
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 269ms/step - accuracy: 0.6518 - loss: 0.8502 - val_accuracy: 0.6481 - val_loss: 0.9611
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 566ms/step - accuracy: 0.7205 - loss: 0.7078 - val_accuracy: 0.6481 - val_loss: 0.9653
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 420ms/step - accuracy: 0.7538 - loss: 0.5680 - val_accuracy: 0.6574 - val_loss: 0.9547
Epoch 7/100
[1m14/



[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 170ms/step - accuracy: 0.5363 - loss: 1.1318 - val_accuracy: 0.6542 - val_loss: 1.0354
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 127ms/step - accuracy: 0.6666 - loss: 0.9707 - val_accuracy: 0.6542 - val_loss: 1.0179
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 123ms/step - accuracy: 0.6708 - loss: 0.9090 - val_accuracy: 0.6542 - val_loss: 1.0210
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 124ms/step - accuracy: 0.6413 - loss: 0.8751 - val_accuracy: 0.6355 - val_loss: 1.0229
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 120ms/step - accuracy: 0.7237 - loss: 0.6880 - val_accuracy: 0.6075 - val_loss: 1.0394
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 122ms/step - accuracy: 0.8229 - loss: 0.5382 - val_accuracy: 0.5981 - val_loss: 1.0824
Epoch 7/100
[1m14/14[0m [32m━━━



[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 241ms/step - accuracy: 0.5664 - loss: 1.1740 - val_accuracy: 0.6542 - val_loss: 1.0310
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 203ms/step - accuracy: 0.6799 - loss: 0.9570 - val_accuracy: 0.6542 - val_loss: 1.0065
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 203ms/step - accuracy: 0.6761 - loss: 0.9125 - val_accuracy: 0.6542 - val_loss: 0.9874
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 206ms/step - accuracy: 0.6940 - loss: 0.7953 - val_accuracy: 0.6636 - val_loss: 1.0162
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209ms/step - accuracy: 0.6864 - loss: 0.7816 - val_accuracy: 0.6542 - val_loss: 0.9566
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 210ms/step - accuracy: 0.7292 - loss: 0.6104 - val_accuracy: 0.6168 - val_loss: 0.9830
Epoch 7/100
[1m14/14[0m [32m━━━



[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 230ms/step - accuracy: 0.5059 - loss: 1.2001 - val_accuracy: 0.6636 - val_loss: 0.9978
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 171ms/step - accuracy: 0.6540 - loss: 0.9830 - val_accuracy: 0.6636 - val_loss: 1.0040
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 170ms/step - accuracy: 0.6734 - loss: 0.9177 - val_accuracy: 0.6636 - val_loss: 0.9852
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 175ms/step - accuracy: 0.6378 - loss: 0.8716 - val_accuracy: 0.6636 - val_loss: 0.9118
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 166ms/step - accuracy: 0.6664 - loss: 0.7481 - val_accuracy: 0.6542 - val_loss: 0.9383
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 171ms/step - accuracy: 0.7936 - loss: 0.5979 - val_accuracy: 0.6822 - val_loss: 0.9409
Epoch 7/100
[1m14/14[0m [32m━━━



[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 194ms/step - accuracy: 0.5406 - loss: 1.1723 - val_accuracy: 0.6542 - val_loss: 1.0192
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 157ms/step - accuracy: 0.6798 - loss: 0.9648 - val_accuracy: 0.6542 - val_loss: 1.0160
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 159ms/step - accuracy: 0.6878 - loss: 0.9497 - val_accuracy: 0.6542 - val_loss: 1.0018
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 159ms/step - accuracy: 0.6620 - loss: 0.9122 - val_accuracy: 0.6542 - val_loss: 0.9747
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 157ms/step - accuracy: 0.6449 - loss: 0.8485 - val_accuracy: 0.6636 - val_loss: 0.9786
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 161ms/step - accuracy: 0.7465 - loss: 0.7039 - val_accuracy: 0.6636 - val_loss: 0.9652
Epoch 7/100
[1m14/14[0m [32m━━━

In [None]:
import matplotlib.pyplot as plt

# Plotting Pareto Chart
def pareto_chart(data, title="Pareto Chart"):
    sorted_counts = data.sort_values(ascending=False)
    cumulative_percentage = sorted_counts.cumsum() / sorted_counts.sum() * 100

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plotting the bar chart
    sorted_counts.plot(kind='bar', color='skyblue', ax=ax1)
    ax1.set_ylabel('Frequency', color='skyblue')
    ax1.set_title(title)

    # Plotting the cumulative percentage line
    ax2 = ax1.twinx()
    cumulative_percentage.plot(marker='o', color='red', ax=ax2)
    ax2.set_ylabel('Cumulative Percentage', color='red')

    # Show grid for better readability
    ax1.grid(True)

    # Show plot
    plt.show()

# Create Pareto Chart for Predicted Priority Labels
pareto_chart(priority_counts, title="Pareto Chart of Predicted Priorities")
