In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle as pk


# Load the three pickled artifacts used for inference. Further down the
# notebook `tv` is used via `.transform`, `model_svc` via `.predict` /
# `.predict_proba`, and `lb` via `.inverse_transform`.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a trusted source.
# `with` guarantees each file handle is closed once the object is loaded.
with open('model.pkl', 'rb') as fh:
    model_svc = pk.load(fh)
with open('tv.pkl', 'rb') as fh:
    tv = pk.load(fh)
with open('lb.pkl', 'rb') as fh:
    lb = pk.load(fh)


# Make sure the NLTK resources are available locally (downloaded in the same
# order as before), then create the shared Porter stemmer instance.
for _nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the JSON export into a DataFrame and pull out the column that is fed
# to the classifier.
sailor_df = pd.read_json(path_or_buf='sailorapp.json')
input_data = sailor_df.loc[:, 'Description']

In [5]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise one description into a space-joined string of stemmed tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stop words, stem each remaining token
    with the module-level Porter stemmer ``ps``, and join with single spaces.

    Parameters
    ----------
    text : str or None
        Raw description. ``None`` and float NaN (missing values) are treated
        as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives filtering.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Look the stop words up in a cached set instead of calling
    # stopwords.words('english') once per token.
    stop_words = _english_stopwords()

    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)

def predict_descriptions(input_data, batch_size=100):
    """Classify descriptions batch by batch.

    Each batch is cleaned with ``preprocess_text``, vectorised with ``tv``,
    converted to a dense array and scored with ``model_svc``.

    Parameters
    ----------
    input_data : pandas.Series
        Raw descriptions; batches are taken positionally with ``.iloc``.
    batch_size : int, default 100
        Maximum number of rows processed per batch.

    Returns
    -------
    predicted_labels
        ``lb.inverse_transform`` applied to the concatenated predictions
        (an empty NumPy array when ``input_data`` is empty).
    confidences : list
        For every row, column 1 of ``model_svc.predict_proba``.
        NOTE(review): this is the probability of the class at index 1, not
        the probability of whichever class was predicted -- confirm that this
        is the intended meaning of "confidence".

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total_rows = len(input_data)
    predictions = []
    confidences = []

    # Stepping by batch_size never yields an empty batch. The previous
    # `len // batch_size + 1` count produced an empty trailing batch whenever
    # the length was an exact multiple of batch_size (or zero), and that empty
    # batch was handed to the vectorizer and the model.
    for start_idx in range(0, total_rows, batch_size):
        batch_descriptions = input_data.iloc[start_idx:start_idx + batch_size]
        processed_descriptions = batch_descriptions.apply(preprocess_text)
        vectorized_data = tv.transform(processed_descriptions)

        # The model is fed a dense array, as in the original pipeline.
        dense_vectorized_data = vectorized_data.toarray()

        batch_predictions = model_svc.predict(dense_vectorized_data)
        batch_confidences = model_svc.predict_proba(dense_vectorized_data)

        predictions.extend(batch_predictions)
        confidences.extend(batch_confidences[:, 1])

    if not predictions:
        # Nothing to decode; skip the label decoder for empty input.
        return np.array([]), confidences

    predicted_labels = lb.inverse_transform(predictions)
    return predicted_labels, confidences

In [6]:
# Score every description, then assemble the original text, the decoded label
# and the score side by side in a single frame.
predictions, confidences = predict_descriptions(input_data)

predictions_df = pd.DataFrame({'Description': input_data}).assign(
    Predicted_Label=predictions,
    Confidence_Score=confidences,
)

In [7]:
# Display the assembled frame; the rendered table elides the middle rows.
predictions_df

Unnamed: 0,Description,Predicted_Label,Confidence_Score
0,Steps to reproduce:\r\n\r\n1) Launch 'VV App' ...,Valid,0.892974
1,Steps to reproduce:\r\n\r\n1) Launch 'VV App' ...,Valid,0.885809
2,Steps to reproduce:\r\n\r\n1) Launch 'VV App' ...,Valid,0.882104
3,"*Known issue, QA team is raising it for tracki...",Valid,0.864006
4,"*Known issue, QA team is raising it for tracki...",Valid,0.885835
...,...,...,...
5422,Build: PI7.6\n\nUnable to launch Crew App appl...,Valid,0.865099
5423,Date: 19th Jul 202\r\nVoyage Roll Over Took Pl...,Invalid,0.253609
5424,As per the definition in the Air Table [https:...,Valid,0.885852
5425,1. Select Future Voyage from the Voyage Drop D...,Invalid,0.151274
