# MLOps Assignment 1
# Text Classification - **Testing**

## Model Deployment

### Load the saved model

In [1]:
import pickle

# Define the file path where the trained model is saved
# (a relative path, so it is resolved against the kernel's current working directory)
model_file_path = "naive_bayes_model.pkl"

# Load the saved Naive Bayes model from the file
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load model files that come from a trusted source.
with open(model_file_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Reached only if the file was opened and unpickled without raising
print("Trained model loaded successfully")

Trained model loaded successfully


### Functions to Preprocess Text

In [None]:
import nltk
# Fetch the NLTK resources used by the preprocessing cells below.
nltk.download('punkt')
# Newer NLTK releases make word_tokenize() load the 'punkt_tab' resource instead
# of the pickled 'punkt' models; fetch both so lemmatize() works on either version.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

In [3]:
# Imports for the preprocessing helpers, followed by a function to remove punctuation from text
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: Value to clean. It is coerced with ``str()`` first, so non-string
            inputs are accepted.

    Returns:
        str: The input with all characters of ``string.punctuation`` deleted.
    """
    # A translation table is used instead of a regex character class: splicing
    # string.punctuation unescaped into '[...]' turns its '\]' into an escaped
    # ']', which silently leaves backslashes in the text.
    strip_table = str.maketrans('', '', string.punctuation)
    return str(text).translate(strip_table)

# Function to remove URLs from text
def remove_urls(text):
    """Delete every http:// or https:// URL from ``text``.

    A URL is matched from its scheme up to the next whitespace character.
    """
    url_pattern = re.compile(r'http[s]?://\S+')
    return url_pattern.sub('', text)

# Function to convert the text into lower case
def lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Function to lemmatize text
def lemmatize(text):
    """Tokenize ``text`` and lemmatize each token with the WordNet lemmatizer.

    Returns:
        str: The lemmas in order, each followed by a single space (so a
        non-empty result ends with a trailing space). An input that yields
        no tokens gives the empty string.
    """
    lemmatizer = WordNetLemmatizer()
    words = nltk.word_tokenize(text)
    # Each lemma carries its own trailing space, matching the original output.
    return ''.join(lemmatizer.lemmatize(word) + ' ' for word in words)

### Function to Predict Class

In [None]:
import torch
from transformers import AutoTokenizer

# Load the BERT tokenizer
# The "bert-base-cased" tokenizer is kept in the module-level name `tokenizer`,
# which tokenize_data() calls and convert_to_tensors() reads pad_token_id from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize the text data
def tokenize_data(data):
    """Run the module-level ``tokenizer`` on ``data`` and return its encoding.

    The tokenizer is asked for PyTorch tensors (``return_tensors='pt'``) with
    padding and truncation both enabled.
    """
    encoding = tokenizer(
        data,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoding

# Convert tokenized data into tensors
def convert_to_tensors(data, max_length=512):
    """Pad (or crop) the tokenized batch to a fixed sequence length.

    Args:
        data: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors
            whose second dimension is the sequence length.
        max_length: Target sequence length. Defaults to 512, the value that
            was previously hard-coded.

    Returns:
        dict: ``{'input_ids': ..., 'attention_mask': ...}`` holding new tensors
        of width ``max_length``. ``input_ids`` is right-padded with
        ``tokenizer.pad_token_id`` and ``attention_mask`` with 0.
    """
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']

    # The pad width is (left, right) on the last dimension. When the sequence
    # is longer than max_length the right width is negative, which crops.
    padded = {
        'input_ids': torch.nn.functional.pad(
            input_ids,
            (0, max_length - input_ids.size(1)),
            value=tokenizer.pad_token_id,
        ),
        'attention_mask': torch.nn.functional.pad(
            attention_mask,
            (0, max_length - attention_mask.size(1)),
            value=0,
        ),
    }

    # clone().detach() is the recommended way to copy an existing tensor;
    # torch.tensor(existing_tensor) emits a UserWarning.
    return {key: val.clone().detach() for key, val in padded.items()}

def predict_class(input_text, model):
    """Preprocess ``input_text``, encode it, and return ``model``'s prediction.

    Args:
        input_text: Raw text to classify.
        model: Object exposing ``predict(features)``; it receives the padded
            token-id matrix as a 2-D NumPy array.

    Returns:
        Whatever ``model.predict`` returns for the single-text batch.
    """
    # NOTE(review): punctuation is stripped before URLs are removed, so the
    # '://' that remove_urls looks for is already gone and URLs are never
    # matched. The order is kept as-is here; confirm against the training
    # pipeline before changing it.
    cleaned = input_text
    for preprocess in (remove_punctuation, remove_urls, lower_case, lemmatize):
        cleaned = preprocess(cleaned)

    # Encode the cleaned text and bring it to the fixed-width tensor layout
    encoded = tokenize_data(str(cleaned))
    tensors = convert_to_tensors(encoded)

    # Only the token ids are used as features; flatten to (batch, features)
    token_ids = tensors['input_ids'].numpy()
    features = token_ids.reshape(token_ids.shape[0], -1)

    return model.predict(features)

## Model Testing

In [38]:
# Sample Text 1: short review with positive wording
input_text = "Great movie. I liked it!"
# Run the full preprocess -> tokenize -> predict pipeline with the loaded model
predicted_class = predict_class(input_text, loaded_model)
# The recorded output for this cell is [1]
print(f"Predicted class: {predicted_class}")

Predicted class: [1]


In [28]:
# Sample Text 2: multi-sentence review with negative wording
input_text = "I was extremely disappointed with this movie. The plot was predictable, the characters were one-dimensional, and the dialogue felt forced. It seemed like the filmmakers put more effort into the visuals than into crafting a compelling story. Overall, I found it to be a waste of time and money."
# Run the full preprocess -> tokenize -> predict pipeline with the loaded model
predicted_class = predict_class(input_text, loaded_model)
# The recorded output for this cell is [0]
print(f"Predicted class: {predicted_class}")

Predicted class: [0]


In [29]:
# Sample Text 3: two-word review with positive wording
input_text = "Good movie!"
# Run the full preprocess -> tokenize -> predict pipeline with the loaded model
predicted_class = predict_class(input_text, loaded_model)
# The recorded output for this cell is [1]
print(f"Predicted class: {predicted_class}")

Predicted class: [1]


In [30]:
# Sample Text 4: short review with negative wording
input_text = "Extremely bad movie. I hate it!"
# Run the full preprocess -> tokenize -> predict pipeline with the loaded model
predicted_class = predict_class(input_text, loaded_model)
# The recorded output for this cell is [0]
print(f"Predicted class: {predicted_class}")

Predicted class: [0]


In [31]:
# Sample Text 5: long review with strongly positive wording
input_text = "I recently watched the latest Marvel movie, and I have to say, it was absolutely fantastic! From start to finish, the film kept me entertained with its thrilling action sequences, witty humor, and engaging storyline. The special effects were top-notch, and the performances by the cast were outstanding. I couldn't help but be immersed in the world of the movie, and I left the theater feeling exhilarated and wanting more. It's definitely a must-watch for any Marvel fan, and I can't wait to see it again!"
# Run the full preprocess -> tokenize -> predict pipeline with the loaded model
predicted_class = predict_class(input_text, loaded_model)
# NOTE(review): the recorded output for this cell is [0], the same label the
# negatively worded samples received, although this review reads as positive.
# Presumably a misclassification; confirm what labels 0/1 mean against the
# training notebook.
print(f"Predicted class: {predicted_class}")

Predicted class: [0]
