In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing step: the tokenizer
# models, the stopword lists and the WordNet data for the lemmatizer.
# nltk.download reports "already up-to-date" when a package is present.
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

import pandas as pd

from tkinter import *
from tkinter import messagebox
from tkinter import ttk

# Create the root Tk window; the label, entry box and buttons created further
# down in this cell are all attached to it.
win = Tk()

# Set the initial size of the window using a Tk geometry string
win.geometry("700x350")

# Define a function to show the popup message with the input text
def show_msg():
    """Classify the text in the entry box and show the result in a popup.

    Reads the current contents of the module-level ``entry`` widget. When the
    text is empty or contains only whitespace, a warning popup asks the user
    to enter some text and the model pipeline is not run at all. Otherwise the
    text is passed through ``preprocessing`` -> ``wordEmbedBert`` ->
    ``preditBasedOntext`` and the first element of the returned prediction is
    displayed in an info popup.

    Returns:
        None. The outcome is reported only through ``messagebox`` dialogs.
    """
    user_input = entry.get()  # Get input from the text box

    # Validate before doing any work: embedding and prediction are expensive
    # and meaningless for blank input.
    if not user_input.strip():
        messagebox.showwarning("Message", "Please enter some text.")
        return

    text = preprocessing(user_input)
    embeding = wordEmbedBert(text)
    sentiment = preditBasedOntext(embeding)
    messagebox.showinfo("Message", f"Based on the text that you have entered, the sentiment is:\n{sentiment[0]}")

def clear_text():
    """Empty the text entry box by deleting everything from index 0 to END."""
    entry.delete(first=0, last=END)

# Heading displayed at the top of the window
heading_label = Label(win, text="Welcome to sentiment analysis", font='Arial 17 bold italic')
heading_label.pack(pady=30)

# Text box the user types into. show_msg and clear_text reach it through this
# module-level name, so it has to stay called `entry`.
entry = Entry(win, font='Arial 12', width=50)
entry.pack(pady=10)

# Widgets are packed in creation order, so "Check Sentiment" sits above "Clear"
check_button = ttk.Button(win, text="Check Sentiment", command=show_msg)
check_button.pack(pady=10)

clear_button = ttk.Button(win, text="Clear", command=clear_text)
clear_button.pack(pady=10)

# Hand control to the Tk event loop.
# NOTE: show_msg calls preprocessing, wordEmbedBert and preditBasedOntext,
# which are defined in other cells; those cells must have been run before a
# button is clicked here.
win.mainloop()


[nltk_data] Downloading package punkt to /home/justin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/justin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/justin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def preprocessing(text):
    """Normalise a sentence into a space-separated string of lemmatized tokens.

    Steps, applied to each token produced by ``word_tokenize``:
      1. discard the token unless ``str.isalnum()`` is true for it
         (this drops punctuation-only tokens);
      2. lowercase it;
      3. discard it if it is in NLTK's English stopword list;
      4. lemmatize it with ``WordNetLemmatizer``.

    Args:
        text: the raw sentence to clean.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filters.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))

    cleaned_tokens = []
    for raw_token in word_tokenize(text):
        # Keep purely alphanumeric tokens only
        if not raw_token.isalnum():
            continue

        lowered = raw_token.lower()

        # Stopwords are matched against the lowercased form
        if lowered in english_stopwords:
            continue

        cleaned_tokens.append(wordnet_lemmatizer.lemmatize(lowered))

    return ' '.join(cleaned_tokens)

In [8]:
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd

# (model, tokenizer) pairs that have already been loaded, keyed by model name.
# Lets repeated calls in the same kernel session reuse them instead of
# reloading the weights and vocabulary on every call.
_BERT_CACHE = {}


def _load_bert(model_name):
    """Return the ``(model, tokenizer)`` pair for ``model_name``, loading it at most once."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertModel.from_pretrained(model_name),
            BertTokenizer.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def wordEmbedBert(text):
    """Embed one sentence with BERT and return it as a single-row DataFrame.

    The sentence is tokenized with the ``bert-base-uncased`` tokenizer
    (special tokens included), passed through the model without gradient
    tracking, and the last hidden state is averaged over the token dimension
    to give one vector for the whole sentence.

    Args:
        text: the sentence to embed.

    Returns:
        pandas.DataFrame with exactly one row and one column per embedding
        dimension, named ``feature_0``, ``feature_1``, ...
    """
    model, tokenizer = _load_bert('bert-base-uncased')

    # truncation=True caps the sequence at the tokenizer's maximum length, so
    # very long input no longer fails inside the model's forward pass.
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=True)
    token_ids = torch.tensor(tokens).unsqueeze(0)  # add a batch dimension of size 1

    # Mask out positions whose id is 0 (presumably the padding id - TODO
    # confirm); no padding is requested above, so for a single sentence this
    # is expected to be all True.
    attention_mask = token_ids != 0

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(input_ids=token_ids, attention_mask=attention_mask)

    # Mean over dim 1 (the token dimension) -> shape (1, hidden_size)
    sentence_embedding = outputs.last_hidden_state.mean(dim=1)
    embedding_array = sentence_embedding.numpy()

    feature_names = [f"feature_{i}" for i in range(embedding_array.shape[1])]
    return pd.DataFrame(embedding_array, columns=feature_names)

In [9]:
import joblib
# Models already loaded from disk, keyed by file path, so that each file is
# deserialized only once per kernel session.
_RF_MODEL_CACHE = {}


def preditBasedOntext(embeding, model_path="./data/randomforest_bert.joblib"):
    """Predict the sentiment for precomputed sentence-embedding features.

    Args:
        embeding: feature table accepted by the saved model's ``predict``
            method, e.g. the DataFrame returned by ``wordEmbedBert``.
        model_path: path of the joblib-serialized model (presumably a random
            forest classifier, judging by the default file name). Defaults to
            the path that was previously hard-coded.

    Returns:
        Whatever the model's ``predict`` returns for ``embeding``; callers
        index it to get the per-row predictions.
    """
    # SECURITY NOTE: joblib.load unpickles arbitrary Python objects. Only load
    # model files that come from a trusted source.
    if model_path not in _RF_MODEL_CACHE:
        _RF_MODEL_CACHE[model_path] = joblib.load(model_path)

    return _RF_MODEL_CACHE[model_path].predict(embeding)
