In [1]:
# Data Initialization: Initialize training data with movie reviews and corresponding sentiments.

# Existing data
X_train = [
    "This was really awesome an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! Awesome Acting by hero",
    "loved it!",
    "Bad not up to the mark",
    "Could have been better",
    "really Disappointed by the movie",
    "The movie was just okay, nothing special.",
    "It was an average film, neither good nor bad.",
    "The plot had potential, but it didn't deliver much excitement.",
    "Decent watch but nothing to write home about.",
    "The film didn't leave much of an impression on me.",
    "Not bad, but not particularly engaging either.",
    "An ordinary movie that doesn't stand out."
]
y_train = ["positive", "positive", "positive", "positive", "negative", "negative", "negative", "average", "average", "average", "average", "average", "average", "average"]


In [2]:
# Data Display: Show the content of X_train containing movie reviews.

X_train # Reviews

['This was really awesome an awesome movie',
 'Great movie! I liked it a lot',
 'Happy Ending! Awesome Acting by hero',
 'loved it!',
 'Bad not up to the mark',
 'Could have been better',
 'really Disappointed by the movie',
 'The movie was just okay, nothing special.',
 'It was an average film, neither good nor bad.',
 "The plot had potential, but it didn't deliver much excitement.",
 'Decent watch but nothing to write home about.',
 "The film didn't leave much of an impression on me.",
 'Not bad, but not particularly engaging either.',
 "An ordinary movie that doesn't stand out."]

# Cleaning of the data

In [3]:
# Library Imports and Setup: Import necessary libraries and download NLTK stopwords.

import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [4]:
# Data Preprocessing Function: Define function for text cleaning and preprocessing.


# Data Preprocessing and Tokenization

# Function for cleaning and preprocessing text data
def get_cleaned_text(text):
    # Lowercasing
    text = text.lower()

    # Removing punctuation and non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Removing a simple set of stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]

    # Joining the tokens back into a single string
    return ' '.join(filtered_tokens)

# Input from the user

In [5]:
# Test Data Initialization: Initialize test data with new movie reviews.

X_test = [
    "it was average but I loved the plot",
    "the movie was groundbreaking and had amazing visuals",
    "I found the movie boring and too long",
    "the characters were well-developed and the story was captivating",
    "not my type of movie, didn't enjoy it at all"
]


In [6]:
# Data Cleaning: Apply cleaning and preprocessing to training and test data.

X_clean = [get_cleaned_text(i) for i in X_train]
xt_clean = [get_cleaned_text(i) for i in X_test]

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\azamm/nltk_data'
    - 'c:\\Users\\azamm\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\azamm\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\azamm\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\azamm\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


# Vectorize

In [None]:
# Import Vectorizer: Import CountVectorizer for text vectorization.

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Vectorization of Training Data: Convert cleaned training data to numerical format using CountVectorizer.

cv = CountVectorizer()
X_vec = cv.fit_transform(X_clean).toarray()

In [None]:
# Display Vectorized Data: Show the numerical matrix of the vectorized training data.

X_vec

array([[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 

In [None]:
# Display Feature Names: Print unique words identified in the training dataset.

print(cv.get_feature_names_out())

['acting' 'average' 'awesome' 'bad' 'better' 'could' 'decent' 'deliver'
 'didnt' 'disappointed' 'doesnt' 'either' 'ending' 'engaging' 'excitement'
 'film' 'good' 'great' 'happy' 'hero' 'home' 'impression' 'leave' 'liked'
 'lot' 'loved' 'mark' 'movie' 'much' 'neither' 'nothing' 'okay' 'ordinary'
 'particularly' 'plot' 'potential' 'really' 'special' 'stand' 'watch'
 'write']


In [None]:
# Vectorization of Test Data: Convert cleaned test data to numerical format using the same vectorizer.

Xt_vec = cv.transform(xt_clean).toarray()

In [None]:
# Display Vectorized Test Data: Show the numerical matrix of the vectorized test data.

Xt_vec

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# Multinomial Naive Bayes

In [None]:
# Import Naive Bayes Model: Import MultinomialNB classifier.

from sklearn.naive_bayes import MultinomialNB

In [None]:
# Model Initialization: Initialize the Multinomial Naive Bayes model.

mn = MultinomialNB()

In [None]:
# Model Training: Train the Naive Bayes model with training data.

mn.fit(X_vec, y_train)

In [None]:
# Model Prediction: Predict sentiments for test data using the trained model.

predictions = mn.predict(Xt_vec)

In [None]:
# Sentiment Analysis Result Aggregation: Count occurrences of each sentiment in predictions.

from collections import Counter
sentiment_counts = Counter(predictions)
most_common_sentiment = sentiment_counts.most_common(1)[0][0]

In [None]:
# Display Most Common Sentiment: Show the most common sentiment among predictions.

most_common_sentiment

'average'