# Training a machine learning model to classify movie reviews as positive or negative



### Step 1: Install required libraries



In [1]:
pip install numpy pandas scikit-learn nltk




### Step 2: Import necessary libraries

In [2]:
import numpy as np
import pandas as pd
import nltk # NLP library for Text Dataset and Sentiment Analysis
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import movie_reviews
import random


### Step 3: Load the NLTK movie review dataset, which contains labeled positive and negative reviews

In [9]:
# Download the corpus; NLTK reports the package as up-to-date and skips the
# download when it is already present locally.
nltk.download('movie_reviews')

# Load the movie reviews dataset.
# For each category ('pos' / 'neg') we walk the file IDs that belong to it; each
# file is one review. Every review is stored as a tuple whose first element is
# the list of words in the review and whose second element is its category.
docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

# The corpus is read category by category, so the rows must be shuffled before
# splitting. A dedicated, seeded generator makes the order (and therefore every
# downstream result) reproducible on Restart & Run All without touching the
# global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(docs)

# Convert to a DataFrame with one row per review
df = pd.DataFrame(docs, columns=['review', 'sentiment'])

# Join each review's word list into a single space-separated string
df['review'] = df['review'].map(' '.join)

# Convert labels to binary: 1 for positive, 0 for negative
df['sentiment'] = df['sentiment'].map({'pos': 1, 'neg': 0})

# Fail fast if a category other than 'pos'/'neg' ever shows up: .map() would
# silently turn it into NaN and corrupt the labels.
assert df['sentiment'].notna().all(), "unexpected sentiment category in movie_reviews"

# Print dataset sample
print(df.head(10))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


                                              review  sentiment
0  a backdrop of new year ' s eve in 1981 would s...          0
1  robin williams has the rarest of gifts : the a...          0
2  david schwimmer ( from the television series "...          0
3  the tagline for this film is : " some houses a...          0
4  kids today , they don ' t just want to see hea...          0
5  at one point in this movie there is a staging ...          0
6  sometimes a movie comes along that falls somew...          1
7  do you want to know the truth about cats and d...          1
8  > from the man who presented us with henry : t...          1
9  anastasia contains something that has been lac...          1


### Step 4: Split data into training and test sets

In [4]:
# Features (X) are the review texts; labels (y) are the 0/1 sentiment codes
X, y = df['review'], df['sentiment']

# Hold out 20% of the reviews for testing and train on the remaining 80%.
# random_state pins the split for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many reviews landed in each partition
print(f"Training Samples: {len(X_train)}")
print(f"Testing Samples: {len(X_test)}")


Training Samples: 1600
Testing Samples: 400


### Step 5: Create a Text Processing and Classification Pipeline
Building a pipeline chains multiple steps together and makes model deployment easier: the TF-IDF transformation is applied automatically, both when the model is trained on the training data and later when it predicts on new text. The TF-IDF vectorizer converts text into numerical form and removes English stop words such as "the" and "and" to improve accuracy. I've used a Naive Bayes classifier because it is fast, works well with text, and is a good fit for small datasets.

In [5]:
# Two-stage pipeline:
#   1. TfidfVectorizer turns raw review text into numeric TF-IDF features,
#      dropping English stop words.
#   2. MultinomialNB classifies those features (this is a classification task).
model = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    MultinomialNB(),
)

# Fit both stages on the training reviews and their labels
model.fit(X_train, y_train)


### Step 6: Model Evaluation
Using the accuracy score metric, the model reaches about 84% accuracy on the test set: its predicted labels match the true labels for roughly 84% of the held-out reviews.

In [6]:
# Score the held-out reviews with the fitted pipeline
y_pred = model.predict(X_test)

# Fraction of test reviews whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")  # shown with 2 decimal places

# Per-class precision / recall / F1; label 0 is reported as 'Negative' and
# label 1 as 'Positive'
report = classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Positive'],
)
print(report)


Model Accuracy: 0.84
              precision    recall  f1-score   support

    Negative       0.80      0.90      0.84       192
    Positive       0.89      0.79      0.84       208

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.85      0.84      0.84       400



### Step 7: Testing some sample reviews

In [7]:
# Hand-written reviews used as a quick sanity check of the trained model
sample_reviews = [
    "This movie was fantastic! The characters were well-developed, and the plot was engaging.",
    "I hated this movie. It was so boring and a complete waste of time.",
    "An average movie, not too bad but not great either.",
]

# The pipeline vectorizes the raw strings itself, so they can be passed directly
predictions = model.predict(sample_reviews)

# Show each review next to its predicted label (1 -> Positive, anything else -> Negative)
for review, sentiment in zip(sample_reviews, predictions):
    label = 'Positive' if sentiment == 1 else 'Negative'
    print(f"Review: {review}\nSentiment: {label}\n")


Review: This movie was fantastic! The characters were well-developed, and the plot was engaging.
Sentiment: Positive

Review: I hated this movie. It was so boring and a complete waste of time.
Sentiment: Negative

Review: An average movie, not too bad but not great either.
Sentiment: Negative

