Import Libraries

In [15]:
# Core Python libraries for data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn libraries for modeling and metrics
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# NLP tools
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (do this only once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Loading Dataset

In [10]:
# Adjust the path to where your IMDB data is extracted
reviews = load_files('data/aclImdb/train', categories=['pos', 'neg'])
texts, labels = reviews.data, reviews.target  # texts: list of byte strings, labels: 1=pos, 0=neg

# Decode byte strings
texts = [doc.decode('utf-8', 'ignore') for doc in texts]


Data Preprocessing

In [16]:
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = nltk.word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    text = ' '.join(tokens)
    return text

# Demonstration of preprocessing
print("Original:\n", texts[0][:350])
print("Preprocessed:\n", preprocess_text(texts[0])[:350])

# Preprocess all reviews
texts_cleaned = [preprocess_text(doc) for doc in texts]


Original:
 Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It 


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\amana/nltk_data'
    - 'c:\\Users\\amana\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'c:\\Users\\amana\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'c:\\Users\\amana\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\amana\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


Split the data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    texts_cleaned, labels, test_size=0.2, random_state=42, stratify=labels)


NameError: name 'texts_cleaned' is not defined

Build the Model pipeline

In [14]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', LogisticRegression(max_iter=200))
])


Train the Model

In [None]:
pipeline.fit(X_train, y_train)

Evaluate the Model

In [None]:
y_pred = pipeline.predict(X_test)

# Print standard classification metrics
print("\nClassification Report\n-----------------------")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

# Plot confusion matrix using ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()


Feature Importance

In [None]:
vectorizer = pipeline.named_steps['tfidf']
clf = pipeline.named_steps['clf']

feature_names = np.array(vectorizer.get_feature_names_out())
sorted_coef = clf.coef_[0].argsort()

print("\nTop 20 words indicative of negative reviews:")
print(feature_names[sorted_coef[:20]])

print("\nTop 20 words indicative of positive reviews:")
print(feature_names[sorted_coef[-20:][::-1]])


Predict the New data

In [None]:
def predict_sentiment(text):
    preprocessed = preprocess_text(text)
    pred = pipeline.predict([preprocessed])[0]
    return "Positive" if pred == 1 else "Negative"

sample_text = "I absolutely loved this movie! The acting was fantastic."
print("\nSample Review:", sample_text)
print("Predicted Sentiment:", predict_sentiment(sample_text))
