In [69]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import re

# Load the IMDB movie review dataset
reviews_train = load_files(r"C:\Users\Akachukwu Egboluche\Downloads\aclImdb\train")

# Preprocess function to clean and normalize the text data
def preprocess(text):
    # Convert bytes to string if necessary
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Return preprocessed text
    return text

# Preprocess the training data
reviews_train_preprocessed = [preprocess(text) for text in reviews_train.data]

# Create a TF-IDF representation of the preprocessed training data
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(reviews_train_preprocessed)

# Get the binary labels for the training data (1 for positive, 0 for negative)
train_labels = np.array(reviews_train.target)

# Train the model on the preprocessed training data
clf = LinearSVC()
clf.fit(X_train, train_labels)

# Load the test data
reviews_test = load_files(r"C:\Users\Akachukwu Egboluche\Downloads\aclImdb\test")

# Preprocess the test data
reviews_test_preprocessed = [preprocess(text) for text in reviews_test.data]

# Create a TF-IDF representation of the preprocessed test data
X_test = vectorizer.transform(reviews_test_preprocessed)

# Get the binary labels for the test data (1 for positive, 0 for negative)
test_labels = np.array(reviews_test.target)

# Predict the labels for the test data using the trained model
pred_labels = clf.predict(X_test)

# Compute the accuracy of the model on the test data
acc = accuracy_score(test_labels, pred_labels)

print("Accuracy:", acc * 100,"%")


Accuracy: 25.0 %


In [68]:
# Load new data
new_reviews = [    "This movie was amazing, I can't wait to watch it again!",    "What a waste of time, I regret watching this movie.",    "The storyline was intriguing and kept me engaged throughout the movie.",    "This movie was a total letdown, the trailer was much better.",    "The actors did a fantastic job, I was thoroughly impressed.",    "I had high hopes for this movie, but it fell short of my expectations.",    "I couldn't take my eyes off the screen, this movie was captivating.",    "I found the plot confusing and hard to follow.",    "This movie was a rollercoaster of emotions, I laughed and cried throughout.",    "The special effects were impressive, but the plot lacked substance."]

# Preprocess the new data
new_reviews_preprocessed = [preprocess(text) for text in new_reviews]

# Transform the preprocessed new data using the vectorizer
X_new = vectorizer.transform(new_reviews_preprocessed)

# Predict the labels for the new data using the trained classifier
pred_labels_new = clf.predict(X_new)

# Print the predicted labels and corresponding reviews
for i in range(len(new_reviews)):
    review = new_reviews[i]
    label = pred_labels_new[i]
    if label == 1:
        print(review, "--> Positive")
    else:
        print(review, "--> Negative")

# Print the predicted labels 
print(pred_labels_new)


This movie was amazing, I can't wait to watch it again! --> Negative
What a waste of time, I regret watching this movie. --> Negative
The storyline was intriguing and kept me engaged throughout the movie. --> Negative
This movie was a total letdown, the trailer was much better. --> Negative
The actors did a fantastic job, I was thoroughly impressed. --> Positive
I had high hopes for this movie, but it fell short of my expectations. --> Positive
I couldn't take my eyes off the screen, this movie was captivating. --> Negative
I found the plot confusing and hard to follow. --> Negative
This movie was a rollercoaster of emotions, I laughed and cried throughout. --> Negative
The special effects were impressive, but the plot lacked substance. --> Negative
[0 0 0 0 1 1 0 0 0 0]
