<a href="https://colab.research.google.com/github/baacumen/baacumen-learning/blob/main/Comprehensive_Guide_to_Sentiment_Analysis_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setting Up Your Python Environment**

In [1]:
# pip install pandas nltk scikit-learn



# **Import Libraries**

In [11]:
# Standard library
import re

# Third-party
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK data used by this notebook: the movie_reviews corpus, the
# sentence-tokenizer models behind word_tokenize, and the stop-word lists.
# Newer NLTK releases load 'punkt_tab' in word_tokenize while older ones load
# 'punkt', so both are requested to keep a fresh-kernel "Run All" working.
for resource in ('movie_reviews', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **Text Preprocessing**

In [8]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of content words.

    The text is lower-cased, every character other than ASCII letters and
    whitespace is removed, the remainder is tokenized with NLTK's
    ``word_tokenize``, and English stop words are dropped.

    Args:
        text (str): Raw input text.

    Returns:
        str: The remaining tokens joined by single spaces (an empty string
        when no token survives).
    """
    # Build the stop-word lookup once per call. The previous version evaluated
    # stopwords.words('english') inside the comprehension, i.e. once per token,
    # and then searched the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))

    text = text.lower()
    # Characters are deleted rather than replaced by a space, so "don't"
    # becomes "dont" (unchanged from the original behavior).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in english_stopwords]
    return ' '.join(tokens)

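# **Choosing a Sentiment Analysis Library**

This guide uses NLTK's `movie_reviews` corpus together with scikit-learn: a bag-of-words `CountVectorizer` produces the features and a `MultinomialNB` (Naive Bayes) classifier is trained and evaluated on them.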

In [9]:
# Build the labelled corpus: one (token list, category) pair per review file
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable
train_documents, test_documents = train_test_split(
    documents,
    test_size=0.2,
    random_state=42,
)

# Bag-of-Words features: the vocabulary is learned from the training reviews only.
# NOTE(review): the text fed to the vectorizer here is the corpus tokens joined by
# spaces; it is not passed through preprocess_text, unlike the unseen review in the
# prediction cell further down.
vectorizer = CountVectorizer()

train_texts = [' '.join(words) for words, _ in train_documents]
X_train = vectorizer.fit_transform(train_texts)
y_train = [label for _, label in train_documents]

test_texts = [' '.join(words) for words, _ in test_documents]
X_test = vectorizer.transform(test_texts)
y_test = [label for _, label in test_documents]

# Fit a multinomial Naive Bayes model on the training counts
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Score the held-out reviews
y_pred = classifier.predict(X_test)

# Summarize performance on the test split
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.815
Classification Report:
              precision    recall  f1-score   support

         neg       0.79      0.85      0.82       199
         pos       0.84      0.78      0.81       201

    accuracy                           0.81       400
   macro avg       0.82      0.82      0.81       400
weighted avg       0.82      0.81      0.81       400



# **Predict Sentiment for Unseen Review**

In [22]:
# A review that was not part of the training or test data
unseen_review = "This movie was absolutely fantastic. The plot was engaging, and the actors delivered outstanding performances."

# Clean the text, then encode it with the vectorizer fitted during training
processed_review = preprocess_text(unseen_review)
X_unseen = vectorizer.transform([processed_review])

# predict() returns one label per input row; only a single row was passed in
predictions = classifier.predict(X_unseen)
predicted_sentiment = predictions[0]

# Translate the class label into a readable name ('pos' -> Positive, anything else -> Negative)
if predicted_sentiment == 'pos':
    sentiment_label = "Positive"
else:
    sentiment_label = "Negative"

# Show the input, its cleaned form, and the model's verdict
print(f"Unseen Review: {unseen_review}")
print(f"Processed Review: {processed_review}")
print(f"Predicted Sentiment: {sentiment_label}")

Unseen Review: This movie was absolutely fantastic. The plot was engaging, and the actors delivered outstanding performances.
Processed Review: movie absolutely fantastic plot engaging actors delivered outstanding performances
Predicted Sentiment: Positive
