<a href="https://colab.research.google.com/github/bidishaaiml/COE_week12/blob/main/movie_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Python program to analyze the sentiment of movie reviews.**

In [21]:
!pip install textblob



In [22]:
from textblob import TextBlob

# Sample movie reviews: five short, hand-written review strings used as the
# demo input for the TextBlob-based analyze_sentiment() defined below.
movie_reviews = [
    "The movie was fantastic! The plot was gripping and the characters were well-developed.",
    "I didn't like the movie at all. It was too slow and the acting was poor.",
    "It was an average movie, nothing special but not bad either.",
    "Absolutely terrible! I wasted two hours of my life.",
    "A masterpiece. The director did an incredible job.",
]

# Score each review with TextBlob and attach a sentiment label
def analyze_sentiment(reviews):
    """Score every review with TextBlob and label it by the sign of its polarity.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list with one dict per review, in input order. Each dict holds the
        original text under 'review', TextBlob's 'polarity' and
        'subjectivity' scores, and a 'sentiment' label: 'Positive' when the
        polarity is above zero, 'Negative' when it is below zero, and
        'Neutral' otherwise.
    """
    analyzed = []
    for text in reviews:
        scores = TextBlob(text).sentiment
        polarity = scores.polarity

        # The label depends only on the sign of the polarity score.
        if polarity > 0:
            label = 'Positive'
        elif polarity < 0:
            label = 'Negative'
        else:
            label = 'Neutral'

        analyzed.append({
            'review': text,
            'polarity': polarity,
            'subjectivity': scores.subjectivity,
            'sentiment': label,
        })
    return analyzed

# Run the TextBlob analysis over the sample reviews
sentiment_results = analyze_sentiment(movie_reviews)

# Print a numbered report per review: text, label, then the raw scores,
# followed by a blank separator line
for number, entry in enumerate(sentiment_results, start=1):
    print(f"Review {number}:")
    print(f"Text: {entry['review']}")
    print(f"Sentiment: {entry['sentiment']}")
    print(f"Polarity: {entry['polarity']}, Subjectivity: {entry['subjectivity']}")
    print()



Review 1:
Text: The movie was fantastic! The plot was gripping and the characters were well-developed.
Sentiment: Positive
Polarity: 0.5, Subjectivity: 0.95

Review 2:
Text: I didn't like the movie at all. It was too slow and the acting was poor.
Sentiment: Negative
Polarity: -0.23333333333333336, Subjectivity: 0.3333333333333333

Review 3:
Text: It was an average movie, nothing special but not bad either.
Sentiment: Positive
Polarity: 0.1857142857142857, Subjectivity: 0.546031746031746

Review 4:
Text: Absolutely terrible! I wasted two hours of my life.
Sentiment: Negative
Polarity: -0.6, Subjectivity: 0.5

Review 5:
Text: A masterpiece. The director did an incredible job.
Sentiment: Positive
Polarity: 0.9, Subjectivity: 0.9



In [23]:
!pip install nltk
!pip install pandas
!pip install scikit-learn




In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

# Downloading required NLTK data.
# 'punkt_tab' is requested in addition to 'punkt': recent NLTK releases
# (3.9 and later) load the tokenizer data used by word_tokenize from
# 'punkt_tab', and word_tokenize raises LookupError when it is missing.
for resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initializing VADER sentiment analyzer, stop words, and lemmatizer.
# stop_words is a set so the per-token membership test in preprocess_text
# is a constant-time lookup.
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a raw review into a string of lowercase, lemmatized tokens.

    Steps, in order: lowercase the text, strip HTML tags, strip URLs, drop
    every character that is not an ASCII letter or whitespace, tokenize,
    remove English stop words, lemmatize each remaining token, and join the
    tokens back together with single spaces.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text as one space-separated string (empty when no token
        survives the filtering).
    """
    # Converting text to lowercase
    text = text.lower()

    # Removing HTML tags. Each tag is replaced with a space rather than with
    # nothing, so words separated only by a tag (e.g. "great<br/>acting")
    # are not fused into a single token.
    text = re.sub(r'<.*?>', ' ', text)

    # Removing URLs ("https..." is already matched by the "http\S+" branch)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Removing punctuation, digits and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Removing stop words and lemmatize tokens
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Joining tokens back into a single string
    return ' '.join(cleaned_tokens)

def analyze_sentiment(review):
    """Label a review 'Positive', 'Negative' or 'Neutral' using VADER.

    The review is cleaned with preprocess_text and the cleaned text is
    scored by the module-level SentimentIntensityAnalyzer (sia). The
    compound score decides the label: >= 0.05 is positive, <= -0.05 is
    negative, and anything in between is neutral.

    NOTE(review): the cleaning step lowercases the text and strips
    punctuation and stop words before VADER sees it; presumably the
    stop-word list includes negations and "but" -- confirm this does not
    hurt VADER's accuracy.

    Args:
        review: Raw review text.

    Returns:
        One of the strings "Positive", "Negative" or "Neutral".
    """
    compound = sia.polarity_scores(preprocess_text(review))['compound']

    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# Sample movie reviews
reviews = [
    "I absolutely loved this movie! The acting was brilliant and the storyline was captivating.",
    "The movie was okay, but it lacked the depth I was hoping for.",
    "I hated this movie. It was a complete waste of time.",
    "It was a decent movie, but nothing spectacular.",
    "One of the best movies I've seen in a long time!"
]

# Build a one-column DataFrame of reviews, then add the label computed by
# analyze_sentiment for each row
df = pd.DataFrame({'Review': reviews})
df['Sentiment'] = df['Review'].map(analyze_sentiment)

# Displaying the results
print(df)


                                              Review Sentiment
0  I absolutely loved this movie! The acting was ...  Positive
1  The movie was okay, but it lacked the depth I ...  Positive
2  I hated this movie. It was a complete waste of...  Negative
3    It was a decent movie, but nothing spectacular.   Neutral
4   One of the best movies I've seen in a long time!  Positive


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Detailed Preprocessing Steps:**

* Lowercasing: Converts all characters in the text to lowercase to ensure uniformity.


* Removing HTML Tags: Strips out any HTML tags that may be present in the review text using regular expressions.


* Removing URLs: Cleans out any URLs that may be present using a regular expression pattern.

* Removing Punctuation and Special Characters: This step removes all non-alphabetic characters, including punctuation, numbers, and special symbols.

* Tokenization: Splits the text into individual words (tokens), which allows for more granular processing.

* Stop Word Removal: Removes common English words that do not contribute to the sentiment of the text (like "the," "is," "in," etc.), which helps focus on the more meaningful words.

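* Lemmatization: Reduces words to their dictionary base form (e.g., "movies" to "movie"), which helps in grouping similar words and reducing noise. Note that `WordNetLemmatizer.lemmatize` treats every word as a noun unless a part-of-speech tag is passed, so with the call used here verb forms such as "running" are left unchanged.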

* Joining Tokens: The cleaned tokens are joined back into a single string after preprocessing to make them ready for sentiment analysis.

* Sentiment Analysis:
After preprocessing, the cleaned text is passed to the VADER sentiment analyzer.
The analyzer computes sentiment scores and determines whether the review is positive, negative, or neutral based on the compound score.




Output:
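The program displays the original reviews along with their calculated sentiments in a DataFrame. This approach ensures that the sentiment analysis is based on cleaned and normalized data, which is intended to lead to more accurate results. Keep in mind, however, that VADER is a rule-based analyzer that also draws on punctuation, capitalization, negations, and contrastive words such as "but", so aggressive cleaning can remove cues it relies on: in the output above, the review "The movie was okay, but it lacked the depth I was hoping for." is labelled Positive.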


This detailed preprocessing pipeline is useful for handling real-world data, especially when the input text is noisy or unstructured. It can be adapted and extended for more complex scenarios, such as analyzing reviews in different languages or dealing with domain-specific terminology.

# **Using Hugging Face Transformers for Sentiment Analysis**

In [25]:
!pip install transformers



In [26]:
from transformers import pipeline

# Initializing the sentiment analysis pipeline.
# The checkpoint and revision are pinned explicitly to the ones the pipeline
# otherwise falls back to by default, so results stay reproducible if the
# library's default changes and the "No model was supplied" warning is not
# emitted.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

def analyze_sentiment(review):
    """Classify one review with the Hugging Face sentiment pipeline.

    Args:
        review: Review text to classify.

    Returns:
        A (label, score) tuple read from the first prediction the
        module-level sentiment_analyzer returns for the review.
    """
    # The pipeline returns a list of predictions; only the first is used.
    top_prediction = sentiment_analyzer(review)[0]
    return top_prediction['label'], top_prediction['score']

# Sampling movie reviews
reviews = [
    "I absolutely loved this movie! The acting was brilliant and the storyline was captivating.",
    "The movie was okay, but it lacked the depth I was hoping for.",
    "I hated this movie. It was a complete waste of time.",
    "It was a decent movie, but nothing spectacular.",
    "One of the best movies I've seen in a long time!"
]

# Classify every review and print its label with the score rounded to two
# decimals, followed by a separator line
for text in reviews:
    label, confidence = analyze_sentiment(text)
    print(f"Review: {text}")
    print(f"Sentiment: {label} (Score: {confidence:.2f})")
    print("----")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Review: I absolutely loved this movie! The acting was brilliant and the storyline was captivating.
Sentiment: POSITIVE (Score: 1.00)
----
Review: The movie was okay, but it lacked the depth I was hoping for.
Sentiment: NEGATIVE (Score: 1.00)
----
Review: I hated this movie. It was a complete waste of time.
Sentiment: NEGATIVE (Score: 1.00)
----
Review: It was a decent movie, but nothing spectacular.
Sentiment: NEGATIVE (Score: 0.99)
----
Review: One of the best movies I've seen in a long time!
Sentiment: POSITIVE (Score: 1.00)
----


Explanation:

Hugging Face Pipeline:

The pipeline function provides an easy-to-use interface for various NLP tasks, including sentiment analysis.

By default, it uses a pre-trained model such as distilbert-base-uncased-finetuned-sst-2-english, which is a distilled version of BERT fine-tuned on the SST-2 dataset for sentiment classification.

Sentiment Prediction:

The AI model outputs a label (either "POSITIVE" or "NEGATIVE") and a score (confidence level for the prediction).
The score indicates how confident the model is about its prediction.

Sample Output:

Each review is analyzed, and the sentiment along with the confidence score is printed.

In [None]:
pip install azure-ai-textanalytics==5.2.0

In [None]:
# This requires environment variables named "LANGUAGE_KEY" and "LANGUAGE_ENDPOINT"
import os  # os.environ is read below, so os has to be imported in this cell

# os.environ.get returns None when a variable is not set.
language_key = os.environ.get('LANGUAGE_KEY')
language_endpoint = os.environ.get('LANGUAGE_ENDPOINT')

from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

# Build the Text Analytics client from the key and endpoint
def authenticate_client():
    """Create a TextAnalyticsClient from the module-level key and endpoint.

    Returns:
        A TextAnalyticsClient constructed with language_endpoint as its
        endpoint and an AzureKeyCredential wrapping language_key as its
        credential.
    """
    return TextAnalyticsClient(
        endpoint=language_endpoint,
        credential=AzureKeyCredential(language_key),
    )

# Module-level client instance, created once when this cell runs
client = authenticate_client()

# Example method for detecting sentiment and opinions in text
def sentiment_analysis_with_opinion_mining_example(client):
    """Analyze one sample review with opinion mining and print the results.

    Sends a single hard-coded document to client.analyze_sentiment with
    show_opinion_mining=True. Results flagged is_error are skipped silently.
    For every remaining document it prints the document-level sentiment and
    confidence scores, then for each sentence its text, sentiment and
    scores, and for each mined opinion the target and its assessments with
    their positive/negative scores.

    Args:
        client: Object exposing
            analyze_sentiment(documents, show_opinion_mining=...), e.g. the
            TextAnalyticsClient returned by authenticate_client().

    Returns:
        None. All results are written to stdout.
    """
    documents = [
        "The food and service were unacceptable. The concierge was nice, however."
    ]

    result = client.analyze_sentiment(documents, show_opinion_mining=True)
    # Only successfully analyzed documents are reported.
    doc_result = [doc for doc in result if not doc.is_error]

    for document in doc_result:
        print(f"Document Sentiment: {document.sentiment}")
        doc_scores = document.confidence_scores
        print(
            f"Overall scores: positive={doc_scores.positive:.2f}; "
            f"neutral={doc_scores.neutral:.2f}; "
            f"negative={doc_scores.negative:.2f} \n"
        )
        for sentence in document.sentences:
            print(f"Sentence: {sentence.text}")
            print(f"Sentence sentiment: {sentence.sentiment}")
            sentence_scores = sentence.confidence_scores
            print(
                "Sentence score:\n"
                f"Positive={sentence_scores.positive:.2f}\n"
                f"Neutral={sentence_scores.neutral:.2f}\n"
                f"Negative={sentence_scores.negative:.2f}\n"
            )
            for mined_opinion in sentence.mined_opinions:
                # Each mined opinion pairs one target with its assessments.
                target = mined_opinion.target
                print(f"......'{target.sentiment}' target '{target.text}'")
                print(
                    "......Target score:\n"
                    f"......Positive={target.confidence_scores.positive:.2f}\n"
                    f"......Negative={target.confidence_scores.negative:.2f}\n"
                )
                for assessment in mined_opinion.assessments:
                    print(f"......'{assessment.sentiment}' assessment '{assessment.text}'")
                    print(
                        "......Assessment score:\n"
                        f"......Positive={assessment.confidence_scores.positive:.2f}\n"
                        f"......Negative={assessment.confidence_scores.negative:.2f}\n"
                    )
            print("\n")
        print("\n")

# Run the opinion-mining example with the module-level client; the results
# are printed to stdout.
sentiment_analysis_with_opinion_mining_example(client)