## HARD_LAB CODE

In [17]:
import re
import numpy as np
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to preprocess file
def text_preprocessing(text):
    # Convert tweets to lower case
    text = text.lower()

    # Remove URLs and links
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove punctuation marks
    text = re.sub(r'[^\w\s]', '', text)

    # Remove HTML and XML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Tokenize the tweets into words
    words = word_tokenize(text)

    # Initialize WordNet Lemmatizer
    lem = WordNetLemmatizer()

    # Remove stop words and lemmatize each word
    stops = set(stopwords.words('english'))
    words_filter = [lem.lemmatize(w) for w in words if w not in stops]

    # Return the processed words as a single string (needed for CountVectorizer)
    return ' '.join(words_filter)

# Function to load tweets from a file and preprocess them
def load_tweets(file_name):
    with open(file_name, 'r', encoding='utf-8') as file:
        tweets = file.readlines()
    # Preprocess each tweet and return them as a list of strings
    return [text_preprocessing(tweet) for tweet in tweets]

# Function to create word frequency dictionary
def word_frequency_dict(tweets):
    word_frequency = defaultdict(int)
    for tweet in tweets:
        words = tweet.split()  # Now `tweet` is a string, so splitting it works
        for word in words:
            word_frequency[word] += 1
    return dict(word_frequency)

# Load training data (positive and negative tweets)
print("Loading training data...")
train_Pos_tweets = load_tweets("train_Pos.en")
train_Neg_tweets = load_tweets("train_Neg.en")

# Create word frequency dictionaries for both positive and negative tweets
print("Creating word frequency dictionaries...")
pos_word_freq = word_frequency_dict(train_Pos_tweets)
neg_word_freq = word_frequency_dict(train_Neg_tweets)

# Combine all training tweets and create labels (1 for positive, 0 for negative)
all_train_tweets = train_Pos_tweets + train_Neg_tweets
train_labels = np.array([1] * len(train_Pos_tweets) + [0] * len(train_Neg_tweets))

# Convert tweets to numerical features using CountVectorizer with ngram
print("Converting tweets to numerical features...")
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(all_train_tweets)

# Get vocabulary size
vocabulary = vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(vocabulary)}")

# Train the Naive Bayes classifier
print("Training the classifier...")
classifier = MultinomialNB()
classifier.fit(X_train, train_labels)

# Load test data (positive and negative tweets)
print("Loading testing data...")
test_Pos_tweets = load_tweets("ttestPos.en")
test_Neg_tweets = load_tweets("ttestNeg.en")

# Combine all test tweets and create labels
all_test_tweets = test_Pos_tweets + test_Neg_tweets
test_labels = np.array([1] * len(test_Pos_tweets) + [0] * len(test_Neg_tweets))

# Transform test data using the trained vectorizer
X_test = vectorizer.transform(all_test_tweets)

# Make predictions using the classifier
predictions = classifier.predict(X_test)

# Calculate accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print(f"\nAccuracy: {accuracy:.2%}")

# Function to predict the sentiment of a new tweet
def predict_new_tweet_sentiment(new_tweet):
    # Preprocess the new tweet
    processed_tweet = text_preprocessing(new_tweet)
    # Vectorize the preprocessed tweet
    tweet_vector = vectorizer.transform([processed_tweet])
    # Predict the sentiment
    prediction = classifier.predict(tweet_vector)
    return "Positive" if prediction[0] == 1 else "Negative"

# Interactive loop for predicting the sentiment of new tweets
print("\nEnter new tweet for sentiment analysis:")
print("Type 'quit' to exit")

while True:
    tweet_input = input("\nEnter a tweet to analyze: ")
    if tweet_input.lower() == 'quit':
        print("Thank you for using the tweet sentiment analyzer.")
        break
    sentiment = predict_new_tweet_sentiment(tweet_input)
    print(f"Sentiment is {sentiment}")

[nltk_data] Downloading package punkt to /Users/aakanksha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aakanksha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aakanksha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading training data...
Creating word frequency dictionaries...
Converting tweets to numerical features...
Vocabulary size: 466257
Training the classifier...
Loading testing data...

Accuracy: 77.68%

Enter new tweet for sentiment analysis:
Type 'quit' to exit



Enter a tweet to analyze:  This is so innovative and creative idea.


Sentiment is Positive



Enter a tweet to analyze:  I hate this idea.


Sentiment is Negative



Enter a tweet to analyze:  quit


Thank you for using the tweet sentiment analyzer.
