In [55]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [56]:
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict
import re
import random
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akulk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
column_names = ['sentiment', 'id', 'date', 'query', 'user', 'tweet']

# list type to track total counts of negative (index 0) and positive (index 1) words


In [58]:
sentimentData = pd.read_csv('training.1600000.processed.noemoticon.csv', delimiter=',', encoding='latin-1', names=column_names,
    usecols=['sentiment', 'id', 'tweet'])
# turn sentiment into a value between -1 and 1.
sentimentData['sentiment'] = (sentimentData['sentiment'].astype(int) - 2) / 2


In [94]:
# clean through tweets and build a frequency list of the most common words in tweets, as well as their sentiment value aggregates
from collections import defaultdict
import re

negation_words = {"not", "no", "never", "n't", "cannot", "don't", "didn't", "doesn't", "won't", "can't", "shouldn't"}

def process_sentiment_data_prev(sentimentData):
    """
    Processes tweet sentiment data to build word sentiment statistics. (NO negation filtering, or stopword filtering, or least frequent removal)
    
    Parameters:
        sentimentData (pd.DataFrame): DataFrame with columns 'sentiment' and 'tweet'
    
    Returns:
        word_stats (defaultdict): Dictionary mapping word → [neg_count, pos_count, sentiment_sum]
        counts (list): [total_neg_tokens, total_pos_tokens, total_tokens]
    """
    word_stats = defaultdict(lambda: [0, 0, 0])  # [neg_count, pos_count, sentiment_sum]
    counts = [0, 0, 0]  # total negative, total positive, total tokens
    
    pattern = re.compile(r"\w+|[^\w\s]")  # match words or punctuation

    for row in sentimentData.itertuples(index=False):
        sentiment = int(row.sentiment)
        sentiment_index = (sentiment + 1) // 2
        tweet = row.tweet.lower()
        tokens = pattern.findall(tweet)

        # Local counters to reduce write contention
        local_neg = 0
        local_pos = 0
        local_total = 0

        for token in tokens:
            stats = word_stats[token]
            stats[sentiment_index] += 1
            stats[2] += sentiment

            if sentiment_index == 0:
                local_neg += 1
            else:
                local_pos += 1
            local_total += 1

        counts[0] += local_neg
        counts[1] += local_pos
        counts[2] += local_total

    return word_stats, counts

def process_sentiment_data_1(sentimentData):
    """
    Processes tweet sentiment data to build word sentiment statistics with negation handling.
    
    Parameters:
        sentimentData (pd.DataFrame): DataFrame with columns 'sentiment' and 'tweet'
    
    Returns:
        word_stats (defaultdict): Dictionary mapping word → [neg_count, pos_count, sentiment_sum]
        counts (list): [total_neg_tokens, total_pos_tokens, total_tokens]
    """
    word_stats = defaultdict(lambda: [0, 0, 0])  # [neg_count, pos_count, sentiment_sum]
    counts = [0, 0, 0]  # total negative, total positive, total tokens
    
    pattern = re.compile(r"\w+|[^\w\s]")  # match words or punctuation

    for row in sentimentData.itertuples(index=False):
        sentiment = int(row.sentiment)
        sentiment_index = (sentiment + 1) // 2
        tweet = row.tweet.lower()
        tokens = pattern.findall(tweet)

        # Negation handling
        negated = False
        negated_tokens = []

        for token in tokens:
            if token in negation_words:
                negated = True
                negated_tokens.append(token)
                continue

            if token in {".", "!", "?", ";"}:
                negated = False

            if negated and re.match(r"\w+", token):  # only apply to word tokens
                token = f"NOT_{token}"

            negated_tokens.append(token)

        # Local counters
        local_neg = 0
        local_pos = 0
        local_total = 0

        for token in negated_tokens:
            stats = word_stats[token]
            stats[sentiment_index] += 1
            stats[2] += sentiment

            if sentiment_index == 0:
                local_neg += 1
            else:
                local_pos += 1
            local_total += 1

        counts[0] += local_neg
        counts[1] += local_pos
        counts[2] += local_total
    
    return word_stats, counts

def process_sentiment_data(sentimentData):
    """
    Processes tweet sentiment data to build word sentiment statistics with negation handling.
    
    Parameters:
        sentimentData (pd.DataFrame): DataFrame with columns 'sentiment' and 'tweet'
    
    Returns:
        word_stats (defaultdict): Dictionary mapping word → [neg_count, pos_count, sentiment_sum]
        counts (list): [total_neg_tokens, total_pos_tokens, total_tokens]
    """
    word_stats = defaultdict(lambda: [0, 0, 0])  # [neg_count, pos_count, sentiment_sum]
    counts = [0, 0, 0]  # total negative, total positive, total tokens
    
    pattern = re.compile(r"\w+|[^\w\s]")  # match words or punctuation

    for row in sentimentData.itertuples(index=False):
        sentiment = int(row.sentiment)
        sentiment_index = (sentiment + 1) // 2
        tweet = row.tweet.lower()
        tokens = pattern.findall(tweet)

        # Negation handling
        negated = False
        negated_tokens = []

        for token in tokens:
            if token in negation_words:
                negated = True
                negated_tokens.append(token)
                continue

            if token in {".", "!", "?", ";"}:
                negated = False

            if negated and re.match(r"\w+", token):  # only apply to word tokens
                token = f"NOT_{token}"

            negated_tokens.append(token)

        # Local counters
        local_neg = 0
        local_pos = 0
        local_total = 0

        for token in negated_tokens:
            stats = word_stats[token]
            stats[sentiment_index] += 1
            stats[2] += sentiment

            if sentiment_index == 0:
                local_neg += 1
            else:
                local_pos += 1
            local_total += 1

        counts[0] += local_neg
        counts[1] += local_pos
        counts[2] += local_total
    word_stats = {word: stats for word, stats in word_stats.items() if stats[0] + stats[1] >= 5}
    return word_stats, counts


def process_sentiment_data_withoutstopwords(sentimentData):
    
    """
    Processes tweet sentiment data to build word sentiment statistics.
    
    Parameters:
        sentimentData (pd.DataFrame): DataFrame with columns 'sentiment' and 'tweet'
    
    Returns:
        word_stats (defaultdict): Dictionary mapping word → [neg_count, pos_count, sentiment_sum]
        counts (list): [total_neg_tokens, total_pos_tokens, total_tokens]
    """
    stop_words = set(stopwords.words('english'))
    word_stats = defaultdict(lambda: [0, 0, 0])  # [neg_count, pos_count, sentiment_sum]
    counts = [0, 0, 0]  # total negative, total positive, total tokens
    
    pattern = re.compile(r"\w+|[^\w\s]")  # match words or punctuation

    for row in sentimentData.itertuples(index=False):
        sentiment = int(row.sentiment)
        sentiment_index = (sentiment + 1) // 2
        tweet = row.tweet.lower()
        tokens = pattern.findall(tweet)

        # Local counters to reduce write contention
        local_neg = 0
        local_pos = 0
        local_total = 0

        for token in tokens:
            if token in stop_words:
                continue
            stats = word_stats[token]
            stats[sentiment_index] += 1
            stats[2] += sentiment

            if sentiment_index == 0:
                local_neg += 1
            else:
                local_pos += 1
            local_total += 1

        counts[0] += local_neg
        counts[1] += local_pos
        counts[2] += local_total

    return word_stats, counts

The Below code will then take these values stored in our word_stats dictionary, and transform it into a dataframe, allowing us easier lookup of data for calculating the sentiment value of a given tweet using machine prediction

In [89]:
# A.S.F or Aggregate Sentiment Frequencies values dataframe, including all words in our tweet library, their counts, and aggregate sentiment values.
word_stats_ws, counts_ws = process_sentiment_data_withoutstopwords(sentimentData)
word_stats, counts = process_sentiment_data(sentimentData)

def toDataFrame(wordStats):
    asfValues = pd.DataFrame(
        [(word, freq_neg, freq_pos, sentiment_sum) for word, (freq_neg, freq_pos, sentiment_sum) in wordStats.items()],
        columns=['Word', 'frequency_negative', 'frequency_positive', 'average_sentiment']
    )
    # average all sentiment aggregates to become average of the word count and sentiment tracked
    asfValues['average_sentiment'] = asfValues['average_sentiment'] / (asfValues['frequency_negative'] + asfValues['frequency_positive'])

    #allow lookups by word for easier access to data values
    asfValues.set_index('Word', inplace=True)

    return asfValues

asfValues = toDataFrame(word_stats)
asfValues_ws = toDataFrame(word_stats_ws)

In [90]:
word_stats1, counts1 = process_sentiment_data_1(sentimentData)


In [95]:
word_statsprev, countsprev = process_sentiment_data_prev(sentimentData)

In [96]:
asfValues_1 = toDataFrame(word_stats1)
asfValues_prev = toDataFrame(word_statsprev)

In [None]:
def calculate_word_LLR(word):
    if word in asfValues.index:
            positive_count = asfValues.loc[word, 'frequency_positive']
            negative_count = asfValues.loc[word, 'frequency_negative']        
    else:
            positive_count = 1  #smoothing constants if word doesn't appear at all in the corpus
            negative_count = 1
    
    alpha = 1  # smoothing constant
    V = len(asfValues)  # vocabulary size

    P_w_givenGood = (positive_count + alpha)/ (counts[1] + V)
    P_w_givenBad = (negative_count + alpha)/ (counts[0] + V)
    LLR = np.log(P_w_givenGood/ P_w_givenBad)
    biasing_constant = np.log(negative_count + positive_count)
    value = biasing_constant * LLR
    return value

def calculate_word_LLR_prev(word):
    if word in asfValues.index:
            positive_count = asfValues_prev.loc[word, 'frequency_positive']
            negative_count = asfValues_prev.loc[word, 'frequency_negative']        
    else:
            positive_count = 1  #smoothing constants if word doesn't appear at all in the corpus
            negative_count = 1
    
    alpha = 1  # smoothing constant
    V = len(asfValues_prev)  # vocabulary size

    P_w_givenGood = (positive_count + alpha)/ (counts[1] + V)
    P_w_givenBad = (negative_count + alpha)/ (counts[0] + V)
    LLR = np.log(P_w_givenGood/ P_w_givenBad)
    biasing_constant = np.log(negative_count + positive_count)
    value = biasing_constant * LLR
    return value


def calculate_word_LLR1(word):
    if word in asfValues_1.index:
            positive_count = asfValues.loc[word, 'frequency_positive']
            negative_count = asfValues.loc[word, 'frequency_negative']        
    else:
            positive_count = 1  #smoothing constants if word doesn't appear at all in the corpus
            negative_count = 1
    
    alpha = 1  # smoothing constant
    V = len(asfValues_1)  # vocabulary size

    P_w_givenGood = (positive_count + alpha)/ (counts[1] + V)
    P_w_givenBad = (negative_count + alpha)/ (counts[0] + V)
    LLR = np.log(P_w_givenGood/ P_w_givenBad)
    biasing_constant = np.log(negative_count + positive_count)
    value = biasing_constant * LLR
    return value

def calculate_word_LLR_ws(word):
    if word in asfValues.index:
            positive_count = asfValues_ws.loc[word, 'frequency_positive']
            negative_count = asfValues_ws.loc[word, 'frequency_negative']        
    else:
            positive_count = 1  #smoothing constants if word doesn't appear at all in the corpus
            negative_count = 1
    
    alpha = 1  # smoothing constant
    V = len(asfValues_ws)  # vocabulary size
    P_w_givenGood = (positive_count + alpha)/ (counts[1] + V)
    P_w_givenBad = (negative_count + alpha)/ (counts[0] + V)
    LLR = np.log(P_w_givenGood/ P_w_givenBad)
    biasing_constant = np.log(negative_count + positive_count)
    value = biasing_constant * LLR
    return value

def calculate_sentiment_probability_ws(sentence):
    pattern = re.compile(r"\w+|[^\w\s]")
    words = pattern.findall(sentence.lower())
    total_llr = 0
    for word in words:
          stop_words = set(stopwords.words('english'))
          if word in stop_words:
               continue
          LLR = calculate_word_LLR_ws(word)
          total_llr += LLR
    
    return total_llr

def calculate_sentiment_probability_1(sentence):
    pattern = re.compile(r"\w+|[^\w\s]")
    words = pattern.findall(sentence.lower())
    total_llr = 0
    for word in words:
          LLR = calculate_word_LLR1(word)
          total_llr += LLR
    
    return total_llr

def calculate_sentiment_probability_prev(sentence):
    pattern = re.compile(r"\w+|[^\w\s]")
    words = pattern.findall(sentence.lower())
    total_llr = 0
    for word in words:
          LLR = calculate_word_LLR_prev(word)
          total_llr += LLR
    
    return total_llr

def calculate_sentiment_probability(sentence):
    pattern = re.compile(r"\w+|[^\w\s]")
    words = pattern.findall(sentence.lower())
    total_llr = 0
    for word in words:
          LLR = calculate_word_LLR(word)
          total_llr += LLR
    
    return total_llr

def evaluate_random_baseline(df):
    correct = 0
    total = len(df)

    predictions = []
    actuals = []

    for row in df.itertuples(index=False):
        predicted_sentiment = random.choice([-1, 1])  # Random guess
        actual_sentiment = int(row.sentiment)

        predictions.append(predicted_sentiment)
        actuals.append(actual_sentiment)

        if predicted_sentiment == actual_sentiment:
            correct += 1

    accuracy = correct / total
    print(f"Random Baseline Accuracy: {accuracy:.2%}")
    return predictions, actuals

def evaluate_llr_model_ws(df):
    correct = 0
    total = len(df)

    predictions = []
    actuals = []

    for row in df.itertuples(index=False):
        llr_score = calculate_sentiment_probability_ws(row.tweet)
        predicted_sentiment = 1 if llr_score > 0 else -1  # LLR > 0 → positive, else negative
        actual_sentiment = int(row.sentiment)

        predictions.append(predicted_sentiment)
        actuals.append(actual_sentiment)

        if predicted_sentiment == actual_sentiment:
            correct += 1

    accuracy = correct / total
    print(f"Excluding Stopword Model Predicted Accuracy: {accuracy:.2%}")
    return predictions, actuals

def evaluate_llr_model_prev(df):
    correct = 0
    total = len(df)

    predictions = []
    actuals = []

    for row in df.itertuples(index=False):
        llr_score = calculate_sentiment_probability_prev(row.tweet)
        predicted_sentiment = 1 if llr_score > 0 else -1  # LLR > 0 → positive, else negative
        actual_sentiment = int(row.sentiment)

        predictions.append(predicted_sentiment)
        actuals.append(actual_sentiment)

        if predicted_sentiment == actual_sentiment:
            correct += 1

    accuracy = correct / total
    print(f"Milestone 3 Model Accuracy: {accuracy:.2%}")
    return predictions, actuals

def evaluate_llr_model_1(df):
    correct = 0
    total = len(df)

    predictions = []
    actuals = []

    for row in df.itertuples(index=False):
        llr_score = calculate_sentiment_probability(row.tweet)
        predicted_sentiment = 1 if llr_score > 0 else -1  # LLR > 0 → positive, else negative
        actual_sentiment = int(row.sentiment)

        predictions.append(predicted_sentiment)
        actuals.append(actual_sentiment)

        if predicted_sentiment == actual_sentiment:
            correct += 1

    accuracy = correct / total
    print(f"Normal Model Predicted Accuracy with Negative Weighting: {accuracy:.2%}")
    return predictions, actuals

def evaluate_llr_model(df):
    correct = 0
    total = len(df)

    predictions = []
    actuals = []

    for row in df.itertuples(index=False):
        llr_score = calculate_sentiment_probability(row.tweet)
        predicted_sentiment = 1 if llr_score > 0 else -1  # LLR > 0 → positive, else negative
        actual_sentiment = int(row.sentiment)

        predictions.append(predicted_sentiment)
        actuals.append(actual_sentiment)

        if predicted_sentiment == actual_sentiment:
            correct += 1

    accuracy = correct / total
    print(f"Normal Model Predicted Accuracy with Negative Weighting, and removal of least frequent words: {accuracy:.2%}")
    return predictions, actuals



In [None]:
# test the data against tweets in the dataset
negative_tweets = sentimentData[sentimentData['sentiment'] == -1].sample(n=5000, random_state=6)
positive_tweets = sentimentData[sentimentData['sentiment'] == 1].sample(n=5000, random_state=6)
balanced_sample = pd.concat([negative_tweets, positive_tweets]).reset_index(drop=True)

evaluate_llr_model_prev(balanced_sample)
evaluate_llr_model(balanced_sample)
evaluate_llr_model_1(balanced_sample)
evaluate_llr_model_ws(balanced_sample)
evaluate_random_baseline(balanced_sample)

Milestone 3 Model Accuracy: 75.57%
Normal Model Predicted Accuracy with Negative Weighting, and removal of least frequent words: 76.52%
Normal Model Predicted Accuracy with Negative Weighting: 76.52%
