In [40]:
import pandas as pd
import chardet
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [41]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/balubabu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:
#Detecting the encoding of the given file
with open('zacks_arguments.csv', 'rb') as file:
    sample = file.read(10000)  # Read first 10000 bytes as a sample
    detected = chardet.detect(sample)
    encoding = detected['encoding']
df = pd.read_csv('zacks_arguments.csv', encoding=encoding)
#Reading the files into the dataframe
zacks_args_df = pd.read_csv('zacks_arguments.csv', encoding=encoding)

In [43]:
#Reading the positive and negative word lists into the dataframe 
column_positive_words = ['Postive_Words']
column_negative_words = ['Negative_Words']
positive_words_df = pd.read_csv('LM2018P.csv', header=None, names=column_positive_words)
negative_words_df = pd.read_csv('LM2018N.csv', header=None, names=column_negative_words)

In [44]:
#1.	Convert all letters to lowercase
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].str.lower()
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].str.lower()
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].str.lower()

In [45]:
#2.	Remove all special characters. 
def remove_special_characters_double_spaces(text):
    clean_text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].apply(remove_special_characters_double_spaces)
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].apply(remove_special_characters_double_spaces)
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].apply(remove_special_characters_double_spaces)

In [46]:
#3.	Tokenization. Please use unigram.
contains_joiners = positive_words_df['Postive_Words'].str.contains('[_\-.]')
rows_with_joiners = positive_words_df[contains_joiners]
print(rows_with_joiners)
contains_joiners = negative_words_df['Negative_Words'].str.contains('[_\-.]')
rows_with_joiners = negative_words_df[contains_joiners]
print(rows_with_joiners)
#The dictonary doesn't contain any word with word joiners, hence not checking the document. Proceeding to the next steps.

Empty DataFrame
Columns: [Postive_Words]
Index: []
Empty DataFrame
Columns: [Negative_Words]
Index: []


In [47]:
#4. Apply stopword list
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].apply(remove_stopwords)
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].apply(remove_stopwords)
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].apply(remove_stopwords)

In [48]:
#5.	Apply Lemmatization
# Load the English language model
# Make sure to download the model first using: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
def perform_lemmatization(text):
    doc = nlp(text)
    lemmatized_sentence = [token.lemma_ for token in doc]
    lemmatized_sentence = ' '.join(lemmatized_sentence)
    return lemmatized_sentence
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].apply(perform_lemmatization)

In [49]:
#Applying Lemmatization to the dictionary
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].apply(perform_lemmatization)
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].apply(perform_lemmatization)

In [50]:
#Sentiment score with lemmatization and without negate
#Convert the positive and negative word lists into sets for faster lookup
positive_words = set(positive_words_df['Postive_Words'])
negative_words = set(negative_words_df['Negative_Words'])
def count_positive_words(text):
    return sum(1 for word in text.lower().split() if word in positive_words)
def count_negative_words(text):
    return sum(1 for word in text.lower().split() if word in negative_words)
# Count positive and negative words
zacks_args_df['positive'] = zacks_args_df['arguments_clean'].apply(count_positive_words)
zacks_args_df['negative'] = zacks_args_df['arguments_clean'].apply(count_negative_words)
# Calculate sentiment score using the formula
def calculate_sentiment(zacks_args_df):
    zacks_args_df['sentiment_score'] = (zacks_args_df['positive'] - zacks_args_df['negative']) / (zacks_args_df['positive'] + zacks_args_df['negative'] + 1)
    return zacks_args_df
zacks_args_df = calculate_sentiment(zacks_args_df)

In [51]:
#Sentiment score with Lemmatization and negate
negation_words = {'not', 'no', 'never', 'nobody', 'nothing', 'none', 'never', 'hardly', 'scarcely', 'barely'}
def calculate_sentiment_with_negation(text, positive_words, negative_words, negation_words):
    # Tokenize and lemmatize your text first (assuming it's done outside this function)
    tokens = text.split()  # Simple split based on spaces; replace with your lemmatization + tokenization logic
    positive = 0
    negative = 0
    i = 0
    while i < len(tokens):
        word = tokens[i]
        if word in negation_words and i + 1 < len(tokens):
            next_word = tokens[i + 1]
            # Flip the sentiment of the word following the negation
            if next_word in positive_words:
                negative += 1
            elif next_word in negative_words:
                positive += 1
            i += 2  # Skip the next word as it's already considered
        else:
            if word in positive_words:
                positive += 1
            elif word in negative_words:
                negative += 1
            i += 1
    # Calculate the sentiment score considering negation
    sentiment_score = (positive - negative) / (positive + negative + 1)
    return sentiment_score
zacks_args_df['sentiment_score_with_negation'] = zacks_args_df['arguments_clean'].apply(
    lambda x: calculate_sentiment_with_negation(
        x, 
        positive_words, 
        negative_words, 
        negation_words
    )
)

In [52]:
#Sentiment score using NLTK Vader's compound score 
# Download VADER lexicon
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def calculate_sentiment_score_NLTK_Vader(text):
    scores = sia.polarity_scores(text)
    return scores['compound'] 
zacks_args_df['sentiment_score_NLTK_Vader'] = zacks_args_df['arguments_clean'].apply(calculate_sentiment_score_NLTK_Vader)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/balubabu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [53]:
#Calculating the propotion of buys predicted correctly for all 3 cases
def proportion_of_buy(df, sort_column):
    # Sort the DataFrame based on a specified column
    sorted_df = df.sort_values(by=sort_column, ascending = False)
    top_100 = sorted_df.head(100)
    proportion = (top_100['label'] == 'buy').mean()
    return proportion
# Calculate the proportion for each column
proportion_sentiment = proportion_of_buy(zacks_args_df, 'sentiment_score')
proportion_sentiment_with_negate = proportion_of_buy(zacks_args_df, 'sentiment_score_with_negation')
proportion_sentiment_NLTK_Vader = proportion_of_buy(zacks_args_df, 'sentiment_score_NLTK_Vader')

print(f'Number of "Buy"s in the first 100 rows when sorted by sentiment (Lemmatization without negate): {proportion_sentiment*100}')
print(f'Number of "Buy"s in the first 100 rows when sorted by sentiment (Lemmatization with negate): {proportion_sentiment_with_negate*100}')
print(f'Number of "Buy"s in the first 100 rows when sorted by sentiment (NLTK_Vader): {proportion_sentiment_NLTK_Vader*100}')

Number of "Buy"s in the first 100 rows when sorted by sentiment (Lemmatization without negate): 75.0
Number of "Buy"s in the first 100 rows when sorted by sentiment (Lemmatization with negate): 75.0
Number of "Buy"s in the first 100 rows when sorted by sentiment (NLTK_Vader): 88.0
