In [1]:
import pandas as pd
import chardet
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Fetch the NLTK resources used later in this notebook so it also runs on a
# machine where they are not installed yet: the 'stopwords' corpus is read by
# stopwords.words('english') and 'punkt' is the tokenizer data downloaded for
# word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/balubabu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Detect the encoding of the given file from a byte sample, then load it once.
ZACKS_ARGUMENTS_CSV = 'zacks_arguments.csv'
with open(ZACKS_ARGUMENTS_CSV, 'rb') as file:
    sample = file.read(10000)  # Read first 10000 bytes as a sample
detected = chardet.detect(sample)
encoding = detected['encoding']

# Read the file into the dataframe (a single parse instead of two identical ones)
zacks_args_df = pd.read_csv(ZACKS_ARGUMENTS_CSV, encoding=encoding)
# `df` previously came from a second read_csv of the same file. No cell visible
# in this notebook uses it; it is kept as an independent copy so that any code
# still referring to `df` keeps working.
df = zacks_args_df.copy()

In [4]:
# Load the positive and negative word lists. Both files are read with
# header=None, so their first line is treated as data and the column name is
# supplied explicitly.
column_positive_words = ['Postive_Words']
positive_words_df = pd.read_csv('LM2018P.csv', names=column_positive_words, header=None)
column_negative_words = ['Negative_Words']
negative_words_df = pd.read_csv('LM2018N.csv', names=column_negative_words, header=None)

In [5]:
#1. Convert all letters to lowercase (both word lists and the argument text)
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].str.lower()
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].str.lower()
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].str.lower()

In [6]:
#2.	Remove all special characters. 
def remove_special_characters_double_spaces(text):
    """Strip special characters from `text` and collapse runs of whitespace.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted; each remaining run of whitespace is then replaced by one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Whitespace is kept in this first pass on purpose: deleting tabs and
    # newlines here would glue the neighbouring words together before the
    # whitespace collapse below could turn them into a single space.
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
# Clean the argument text and both word lists with the same function.
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].map(remove_special_characters_double_spaces)
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].map(remove_special_characters_double_spaces)
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].map(remove_special_characters_double_spaces)

In [7]:
#3. Tokenization. Please use unigram.
# Look for dictionary entries that contain a word joiner ('_', '-' or '.').
# NOTE: step 2 has already removed every character other than letters, digits
# and whitespace from both word lists, so at this point the check cannot match;
# it would have to run before step 2 to say anything about the raw dictionary.
joiner_pattern = r'[_\-.]'  # raw string: '\-' is not a valid escape in a plain string literal
contains_joiners = positive_words_df['Postive_Words'].str.contains(joiner_pattern, regex=True)
rows_with_joiners = positive_words_df[contains_joiners]
print(rows_with_joiners)
contains_joiners = negative_words_df['Negative_Words'].str.contains(joiner_pattern, regex=True)
rows_with_joiners = negative_words_df[contains_joiners]
print(rows_with_joiners)
#The dictionary doesn't contain any word with word joiners, hence not checking the document. Proceeding to the next steps.

Empty DataFrame
Columns: [Postive_Words]
Index: []
Empty DataFrame
Columns: [Negative_Words]
Index: []


In [8]:
# Average length of arguments_clean, in whitespace-separated words, before
# stop-word removal.

zacks_args_df['word_count_before_sw_removal'] = zacks_args_df['arguments_clean'].map(
    lambda text: len(text.split())
)
average_word_count = zacks_args_df['word_count_before_sw_removal'].mean()
print("Average word count per description:", average_word_count)

Average word count per description: 415.2234762979684


In [9]:
#4. Apply stopword list
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is tokenized with `word_tokenize`; tokens whose lowercase form is
    in NLTK's English stop-word list are removed and the rest are joined with
    single spaces.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by spaces ('' when every token is removed).
    """
    # The stop-word list does not depend on `text`, so it is loaded once and
    # reused instead of being rebuilt for every row this function is applied to.
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word.lower() not in stop_words)
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].map(remove_stopwords)

# Word count per argument after stop-word removal, and its average.
zacks_args_df['word_count'] = zacks_args_df['arguments_clean'].map(lambda text: len(text.split()))
average_word_count = zacks_args_df['word_count'].mean()
print("Average word count per description:", average_word_count)

Average word count per description: 262.37697516930024


In [10]:
# Number of rows in each word list before stop-word removal.
print(len(positive_words_df))
print(len(negative_words_df))

347
2345


In [11]:
# Apply the same stop-word filter to the dictionary entries as to the documents.
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].map(remove_stopwords)
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].map(remove_stopwords)

In [12]:
# Applying remove_stopwords element-wise keeps one entry per row, so len() of
# the column is the same before and after and cannot reveal removed words.
# An entry that consisted only of stop words is now an empty string, so count
# the non-empty entries to see how many dictionary words actually survived.
print((positive_words_df['Postive_Words'].str.len() > 0).sum())
print((negative_words_df['Negative_Words'].str.len() > 0).sum())

347
2345


In [13]:
# First argument text before lemmatization
zacks_args_df['arguments_clean'].loc[0]

'ambarella well known market leading high performance video processing socs consume lowest power space companys proprietary video image processing socs highly configurable providing cost power advantage rivals uses multiple expensive semiconductors video image processing solutions makes ambarella suitable choice wearable camera ip camera automotive dashboard cameras drone camera makers move diversify business lower dependency gopro ambarella forayed vr camera space launching h3 soc january year company claims chip work wonderfully highend drones vr cameras although tough ambarella compete well established players nvidia advanced micro believe given track record innovation company potential strengthen position space nearly automakers various stages developing selfdriving cars creating huge demand camera based socs well computer vision technology notably ambarella already deep technical knowledge camerabased socs enhance computer vision capabilities company acquired vislab jul 2015 visla

In [14]:
#5. Apply Lemmatization
# Load the English language model
# Make sure to download the model first using: python -m spacy download en_core_web_sm
# Only token.lemma_ is read from the processed documents in this notebook, so
# the dependency parser and the named-entity recognizer are switched off to
# avoid running them on every text.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def perform_lemmatization(text):
    """Run `text` through the loaded spaCy pipeline `nlp` and return the
    lemma of every token, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))
# Replace each argument text by its lemmatized form.
zacks_args_df['arguments_clean'] = zacks_args_df['arguments_clean'].map(perform_lemmatization)

In [15]:
# First argument text after lemmatization
zacks_args_df['arguments_clean'].loc[0]

'ambarella well know market lead high performance video processing soc consume low power space companys proprietary video image processing soc highly configurable provide cost power advantage rival use multiple expensive semiconductor video image processing solution make ambarella suitable choice wearable camera ip camera automotive dashboard cameras drone camera maker move diversify business low dependency gopro ambarella foray vr camera space launch h3 soc january year company claim chip work wonderfully highend drones vr camera although tough ambarella compete well establish player nvidia advanced micro believe give track record innovation company potential strengthen position space nearly automaker various stage develop selfdrive car create huge demand camera base socs well computer vision technology notably ambarella already deep technical knowledge camerabase socs enhance computer vision capability company acquire vislab jul 2015 vislab develop concept computer vision intelligent

In [16]:
# Lemmatize the dictionary entries with the same function used for the documents
positive_words_df['Postive_Words'] = positive_words_df['Postive_Words'].map(perform_lemmatization)
negative_words_df['Negative_Words'] = negative_words_df['Negative_Words'].map(perform_lemmatization)

In [20]:
# Calculating the sentiment score
# Turn both word lists into sets so membership tests are fast
positive_words = set(positive_words_df['Postive_Words'].tolist())
negative_words = set(negative_words_df['Negative_Words'].tolist())

def count_positive_words(text):
    """Count the whitespace-separated tokens of the lowercased `text` that are
    members of the global `positive_words` set."""
    tokens = text.lower().split()
    return len([token for token in tokens if token in positive_words])

def count_negative_words(text):
    """Count the whitespace-separated tokens of the lowercased `text` that are
    members of the global `negative_words` set."""
    tokens = text.lower().split()
    return len([token for token in tokens if token in negative_words])

# Per-document number of positive and negative dictionary hits
zacks_args_df['positive'] = zacks_args_df['arguments_clean'].map(count_positive_words)
zacks_args_df['negative'] = zacks_args_df['arguments_clean'].map(count_negative_words)

# Sentiment score: (positive - negative) / (positive + negative + 1)
def calculate_sentiment(zacks_args_df):
    """Add a 'sentiment_score' column to `zacks_args_df` and return the frame.

    The score is computed row-wise from the 'positive' and 'negative' columns.
    The frame passed in is modified in place; the returned object is that same
    frame.
    """
    pos_hits = zacks_args_df['positive']
    neg_hits = zacks_args_df['negative']
    zacks_args_df['sentiment_score'] = (pos_hits - neg_hits) / (pos_hits + neg_hits + 1)
    return zacks_args_df

# Adds the 'sentiment_score' column; calculate_sentiment writes to the frame it
# is given and returns that same frame.
zacks_args_df = calculate_sentiment(zacks_args_df)


In [21]:
# Rows of zacks_args_df whose positive count exceeds the negative count
positive_greater_than_negative = int((zacks_args_df['positive'] > zacks_args_df['negative']).sum())

# Rows of zacks_args_df whose negative count exceeds the positive count
negative_greater_than_positive = int((zacks_args_df['negative'] > zacks_args_df['positive']).sum())

print("Number of records in zacks_args_df where positive > negative:", positive_greater_than_negative)
print("Number of records in zacks_args_df where negative > positive:", negative_greater_than_positive)

Number of records in zacks_args_df where positive > negative: 314
Number of records in zacks_args_df where negative > positive: 98


In [22]:
def count_sentiment_records(df):
    """Return (rows with 'sentiment_score' > 0, rows with 'sentiment_score' < 0) for `df`."""
    scores = df['sentiment_score']
    above_zero = int((scores > 0).sum())
    below_zero = int((scores < 0).sum())
    return above_zero, below_zero
# Count the rows with a strictly positive / strictly negative sentiment score
# and print both counts (rows with a score of exactly 0 are in neither).
positive_count, negative_count = count_sentiment_records(zacks_args_df)
print("Number of records with sentiment score > 0:", positive_count)
print("Number of records with sentiment score < 0:", negative_count)

Number of records with sentiment score > 0: 314
Number of records with sentiment score < 0: 98


In [23]:
# Sentiment score with lemmatization and negation handling
# NOTE(review): 'arguments_clean' has already been through stop-word removal.
# If NLTK's English stop-word list contains words such as 'not' and 'no'
# (TODO confirm), those tokens are gone before this step, which would explain a
# negation-aware score that never differs from the plain one.

negation_words = {
    'not', 'no', 'never', 'nobody', 'nothing', 'none',
    'hardly', 'scarcely', 'barely',
}

def calculate_sentiment_with_negation(text, positive_words, negative_words, negation_words):
    """Score `text` with a one-token negation rule.

    `text` is split on whitespace. A negation word that is not the last token
    flips the polarity of the token right after it (a positive word counts as
    negative and vice versa) and that following token is consumed. Every other
    token is counted normally.

    Returns:
        (positive - negative) / (positive + negative + 1) as a float.
    """
    tokens = text.split()
    last_index = len(tokens) - 1

    pos_hits = 0
    neg_hits = 0
    skip_next = False

    for idx, token in enumerate(tokens):
        if skip_next:
            # This token was consumed by the negation word just before it.
            skip_next = False
            continue

        if token in negation_words and idx < last_index:
            follower = tokens[idx + 1]
            # Inverted polarity for the negated token
            if follower in positive_words:
                neg_hits += 1
            elif follower in negative_words:
                pos_hits += 1
            skip_next = True
            continue

        if token in positive_words:
            pos_hits += 1
        elif token in negative_words:
            neg_hits += 1

    return (pos_hits - neg_hits) / (pos_hits + neg_hits + 1)

# Negation-aware score for every argument text; the three word sets are passed
# through as the extra positional arguments of the scoring function.
zacks_args_df['sentiment_score_with_negation'] = zacks_args_df['arguments_clean'].apply(
    calculate_sentiment_with_negation,
    args=(positive_words, negative_words, negation_words),
)


In [24]:
# Preview the first 10 rows, including the two dictionary-based score columns
zacks_args_df.head(10)

Unnamed: 0,ID,report_name,ticker,report_date,arguments_clean,label,word_count_before_sw_removal,word_count,positive,negative,sentiment_score,sentiment_score_with_negation
0,1,"Ambarella, Inc._Attachment1(2).pdf",AMBA,1/30/2018,ambarella well know market lead high performan...,sell,252,158,9,1,0.727273,0.727273
1,2,"Ambarella, Inc._Attachment1(2).pdf",AMBA,6/11/2018,ambarella well know market lead high performan...,sell,257,172,10,0,0.909091,0.909091
2,3,"Ambarella, Inc._Attachment1(3).pdf",AMBA,10/30/2018,ambarella make steady progress development del...,buy,472,306,19,1,0.857143,0.857143
3,4,"Ambarella, Inc._Attachment1(3).pdf",AMBA,11/26/2020,ambarella make steady progress development del...,sell,442,286,21,0,0.954545,0.954545
4,5,"Ambarella, Inc._Attachment1(4).pdf",AMBA,12/3/2018,ambarella make steady progress development del...,buy,472,306,19,1,0.857143,0.857143
5,6,"Ambarella, Inc._Attachment1.pdf",AMBA,6/14/2017,ambarella well know market lead high performan...,buy,438,281,15,1,0.823529,0.823529
6,7,"Ambarella, Inc._Attachment1.pdf",AMBA,1/5/2018,ambarella well know market lead high performan...,buy,437,281,15,1,0.823529,0.823529
7,8,"Ambarella, Inc._Attachment1.pdf",AMBA,6/4/2018,ambarella well know market lead high performan...,buy,429,275,15,1,0.823529,0.823529
8,9,"Ambarella, Inc._Attachment1.pdf",AMBA,9/6/2018,ambarella effort toward expand reach market ip...,sell,504,328,19,1,0.857143,0.857143
9,10,"Ambarella, Inc._Attachment1.pdf",AMBA,11/18/2020,ambarella make steady progress development del...,buy,24,16,1,0,0.5,0.5


In [25]:
# Rows where the negation-aware score is not equal to the plain score
filtered_df = zacks_args_df.loc[
    ~(zacks_args_df['sentiment_score'] == zacks_args_df['sentiment_score_with_negation'])
]
print(filtered_df)

Empty DataFrame
Columns: [ID, report_name, ticker, report_date, arguments_clean, label, word_count_before_sw_removal, word_count, positive, negative, sentiment_score, sentiment_score_with_negation]
Index: []


In [26]:
# Sentiment score using NLTK VADER's compound score
# Download the VADER lexicon, then create the analyzer instance that
# calculate_sentiment_score_NLTK_Vader reads through the global name `sia`.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def calculate_sentiment_score_NLTK_Vader(text):
    """Return the 'compound' entry of the polarity scores that the global
    analyzer `sia` computes for `text`."""
    return sia.polarity_scores(text)['compound']
    
# VADER compound score for every (cleaned, lemmatized) argument text
zacks_args_df['sentiment_score_NLTK_Vader'] = zacks_args_df['arguments_clean'].map(calculate_sentiment_score_NLTK_Vader)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/balubabu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [27]:
# Preview the first 10 rows, now with the 'sentiment_score_NLTK_Vader' column
zacks_args_df.head(10)

Unnamed: 0,ID,report_name,ticker,report_date,arguments_clean,label,word_count_before_sw_removal,word_count,positive,negative,sentiment_score,sentiment_score_with_negation,sentiment_score_NLTK_Vader
0,1,"Ambarella, Inc._Attachment1(2).pdf",AMBA,1/30/2018,ambarella well know market lead high performan...,sell,252,158,9,1,0.727273,0.727273,0.9776
1,2,"Ambarella, Inc._Attachment1(2).pdf",AMBA,6/11/2018,ambarella well know market lead high performan...,sell,257,172,10,0,0.909091,0.909091,0.9753
2,3,"Ambarella, Inc._Attachment1(3).pdf",AMBA,10/30/2018,ambarella make steady progress development del...,buy,472,306,19,1,0.857143,0.857143,0.9969
3,4,"Ambarella, Inc._Attachment1(3).pdf",AMBA,11/26/2020,ambarella make steady progress development del...,sell,442,286,21,0,0.954545,0.954545,0.9967
4,5,"Ambarella, Inc._Attachment1(4).pdf",AMBA,12/3/2018,ambarella make steady progress development del...,buy,472,306,19,1,0.857143,0.857143,0.9969
5,6,"Ambarella, Inc._Attachment1.pdf",AMBA,6/14/2017,ambarella well know market lead high performan...,buy,438,281,15,1,0.823529,0.823529,0.9931
6,7,"Ambarella, Inc._Attachment1.pdf",AMBA,1/5/2018,ambarella well know market lead high performan...,buy,437,281,15,1,0.823529,0.823529,0.9931
7,8,"Ambarella, Inc._Attachment1.pdf",AMBA,6/4/2018,ambarella well know market lead high performan...,buy,429,275,15,1,0.823529,0.823529,0.994
8,9,"Ambarella, Inc._Attachment1.pdf",AMBA,9/6/2018,ambarella effort toward expand reach market ip...,sell,504,328,19,1,0.857143,0.857143,0.9973
9,10,"Ambarella, Inc._Attachment1.pdf",AMBA,11/18/2020,ambarella make steady progress development del...,buy,24,16,1,0,0.5,0.5,0.6249


In [28]:
#calculating the score
def proportion_of_buy(df, sort_column, top_n=100, label='buy'):
    """Share of `label` rows among the `top_n` highest values of `sort_column`.

    Args:
        df: DataFrame with a 'label' column and the column named by `sort_column`.
        sort_column: Name of the column to rank by, highest first.
        top_n: Number of top-ranked rows to inspect (default 100). If `df` has
            fewer rows, all of them are used.
        label: Value of the 'label' column to count (default 'buy').

    Returns:
        The fraction of the inspected rows whose 'label' equals `label`.
    """
    # Many rows can share the same score. A stable sort makes the order of tied
    # rows deterministic, so which of them fall inside the top `top_n` does not
    # depend on the internals of an unstable sorting algorithm.
    sorted_df = df.sort_values(by=sort_column, ascending=False, kind='mergesort')
    top_rows = sorted_df.head(top_n)
    proportion = (top_rows['label'] == label).mean()
    return proportion

# Proportion of 'buy' labels among the top-ranked rows, once per score column
proportion_sentiment, proportion_sentiment_with_negate, proportion_sentiment_NLTK_Vader = (
    proportion_of_buy(zacks_args_df, score_column)
    for score_column in (
        'sentiment_score',
        'sentiment_score_with_negation',
        'sentiment_score_NLTK_Vader',
    )
)

print(f'Proportion of "Buy" in the first 100 rows when sorted by sentiment: {proportion_sentiment:.2f}')
print(f'Proportion of "Buy" in the first 100 rows when sorted by sentiment_with_negate: {proportion_sentiment_with_negate:.2f}')
print(f'Proportion of "Buy" in the first 100 rows when sorted by sentiment_NLTK_Vader: {proportion_sentiment_NLTK_Vader:.2f}')

Proportion of "Buy" in the first 100 rows when sorted by sentiment: 0.75
Proportion of "Buy" in the first 100 rows when sorted by sentiment_with_negate: 0.75
Proportion of "Buy" in the first 100 rows when sorted by sentiment_NLTK_Vader: 0.88
