In [1]:
import pandas as pd

In [2]:
'''
This function collects every word in the LM dictionary whose Source is a 10-K/10-Q filing
(i.e. any Source other than "12of12inf") and returns them as a dictionary of word -> 1.

'''
#convert LM dictionary to dataframe
def all_10k_words(csv_file):
    """Collect every LM-dictionary word that comes from 10-K/10-Q filings.

    input:
    (1) csv_file (str or path-like) - path to the LM dictionary CSV; it must contain
        "Word" and "Source" columns

    output:
    (1) dict{word: 1} - every word whose Source is not "12of12inf"; the value 1 is a
        placeholder so the dict can be used for O(1) membership checks
    """
    # Read the file the caller asked for rather than a hard-coded path.
    lm_dict_df = pd.read_csv(csv_file)
    # Rows tagged "12of12inf" are the ones to exclude; a missing Source is kept,
    # exactly as the `!=` comparison in a row-by-row loop would keep it.
    excluded = lm_dict_df["Source"].eq("12of12inf")
    return dict.fromkeys(lm_dict_df.loc[~excluded, "Word"], 1)
# need all positive and negative words that are 10K/10Q, next step is to see which one of these are +/-
    

In [3]:
# need all positive and negative words that are 10K/10Q 
# must use read_excel function since spreadsheet has multiple tabs/sheets so read_csv cannot handle
# header = None because no column names here
'''
This function finds all 10-K words of a given category from financial statements in LM lexicon.

input: 
(1) category (str) - positive,negative,constraining..., any of the sentiments in the SentimentWordLists file
(2) master_10k - dictionary of all 10-k words
output:
(1) category_10k_words [dict{word (str): sentiment (str)}] - words from the given category that also appear in master_10k, each mapped to that category
'''
def category_10k_word_list(category,master_10k):
    """Keep the words of one sentiment sheet that also occur in the 10-K word set.

    input:
    (1) category (str) - name of the sheet in 'SentimentWordLists.xlsx' to read
        (e.g. 'Positive', 'Negative')
    (2) master_10k (dict) - lookup of all 10-K words; only key membership is used

    output:
    (1) dict{word: category} - every word of the sheet that is a key of master_10k
    """
    # The sheet is read without a header row, so its words live in column 0.
    sheet = pd.read_excel('SentimentWordLists.xlsx', sheet_name=category, header=None)
    return {
        entry: category
        for entry in sheet[0]
        if entry in master_10k  # dict membership test, O(1) per word
    }

In [4]:
'''
This function uses a soft-coded score approach to set the valence of words that are not in the VADER lexicon: 
a default positive value for positive words and a default negative value for all other words. 
Words that are already in the VADER lexicon are left out of the result, so their existing VADER valence is kept.

input: 
(1) some_words [dict{word (str):sentiment(str)}] - 10-k words we're checking against VADER to define a valence
(2) sentiment_score_pos (float) - default score for positive words
(3) sentiment_score_neg (float) - default score for negative words

output:
(1) not_in_vader [dict{word (str): valence (float)}] - all words NOT in VADER from input dictionary with soft-coded valences
'''
def defaults(some_words,sentiment_score_pos,sentiment_score_neg,lexicon_path='vader_lexicon.txt'):
    """Assign preset valences to the words that are missing from the VADER lexicon.

    input:
    (1) some_words [dict{word (str): sentiment (str)}] - words to check against VADER
    (2) sentiment_score_pos (float) - valence given to words labelled 'Positive'
    (3) sentiment_score_neg (float) - valence given to every other word
    (4) lexicon_path (str) - tab-separated lexicon file (word, valence, ...);
        defaults to 'vader_lexicon.txt'

    output:
    (1) not_in_vader [dict{word (str): valence (float)}] - lower-cased words from
        some_words that are NOT in the lexicon, with their preset valence
    """
    # Load word -> valence from the lexicon. The context manager closes the file
    # even if a line fails to parse.
    lex_dict = {}
    with open(lexicon_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Lines read from a file keep their newline, so emptiness has to be
                # tested after stripping; blank lines carry no entry.
                continue
            (word, measure) = stripped.split('\t')[0:2]
            lex_dict[word] = float(measure)

    not_in_vader = {}
    for word, sentiment in some_words.items():
        # The lookup is done on the lower-cased word so that upper-case input
        # still matches lexicon entries.
        lowercase_word = word.lower()
        if lowercase_word in lex_dict:
            continue  # already scored by the lexicon; nothing to add
        if sentiment == 'Positive':
            not_in_vader[lowercase_word] = float(sentiment_score_pos)
        else:
            # Anything not labelled 'Positive' receives the negative default.
            not_in_vader[lowercase_word] = float(sentiment_score_neg)
    return not_in_vader

In [5]:
'''
This function uses a calibrated score approach to find the valence of words that are not in the VADER lexicon by 
taking the avg valence of words that ARE in VADER lexicon and making that the default sentiment score for those that aren't.

input: 
(1) some_words [dict{word (str):sentiment (value not really relevant here)}] - 10-k words to calibrate with VADER


output:
(1) not_in_vader [dict{word (str):calibrated_score (float)}] - all words NOT in VADER from input dictionary, 
with calibrated score
'''
def calibrated(some_words,lexicon_path='vader_lexicon.txt'):
    """Score words missing from the VADER lexicon with a calibrated average valence.

    The calibrated score is the mean lexicon valence of the words in some_words
    that ARE in the lexicon; every word that is not in the lexicon receives it.

    input:
    (1) some_words [dict{word (str): sentiment}] - words to calibrate; only the
        keys are used
    (2) lexicon_path (str) - tab-separated lexicon file (word, valence, ...);
        defaults to 'vader_lexicon.txt'

    output:
    (1) not_in_vader [dict{word (str): calibrated_score (float)}] - lower-cased
        words from some_words that are NOT in the lexicon, all mapped to the
        calibrated average; empty when there is nothing to score

    raises:
    ValueError - if there are words to score but none of the input words is in
    the lexicon, so no average can be computed
    """
    # Load word -> valence from the lexicon. The context manager closes the file
    # even if a line fails to parse.
    lex_dict = {}
    with open(lexicon_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Lines read from a file keep their newline, so emptiness has to be
                # tested after stripping; blank lines carry no entry.
                continue
            (word, measure) = stripped.split('\t')[0:2]
            lex_dict[word] = float(measure)

    # Split the input into valences already known to the lexicon (used for the
    # average) and words that still need a score.
    known_scores = []
    not_in_vader_list = []
    for word in some_words:
        # The lookup is done on the lower-cased word so that upper-case input
        # still matches lexicon entries.
        lowercase_word = word.lower()
        if lowercase_word in lex_dict:
            known_scores.append(lex_dict[lowercase_word])
        else:
            not_in_vader_list.append(lowercase_word)

    if not not_in_vader_list:
        # Nothing needs a score (this also covers empty input), so no average is needed.
        return {}
    if not known_scores:
        raise ValueError(
            "cannot calibrate: none of the given words appear in the lexicon"
        )

    calibrated_avg = float(sum(known_scores) / len(known_scores))
    return {w: calibrated_avg for w in not_in_vader_list}

In [6]:
"""
This function adds a set of words from a dictionary to the VADER lexicon to create a final, complete lexicon as a txt file.

input: 
(1) filename [str] - desired text file name (e.g. 'final_lexicon.txt') to write final lexicon to
(2) original_lexicon [str] - old lexicon that we add to (e.g. the VADER lexicon)
(3) lexicon_to_add [dict{word(str):valence(float)}] - lexicon to be added to the old lexicon


output:
(1) .txt file in working directory with complete lexicon
"""
def create_lexicon(filename,original_lexicon,lexicon_to_add):
    """Write a new lexicon file: the original lexicon followed by the added words.

    input:
    (1) filename [str] - path of the text file to write (overwritten if it exists)
    (2) original_lexicon [str] - path of the existing lexicon to copy from
    (3) lexicon_to_add [dict{word (str): valence (float)}] - entries to append,
        written one per line as "word<TAB>valence"

    output:
    (1) None; the combined lexicon is written to filename
    """
    # Read the original completely before opening the target for writing, so that
    # passing the same path for both arguments appends instead of truncating the
    # source before it has been read.
    with open(original_lexicon, "r", encoding="utf-8") as source:
        base_text = source.read()

    # Without a trailing newline the first added entry would be glued onto the
    # last line of the original lexicon.
    if base_text and not base_text.endswith("\n"):
        base_text += "\n"

    with open(filename, "w", encoding="utf-8") as target:
        target.write(base_text)
        for (word, valence) in lexicon_to_add.items():
            target.write(word + "\t" + str(valence) + "\n")
    return None

In [7]:
"""
calling all our functions to create lexicons with two different approaches: (1) default positive and negatives and (2) calibrated
"""
# Preset valences for the soft-coded approach (words that VADER does not know).
DEFAULT_POSITIVE_SCORE = 2
DEFAULT_NEGATIVE_SCORE = -2

words_10k = all_10k_words('LM_dict.csv')

# Each variable holds the category its name says (these two were swapped before).
positive_10k_words = category_10k_word_list('Positive',words_10k)
negative_10k_words = category_10k_word_list('Negative',words_10k)
# Positive entries first, negative second: a word present in both lists ends up
# labelled 'Negative', and the merged order matches the earlier version of this cell.
positive_and_neg_10k_words = {**positive_10k_words,**negative_10k_words} #merging the two dicts

pos_neg_10k_not_in_vader_default = defaults(
    positive_and_neg_10k_words,DEFAULT_POSITIVE_SCORE,DEFAULT_NEGATIVE_SCORE)
pos_neg_10k_not_in_vader_calibrate = calibrated(positive_and_neg_10k_words)

# Write one combined lexicon per approach.
create_lexicon("default.txt","vader_lexicon.txt",pos_neg_10k_not_in_vader_default)
create_lexicon("calibrated.txt","vader_lexicon.txt",pos_neg_10k_not_in_vader_calibrate)

