## Calculation of Term Weights

In [1]:
# Import packages
import pandas as pd
import numpy as np
import Load_MasterDictionary as lm
from tqdm import tqdm
import re
import glob
import string

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
# Load in master dictionary
dic_path = '/Users/anthony_ning/NYU/2024Internship/EDGAR/LoughranMcDonald_MasterDictionary_2014.csv'
md = lm.load_masterdictionary(dic_path, True)

 ...Loading Master Dictionary 85000
Master Dictionary loaded from file: 
  /Users/anthony_ning/NYU/2024Internship/EDGAR/LoughranMcDonald_MasterDictionary_2014.csv
  85,131 words loaded in master_dictionary.



### tf.idf Weights

In [3]:
# Create a list containing only negative words

wrd_lst = []

for word in md:
    if md[word].sentiment['negative']:
        wrd_lst.append(word)

#### Calculation of the 1st matrix: the raw count of each negative word in txt files

In [4]:
# The first matrix
total_negwrd = len(wrd_lst)

def get_matrix1(doc):
    
    # The occurence of each negative word in txt file
    _o_neg = [0] * total_negwrd

    tokens = re.findall('\w+', doc)  # Note that \w+ splits hyphenated words
    
    for token in tokens:
            
        if not token.isdigit() and len(token) > 1 and token in wrd_lst:
            _o_neg[wrd_lst.index(token)] += 1

    return np.array(_o_neg)

#### Calculation of the 2nd matrix: the number of documents containing at least one occurance of each negative word

In [5]:
# The second matrix
def get_matrix2(doc):
    
    # The occurence of each negative word in txt file
    _o_doc = [0] * total_negwrd

    tokens = re.findall('\w+', doc)  # Note that \w+ splits hyphenated words
    
    for token in tokens:
            
        if not token.isdigit() and len(token) > 1 and token in wrd_lst:
            _o_doc[wrd_lst.index(token)] = 1

    return np.array(_o_doc)

#### Calculation of the 3rd matrix: the total word count in txt files

In [6]:
# The third matrix
def get_matrix3(doc):
    
    _o_total = 0

    tokens = re.findall('\w+', doc)  # Note that \w+ splits hyphenated words
    
    for token in tokens:
            
        if not token.isdigit() and len(token) > 1 and token in md:
            _o_total += 1

    return _o_total

In [8]:
# Calculation of the three matrices for 10-K and 10-Q files
TARGET_FILES = './txt/*.txt'

file_list = glob.glob(TARGET_FILES)

matrix_1 = np.zeros((len(file_list), total_negwrd))
matrix_2 = np.zeros((len(file_list), total_negwrd))
matrix_3 = np.zeros((len(file_list), 1))

orders = []
i = 0

for filename in file_list:
    
    orders.append(filename)
    
    with open(filename, 'r', encoding='UTF-8', errors='ignore') as f_in:
        doc = f_in.read()

    doc = doc.upper()
    
    output_matrix1 = get_matrix1(doc)
    matrix_1[i] = output_matrix1
    output_matrix2 = get_matrix2(doc)
    matrix_2[i] = output_matrix2
    t_wrd = get_matrix3(doc)
    matrix_3[i] = t_wrd

    i += 1
    
matrix_2 = np.sum(matrix_2, axis=0)
matrix_1 = matrix_1.T

#### tf.idf calculation (please refer to the formula in the paper of Loughran and McDonald's)

In [9]:
N = len(file_list)
wrd_weight = np.zeros((total_negwrd, N))

for i in range(total_negwrd):
    for j in range(N):
        if matrix_1[i][j] == 0:
            wrd_weight[i][j] = 0
        else:
            wrd_weight[i][j] = (np.log(matrix_1[i][j])+1)/(np.log(matrix_3[j])+1)*np.log(N/matrix_2[i])

  wrd_weight[i][j] = (np.log(matrix_1[i][j])+1)/(np.log(matrix_3[j])+1)*np.log(N/matrix_2[i])


#### Term weights export to .csv format

In [10]:
# User defined output file
OUTPUT_FILE = './sp_testfile_weight.csv'

with open(OUTPUT_FILE, 'w') as f:
    for i in range(N+1):
        if i == 0:
            f.write(' ,')
        elif i== N:
            f.write(orders[i-1]+'\n')
        else:
            f.write(orders[i-1]+',')
    
    for i in range(total_negwrd):
        f.write(wrd_lst[i]+',')
        for j in range(N):
            if j != N-1:
                f.write(str(wrd_weight[i][j])+',')
            else:
                f.write(str(wrd_weight[i][j])+'\n')

### Proportional Weights

According to the paper of Loughran and McDonald's, the proportional weights are the word list counts relative to the total number of words appearing in txt files. 

In [11]:
def get_weight(doc):
    
    # The occurence of each negative word in txt file
    _percentage_neg = [0] * total_negwrd

    # The total word count
    total_wrd = 0

    tokens = re.findall('\w+', doc)  # Note that \w+ splits hyphenated words
    
    for token in tokens:

        if not token.isdigit() and len(token) > 1 and token in md:
            total_wrd += 1
            
        if not token.isdigit() and len(token) > 1 and token in wrd_lst:
            _percentage_neg[wrd_lst.index(token)] += 1

    return np.array(_percentage_neg)/total_wrd

In [13]:
# Calculate proportional weights for each 10-K and 10-Q file
pro_weights = np.zeros((N, total_negwrd))
pro_orders = []
i = 0

for filename in file_list:
    
    pro_orders.append(filename)
    
    with open(filename, 'r', encoding='UTF-8', errors='ignore') as f_in:
        doc = f_in.read()

    doc = doc.upper()
    pro_weights[i] = get_weight(doc)
    
    i += 1

pro_weights = pro_weights.T

In [14]:
# User defined output file
OUTPUT_FILE = './sp_testfile_proweight.csv'

with open(OUTPUT_FILE, 'w') as f:
    
    for i in range(N+1):
        if i == 0:
            f.write(' ,')
        elif i== N:
            f.write(pro_orders[i-1]+'\n')
        else:
            f.write(pro_orders[i-1]+',')
    
    for i in range(total_negwrd):
        f.write(wrd_lst[i]+',')
        for j in range(N):
            if j != N-1:
                f.write(str(pro_weights[i][j])+',')
            else:
                f.write(str(pro_weights[i][j])+'\n')