In [2]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint

from tqdm import tqdm

In [3]:
import project_helper

In [13]:
cik_lookup = {
    'AIG' : '0000005272',
    'AXP' : '0000004962',
    'BAC' : '0000070858',
    'C'   : '0000831001',
    'WFC' : '0000072971',
    'BRK' : '0001067983',
    'MS'  : '0000895421',
    'GS'  : '0000886982',
    'JPM' : '0000019617',
    'MA'  : '0001141391'}
cik_lookup1 = {
    'AAPL': '0000320193',
    'ALTR': '0000768251',
    'AMAT': '0000006951',
    'CSCO': '0000858877',
    'EMC' : '0001408146',
    'HPQ' : '0000047217',
    'IBM' : '0000051143',
    'INTC': '0000050863',
    'MSFT': '0000789019',
    'MU'  : '0000723125'}
cik_lookup2 = {
    'AMGN': '0001100542',
    'BMY' : '0000014272',
    'CELG': '0000816284',
    'ESRX': '0000885721',
    'GILD': '0000882095',
    'BIIB': '0000875045',
    'JNJ' : '0000200406',
    'LLY' : '0000059478',
    'MDT' : '0000064670',
    'CVS' : '0000064803'
}

In [5]:
from bs4 import BeautifulSoup

In [15]:

def get_sec_data(cik, doc_type, start=0, count=60):
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
        .format(cik, doc_type, start, count)
    sec_data = sec_api.get(rss_url)
    feed = BeautifulSoup(sec_data.encode('ascii'),'html').feed
    entries = [
        (
            entry.content.find('filing-href').getText(),                                
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText())
        for entry in feed.find_all('entry', recursive=False)]

    return entries

In [16]:
sec_api = project_helper.SecAPI()

In [20]:
raw_fillings_by_ticker = {}

for ticker, data in sec_data.items():
    raw_fillings_by_ticker[ticker] = {}
    for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
        if (file_type == '10-K'):            file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')            
        raw_fillings_by_ticker[ticker][file_date] = sec_api.get(file_url)


Downloading AIG Fillings: 100%|███████████████████████████████████████████████████| 39/39 [04:53<00:00,  7.53s/filling]
Downloading AXP Fillings: 100%|███████████████████████████████████████████████████| 31/31 [02:10<00:00,  4.22s/filling]
Downloading BAC Fillings: 100%|███████████████████████████████████████████████████| 28/28 [03:58<00:00,  8.52s/filling]
Downloading C Fillings: 100%|█████████████████████████████████████████████████████| 38/38 [03:29<00:00,  5.50s/filling]
Downloading WFC Fillings: 100%|███████████████████████████████████████████████████| 31/31 [02:35<00:00,  5.01s/filling]
Downloading BRK Fillings: 100%|███████████████████████████████████████████████████| 22/22 [01:17<00:00,  3.53s/filling]
Downloading MS Fillings: 100%|████████████████████████████████████████████████████| 26/26 [02:19<00:00,  5.35s/filling]
Downloading GS Fillings: 100%|████████████████████████████████████████████████████| 23/23 [02:12<00:00,  5.75s/filling]
Downloading JPM Fillings: 100%|█████████

In [28]:
example_ticker = 'AXP'
print('Example Document:\n\n{}...'.format(next(iter(raw_fillings_by_ticker[example_ticker].values()))[:1000]))

Example Document:

<SEC-DOCUMENT>0000004962-20-000030.txt : 20200213
<SEC-HEADER>0000004962-20-000030.hdr.sgml : 20200213
<ACCEPTANCE-DATETIME>20200213160442
ACCESSION NUMBER:		0000004962-20-000030
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		162
CONFORMED PERIOD OF REPORT:	20191231
FILED AS OF DATE:		20200213
DATE AS OF CHANGE:		20200213

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			AMERICAN EXPRESS CO
		CENTRAL INDEX KEY:			0000004962
		STANDARD INDUSTRIAL CLASSIFICATION:	FINANCE SERVICES [6199]
		IRS NUMBER:				134922250
		STATE OF INCORPORATION:			NY
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-07657
		FILM NUMBER:		20610369

	BUSINESS ADDRESS:	
		STREET 1:		200 VESEY STREET
		STREET 2:		50TH FLOOR
		CITY:			NEW YORK
		STATE:			NY
		ZIP:			10285
		BUSINESS PHONE:		2126402000

	MAIL ADDRESS:	
		STREET 1:		200 VESEY STREET
		STREET 2:		50TH FLOOR
		CITY:			NEW YORK
		STATE:			NY
		ZIP:			10285
</SEC-HEADER

In [26]:
import re


def get_documents(text):

    # TODO: Implement
    extracted_docs = []
    
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')   
    
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]
    
    for doc_start_i, doc_end_i in zip(doc_start_is, doc_end_is):
            extracted_docs.append(text[doc_start_i:doc_end_i])
    
    return extracted_docs


#project_tests.test_get_documents(get_documents)

In [27]:
filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_fillings_by_ticker.items():
    filling_documents_by_ticker[ticker] = {}
    for file_date, filling in tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling'):
        filling_documents_by_ticker[ticker][file_date] = get_documents(filling)


print('\n\n'.join([
    'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
    for file_date, docs in filling_documents_by_ticker[example_ticker].items()
    for doc_i, doc in enumerate(docs)][:3]))

Getting Documents from AIG Fillings: 100%|████████████████████████████████████████| 38/38 [00:11<00:00,  3.24filling/s]
Getting Documents from AXP Fillings: 100%|████████████████████████████████████████| 31/31 [01:03<00:00,  2.04s/filling]
Getting Documents from BAC Fillings: 100%|████████████████████████████████████████| 28/28 [00:13<00:00,  2.03filling/s]
Getting Documents from C Fillings: 100%|██████████████████████████████████████████| 38/38 [00:12<00:00,  3.08filling/s]
Getting Documents from WFC Fillings: 100%|████████████████████████████████████████| 31/31 [00:14<00:00,  2.18filling/s]
Getting Documents from BRK Fillings: 100%|████████████████████████████████████████| 22/22 [00:02<00:00,  9.18filling/s]
Getting Documents from MS Fillings: 100%|█████████████████████████████████████████| 26/26 [00:14<00:00,  1.76filling/s]
Getting Documents from GS Fillings: 100%|█████████████████████████████████████████| 23/23 [00:56<00:00,  2.44s/filling]
Getting Documents from JPM Fillings: 100

Document 0 Filed on 2020-02-13:

<TYPE>10-K
<SEQUENCE>1
<FILENAME>axp-20191231.htm
<DESCRIPTION>FORM 10-K OF AMERICAN EXPRESS COMPANY
<TEXT>
<XBRL>
<?xml version="1.0" ?><!--XBRL Document Created with Wdesk from Workiva--><!--Copyri...

Document 1 Filed on 2020-02-13:

<TYPE>EX-4.2
<SEQUENCE>2
<FILENAME>axp-20191231exx42.htm
<DESCRIPTION>EX-4.2
<TEXT>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head>
...

Document 2 Filed on 2020-02-13:

<TYPE>EX-10.41
<SEQUENCE>3
<FILENAME>axp-20191231exx1041.htm
<DESCRIPTION>EX-10.41
<TEXT>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><...


In [29]:
def remove_html_tags(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    return text


def clean_text(text):
    text = text.lower()
    text = remove_html_tags(text)
    
    return text

In [32]:
def get_document_type(doc):
    
    
    # TODO: Implement
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    
    doc_type = type_pattern.findall(doc)[0][len('<TYPE>'):] 
    
    return doc_type.lower()

In [33]:
ten_ks_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    ten_ks_by_ticker[ticker] = []
    for file_date, documents in filling_documents.items():
        for document in documents:
            if get_document_type(document) == '10-k':
                ten_ks_by_ticker[ticker].append({
                    'cik': cik_lookup[ticker],
                    'file': document,
                    'file_date': file_date})


In [34]:
for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Cleaning {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_clean'] = clean_text(ten_k['file'])

Cleaning AIG 10-Ks: 100%|████████████████████████████████████████████████████████████| 38/38 [14:59<00:00, 23.67s/10-K]
Cleaning AXP 10-Ks: 100%|████████████████████████████████████████████████████████████| 31/31 [01:25<00:00,  2.75s/10-K]
Cleaning BAC 10-Ks: 100%|████████████████████████████████████████████████████████████| 30/30 [18:19<00:00, 36.65s/10-K]
Cleaning C 10-Ks: 100%|██████████████████████████████████████████████████████████████| 38/38 [12:08<00:00, 19.18s/10-K]
Cleaning WFC 10-Ks: 100%|████████████████████████████████████████████████████████████| 31/31 [00:20<00:00,  1.5410-K/s]
Cleaning BRK 10-Ks: 100%|████████████████████████████████████████████████████████████| 24/24 [01:50<00:00,  4.62s/10-K]
Cleaning MS 10-Ks: 100%|█████████████████████████████████████████████████████████████| 26/26 [05:50<00:00, 13.50s/10-K]
Cleaning GS 10-Ks: 100%|█████████████████████████████████████████████████████████████| 23/23 [04:18<00:00, 11.24s/10-K]
Cleaning JPM 10-Ks: 100%|███████████████

In [35]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def lemmatize_words(words):
    
    lemmatized_words = [WordNetLemmatizer().lemmatize(word, 'v') for word in words]
    
    return lemmatized_words

In [38]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [39]:
word_pattern = re.compile('\w+')

for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))

Lemmatize AIG 10-Ks: 100%|███████████████████████████████████████████████████████████| 38/38 [00:25<00:00,  1.5110-K/s]
Lemmatize AXP 10-Ks: 100%|███████████████████████████████████████████████████████████| 31/31 [00:07<00:00,  4.0510-K/s]
Lemmatize BAC 10-Ks: 100%|███████████████████████████████████████████████████████████| 30/30 [00:15<00:00,  1.8910-K/s]
Lemmatize C 10-Ks: 100%|█████████████████████████████████████████████████████████████| 38/38 [00:19<00:00,  1.9210-K/s]
Lemmatize WFC 10-Ks: 100%|███████████████████████████████████████████████████████████| 31/31 [00:02<00:00, 11.3010-K/s]
Lemmatize BRK 10-Ks: 100%|███████████████████████████████████████████████████████████| 24/24 [00:06<00:00,  3.5110-K/s]
Lemmatize MS 10-Ks: 100%|████████████████████████████████████████████████████████████| 26/26 [00:11<00:00,  2.1710-K/s]
Lemmatize GS 10-Ks: 100%|████████████████████████████████████████████████████████████| 23/23 [00:10<00:00,  2.2010-K/s]
Lemmatize JPM 10-Ks: 100%|██████████████

In [40]:
from nltk.corpus import stopwords


lemma_english_stopwords = lemmatize_words(stopwords.words('english'))

for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Remove Stop Words for {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = [word for word in ten_k['file_lemma'] if word not in lemma_english_stopwords]

Remove Stop Words for AIG 10-Ks: 100%|███████████████████████████████████████████████| 38/38 [00:10<00:00,  3.6710-K/s]
Remove Stop Words for AXP 10-Ks: 100%|███████████████████████████████████████████████| 31/31 [00:02<00:00, 10.5210-K/s]
Remove Stop Words for BAC 10-Ks: 100%|███████████████████████████████████████████████| 30/30 [00:07<00:00,  3.9710-K/s]
Remove Stop Words for C 10-Ks: 100%|█████████████████████████████████████████████████| 38/38 [00:08<00:00,  4.3510-K/s]
Remove Stop Words for WFC 10-Ks: 100%|███████████████████████████████████████████████| 31/31 [00:01<00:00, 24.3410-K/s]
Remove Stop Words for BRK 10-Ks: 100%|███████████████████████████████████████████████| 24/24 [00:03<00:00,  7.3810-K/s]
Remove Stop Words for MS 10-Ks: 100%|████████████████████████████████████████████████| 26/26 [00:05<00:00,  5.0110-K/s]
Remove Stop Words for GS 10-Ks: 100%|████████████████████████████████████████████████| 23/23 [00:04<00:00,  5.0010-K/s]
Remove Stop Words for JPM 10-Ks: 100%|██

In [61]:
sentiments = ['negative', 'positive', 'uncertainty', 'litigious', 'constraining']

sentiment_df = pd.read_csv('C:/Users/KIIT/Downloads/LoughranMcDonald_MasterDictionary_2018.csv')
sentiment_df.columns = [column.lower() for column in sentiment_df.columns] # Lowercase the columns for ease of use

# Remove unused information
sentiment_df = sentiment_df[sentiments + ['word']]
sentiment_df[sentiments] = sentiment_df[sentiments].astype(bool)
sentiment_df = sentiment_df[(sentiment_df[sentiments]).any(1)]

# Apply the same preprocessing to these word as the 10-k word
sentiment_df['word'] = lemmatize_words(sentiment_df['word'].str.lower())
sentiment_df = sentiment_df.drop_duplicates('word')


sentiment_df.head()

Unnamed: 0,negative,positive,uncertainty,litigious,constraining,word
9,True,False,False,False,False,abandon
12,True,False,False,False,False,abandonment
13,True,False,False,False,False,abandonments
51,True,False,False,False,False,abdicate
54,True,False,False,False,False,abdication


In [None]:
sentiment_

In [64]:
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer


def get_bag_of_words(sentiment_words, docs):   

    vec = CountVectorizer(vocabulary=sentiment_words)
    vectors = vec.fit_transform(docs)
    words_list = vec.get_feature_names()
    bag_of_words = np.zeros([len(docs), len(words_list)])
    
    for i in range(len(docs)):
        bag_of_words[i] = vectors[i].toarray()[0]

    return bag_of_words.astype(int)


In [66]:
sentiment_bow_ten_ks = {}

for ticker, ten_ks in ten_ks_by_ticker.items():
    lemma_docs = [' '.join(ten_k['file_lemma']) for ten_k in ten_ks]
    
    sentiment_bow_ten_ks[ticker] = {
        sentiment: get_bag_of_words(sentiment_df[sentiment_df[sentiment]]['word'], lemma_docs)
        for sentiment in sentiments}

In [71]:
sentiment_bow_ten_ks['JPM']['positive'].shape

(37, 249)

In [76]:
from sklearn.metrics import jaccard_similarity_score


def get_jaccard_similarity(bag_of_words_matrix):
    
    
    # TODO: Implement
    jaccard_similarities = []
    bag_of_words_matrix = np.array(bag_of_words_matrix, dtype=bool)
    
    for i in range(len(bag_of_words_matrix)-1):
            u = bag_of_words_matrix[i]
            v = bag_of_words_matrix[i+1]
            jaccard_similarities.append(jaccard_similarity_score(u,v))   
    
    return jaccard_similarities


In [77]:
file_dates = {
    ticker: [ten_k['file_date'] for ten_k in ten_ks]
    for ticker, ten_ks in ten_ks_by_ticker.items()}  

jaccard_similarities = {
    ticker: {
        sentiment_name: get_jaccard_similarity(sentiment_values)
        for sentiment_name, sentiment_values in ten_k_sentiments.items()}
    for ticker, ten_k_sentiments in sentiment_bow_ten_ks.items()}

















In [80]:
jaccard_similarities_df_dict = {'date': [], 'ticker': [], 'sentiment': [], 'value': []}


for ticker, ten_k_sentiments in jaccard_similarities.items():
    for sentiment_name, sentiment_values in ten_k_sentiments.items():
        for sentiment_values, sentiment_value in enumerate(sentiment_values):
            jaccard_similarities_df_dict['ticker'].append(ticker)
            jaccard_similarities_df_dict['sentiment'].append(sentiment_name)
            jaccard_similarities_df_dict['value'].append(sentiment_value)
            jaccard_similarities_df_dict['date'].append(file_dates[ticker][1:][sentiment_values])

jaccard_similarities_df = pd.DataFrame(jaccard_similarities_df_dict)                                                 
jaccard_similarities_df['date'] = pd.DatetimeIndex(jaccard_similarities_df['date']).year
jaccard_similarities_df['date'] = pd.to_datetime(jaccard_similarities_df['date'], format='%Y')


jaccard_similarities_df

Unnamed: 0,date,ticker,sentiment,value
0,2019-01-01,AIG,negative,0.986166
1,2018-01-01,AIG,negative,0.967721
2,2017-01-01,AIG,negative,1.000000
3,2017-01-01,AIG,negative,0.975626
4,2016-01-01,AIG,negative,0.973650
5,2015-01-01,AIG,negative,0.978920
6,2014-01-01,AIG,negative,0.973650
7,2013-01-01,AIG,negative,0.953887
8,2012-01-01,AIG,negative,1.000000
9,2012-01-01,AIG,negative,1.000000


In [90]:
jaccard_similarities_df.groupby(['date','ticker']).max()['value']

date        ticker
1994-01-01  AIG       0.991632
            AXP       1.000000
            BAC       0.993412
            C         1.000000
            JPM       1.000000
                        ...   
2019-01-01  GS        0.987952
            JPM       0.987448
            MA        0.991632
            MS        0.991632
            WFC       1.000000
Name: value, Length: 237, dtype: float64

In [96]:
data = pd.DataFrame()
columns = ['date','ticker']

In [97]:
data = jaccard_similarities_df.loc[:,columns]

In [100]:
data.drop_duplicates(inplace=True)

In [102]:
data.reset_index(inplace=True)

In [103]:
data.drop('index',axis=1,inplace=True)

In [112]:
data[data['ticker'] == 'GS']

Unnamed: 0,date,ticker,sentiment
174,2019-01-01,GS,positive
175,2018-01-01,GS,positive
176,2017-01-01,GS,uncertainty
177,2016-01-01,GS,negative
178,2015-01-01,GS,positive
179,2014-01-01,GS,litigious
180,2013-01-01,GS,uncertainty
181,2012-01-01,GS,uncertainty
182,2011-01-01,GS,uncertainty
183,2010-01-01,GS,litigious


In [107]:
for i in range(237):
    df = jaccard_similarities_df[jaccard_similarities_df['date'] == data.loc[i,'date']]
    df = df[df['ticker'] == data.loc[i,'ticker']]
    a = df['value'].idxmax()
    data.loc[i,'sentiment'] = df.loc[a,'sentiment']
    
        

In [115]:
data.to_csv('Sentiment.csv')