In [1]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint

from tqdm import tqdm

In [2]:
import project_helper

In [3]:
cik_lookup1 = {
    'AIG' : '0000005272',
    'AXP' : '0000004962',
    'BAC' : '0000070858',
    'C'   : '0000831001',
    'WFC' : '0000072971',
    'BRK' : '0001067983',
    'MS'  : '0000895421',
    'GS'  : '0000886982',
    'JPM' : '0000019617',
    'MA'  : '0001141391'}
cik_lookup = {
    'AAPL': '0000320193',
    'ALTR': '0000768251',
    'AMAT': '0000006951',
    'CSCO': '0000858877',
    'EMC' : '0001408146',
    'HPQ' : '0000047217',
    'IBM' : '0000051143',
    'INTC': '0000050863',
    'MSFT': '0000789019',
    'MU'  : '0000723125'}
cik_lookup2 = {
    'AMGN': '0001100542',
    'BMY' : '0000014272',
    'CELG': '0000816284',
    'ESRX': '0000885721',
    'GILD': '0000882095',
    'BIIB': '0000875045',
    'JNJ' : '0000200406',
    'LLY' : '0000059478',
    'MDT' : '0000064670',
    'CVS' : '0000064803'
}

In [4]:
from bs4 import BeautifulSoup

In [10]:
sec_data = {}
def get_sec_data(cik, doc_type, start=0, count=60):
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
        .format(cik, doc_type, start, count)
    sec_data = sec_api.get(rss_url)
    feed = BeautifulSoup(sec_data.encode('ascii'),'html').feed
    entries = [
        (
            entry.content.find('filing-href').getText(),
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText())
        for entry in feed.find_all('entry', recursive=False)]

    return entries

In [11]:
sec_api = project_helper.SecAPI()

In [13]:
for ticker, cik in cik_lookup.items():
    sec_data[ticker] = get_sec_data(cik, '10-K')

In [14]:
raw_fillings_by_ticker = {}

for ticker, data in sec_data.items():
    raw_fillings_by_ticker[ticker] = {}
    for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
        if (file_type == '10-K'):            file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')            
        raw_fillings_by_ticker[ticker][file_date] = sec_api.get(file_url)


Downloading AAPL Fillings: 100%|██████████████████████████████████████████████████| 28/28 [01:04<00:00,  2.31s/filling]
Downloading ALTR Fillings: 100%|██████████████████████████████████████████████████| 25/25 [01:07<00:00,  2.69s/filling]
Downloading AMAT Fillings: 100%|██████████████████████████████████████████████████| 27/27 [01:03<00:00,  2.35s/filling]
Downloading CSCO Fillings: 100%|██████████████████████████████████████████████████| 27/27 [01:14<00:00,  2.75s/filling]
Downloading EMC Fillings: 100%|███████████████████████████████████████████████████| 18/18 [00:57<00:00,  3.18s/filling]
Downloading HPQ Fillings: 100%|███████████████████████████████████████████████████| 32/32 [01:39<00:00,  3.10s/filling]
Downloading IBM Fillings: 100%|███████████████████████████████████████████████████| 27/27 [01:30<00:00,  3.36s/filling]
Downloading INTC Fillings: 100%|██████████████████████████████████████████████████| 28/28 [04:11<00:00,  8.98s/filling]
Downloading MSFT Fillings: 100%|████████

In [15]:
example_ticker = 'AAPL'
print('Example Document:\n\n{}...'.format(next(iter(raw_fillings_by_ticker[example_ticker].values()))[:1000]))

Example Document:

<SEC-DOCUMENT>0000320193-19-000119.txt : 20191031
<SEC-HEADER>0000320193-19-000119.hdr.sgml : 20191031
<ACCEPTANCE-DATETIME>20191030181236
ACCESSION NUMBER:		0000320193-19-000119
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		96
CONFORMED PERIOD OF REPORT:	20190928
FILED AS OF DATE:		20191031
DATE AS OF CHANGE:		20191030

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			Apple Inc.
		CENTRAL INDEX KEY:			0000320193
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRONIC COMPUTERS [3571]
		IRS NUMBER:				942404110
		STATE OF INCORPORATION:			CA
		FISCAL YEAR END:			0928

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-36743
		FILM NUMBER:		191181423

	BUSINESS ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014
		BUSINESS PHONE:		(408) 996-1010

	MAIL ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	APPLE INC
	

In [16]:
import re


def get_documents(text):

    # TODO: Implement
    extracted_docs = []
    
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')   
    
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]
    
    for doc_start_i, doc_end_i in zip(doc_start_is, doc_end_is):
            extracted_docs.append(text[doc_start_i:doc_end_i])
    
    return extracted_docs


#project_tests.test_get_documents(get_documents)

In [17]:
filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_fillings_by_ticker.items():
    filling_documents_by_ticker[ticker] = {}
    for file_date, filling in tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling'):
        filling_documents_by_ticker[ticker][file_date] = get_documents(filling)


print('\n\n'.join([
    'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
    for file_date, docs in filling_documents_by_ticker[example_ticker].items()
    for doc_i, doc in enumerate(docs)][:3]))

Getting Documents from AAPL Fillings: 100%|███████████████████████████████████████| 28/28 [00:00<00:00, 47.57filling/s]
Getting Documents from ALTR Fillings: 100%|███████████████████████████████████████| 25/25 [00:00<00:00, 34.67filling/s]
Getting Documents from AMAT Fillings: 100%|███████████████████████████████████████| 27/27 [00:02<00:00,  9.75filling/s]
Getting Documents from CSCO Fillings: 100%|███████████████████████████████████████| 27/27 [00:02<00:00, 11.24filling/s]
Getting Documents from EMC Fillings: 100%|████████████████████████████████████████| 18/18 [00:01<00:00, 14.83filling/s]
Getting Documents from HPQ Fillings: 100%|████████████████████████████████████████| 32/32 [00:03<00:00,  9.02filling/s]
Getting Documents from IBM Fillings: 100%|████████████████████████████████████████| 27/27 [00:04<00:00,  6.30filling/s]
Getting Documents from INTC Fillings: 100%|███████████████████████████████████████| 28/28 [00:03<00:00,  7.71filling/s]
Getting Documents from MSFT Fillings: 10

Document 0 Filed on 2019-10-31:

<TYPE>10-K
<SEQUENCE>1
<FILENAME>a10-k20199282019.htm
<DESCRIPTION>10-K
<TEXT>
<XBRL>
<?xml version="1.0" encoding="UTF-8"?>
<!--XBRL Document Created with Wdesk from Workiva-->
<!--p:d84c316ccb4b413...

Document 1 Filed on 2019-10-31:

<TYPE>EX-4.1
<SEQUENCE>2
<FILENAME>a10-kexhibit412019.htm
<DESCRIPTION>EXHIBIT 4.1
<TEXT>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>...

Document 2 Filed on 2019-10-31:

<TYPE>EX-10.15
<SEQUENCE>3
<FILENAME>a10-kexhibit10152019.htm
<DESCRIPTION>EXHIBIT 10.15
<TEXT>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
...





In [18]:
def remove_html_tags(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    return text


def clean_text(text):
    text = text.lower()
    text = remove_html_tags(text)
    
    return text

In [19]:
def get_document_type(doc):
    
    
    # TODO: Implement
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    
    doc_type = type_pattern.findall(doc)[0][len('<TYPE>'):] 
    
    return doc_type.lower()

In [20]:
ten_ks_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    ten_ks_by_ticker[ticker] = []
    for file_date, documents in filling_documents.items():
        for document in documents:
            if get_document_type(document) == '10-k':
                ten_ks_by_ticker[ticker].append({
                    'cik': cik_lookup[ticker],
                    'file': document,
                    'file_date': file_date})


In [21]:
for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Cleaning {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_clean'] = clean_text(ten_k['file'])

Cleaning AAPL 10-Ks: 100%|███████████████████████████████████████████████████████████| 28/28 [00:41<00:00,  1.48s/10-K]
Cleaning ALTR 10-Ks: 100%|███████████████████████████████████████████████████████████| 25/25 [00:37<00:00,  1.52s/10-K]
Cleaning AMAT 10-Ks: 100%|███████████████████████████████████████████████████████████| 27/27 [01:08<00:00,  2.55s/10-K]
Cleaning CSCO 10-Ks: 100%|███████████████████████████████████████████████████████████| 27/27 [00:48<00:00,  1.80s/10-K]
Cleaning EMC 10-Ks: 100%|████████████████████████████████████████████████████████████| 18/18 [00:12<00:00,  1.4410-K/s]
Cleaning HPQ 10-Ks: 100%|████████████████████████████████████████████████████████████| 32/32 [02:06<00:00,  3.94s/10-K]
Cleaning IBM 10-Ks: 100%|████████████████████████████████████████████████████████████| 27/27 [00:08<00:00,  3.2910-K/s]
Cleaning INTC 10-Ks: 100%|███████████████████████████████████████████████████████████| 29/29 [01:12<00:00,  2.49s/10-K]
Cleaning MSFT 10-Ks: 100%|██████████████

In [22]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def lemmatize_words(words):
    
    lemmatized_words = [WordNetLemmatizer().lemmatize(word, 'v') for word in words]
    
    return lemmatized_words

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
word_pattern = re.compile('\w+')

for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))

Lemmatize AAPL 10-Ks: 100%|██████████████████████████████████████████████████████████| 28/28 [00:15<00:00,  1.8010-K/s]
Lemmatize ALTR 10-Ks: 100%|██████████████████████████████████████████████████████████| 25/25 [00:09<00:00,  2.6310-K/s]
Lemmatize AMAT 10-Ks: 100%|██████████████████████████████████████████████████████████| 27/27 [00:13<00:00,  2.0310-K/s]
Lemmatize CSCO 10-Ks: 100%|██████████████████████████████████████████████████████████| 27/27 [00:13<00:00,  2.0410-K/s]
Lemmatize EMC 10-Ks: 100%|███████████████████████████████████████████████████████████| 18/18 [00:05<00:00,  3.3610-K/s]
Lemmatize HPQ 10-Ks: 100%|███████████████████████████████████████████████████████████| 32/32 [00:21<00:00,  1.4610-K/s]
Lemmatize IBM 10-Ks: 100%|███████████████████████████████████████████████████████████| 27/27 [00:03<00:00,  7.0610-K/s]
Lemmatize INTC 10-Ks: 100%|██████████████████████████████████████████████████████████| 29/29 [01:10<00:00,  2.42s/10-K]
Lemmatize MSFT 10-Ks: 100%|█████████████

In [28]:
from nltk.corpus import stopwords


lemma_english_stopwords = lemmatize_words(stopwords.words('english'))

for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Remove Stop Words for {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = [word for word in ten_k['file_lemma'] if word not in lemma_english_stopwords]

Remove Stop Words for AAPL 10-Ks: 100%|██████████████████████████████████████████████| 28/28 [00:06<00:00,  4.5810-K/s]
Remove Stop Words for ALTR 10-Ks: 100%|██████████████████████████████████████████████| 25/25 [00:03<00:00,  6.4210-K/s]
Remove Stop Words for AMAT 10-Ks: 100%|██████████████████████████████████████████████| 27/27 [00:05<00:00,  5.2210-K/s]
Remove Stop Words for CSCO 10-Ks: 100%|██████████████████████████████████████████████| 27/27 [00:04<00:00,  5.9110-K/s]
Remove Stop Words for EMC 10-Ks: 100%|███████████████████████████████████████████████| 18/18 [00:02<00:00,  6.9510-K/s]
Remove Stop Words for HPQ 10-Ks: 100%|███████████████████████████████████████████████| 32/32 [00:08<00:00,  3.8610-K/s]
Remove Stop Words for IBM 10-Ks: 100%|███████████████████████████████████████████████| 27/27 [00:01<00:00, 16.9110-K/s]
Remove Stop Words for INTC 10-Ks: 100%|██████████████████████████████████████████████| 29/29 [00:36<00:00,  1.28s/10-K]
Remove Stop Words for MSFT 10-Ks: 100%|█

In [29]:
sentiments = ['negative', 'positive', 'uncertainty', 'litigious', 'constraining']

sentiment_df = pd.read_csv('C:/Users/KIIT/Downloads/LoughranMcDonald_MasterDictionary_2018.csv')
sentiment_df.columns = [column.lower() for column in sentiment_df.columns] # Lowercase the columns for ease of use

# Remove unused information
sentiment_df = sentiment_df[sentiments + ['word']]
sentiment_df[sentiments] = sentiment_df[sentiments].astype(bool)
sentiment_df = sentiment_df[(sentiment_df[sentiments]).any(1)]

# Apply the same preprocessing to these word as the 10-k word
sentiment_df['word'] = lemmatize_words(sentiment_df['word'].str.lower())
sentiment_df = sentiment_df.drop_duplicates('word')


sentiment_df.head()

Unnamed: 0,negative,positive,uncertainty,litigious,constraining,word
9,True,False,False,False,False,abandon
12,True,False,False,False,False,abandonment
13,True,False,False,False,False,abandonments
51,True,False,False,False,False,abdicate
54,True,False,False,False,False,abdication


In [30]:
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer


def get_bag_of_words(sentiment_words, docs):   

    vec = CountVectorizer(vocabulary=sentiment_words)
    vectors = vec.fit_transform(docs)
    words_list = vec.get_feature_names()
    bag_of_words = np.zeros([len(docs), len(words_list)])
    
    for i in range(len(docs)):
        bag_of_words[i] = vectors[i].toarray()[0]

    return bag_of_words.astype(int)


In [31]:
sentiment_bow_ten_ks = {}

for ticker, ten_ks in ten_ks_by_ticker.items():
    lemma_docs = [' '.join(ten_k['file_lemma']) for ten_k in ten_ks]
    
    sentiment_bow_ten_ks[ticker] = {
        sentiment: get_bag_of_words(sentiment_df[sentiment_df[sentiment]]['word'], lemma_docs)
        for sentiment in sentiments}

In [32]:
type(ten_ks_by_ticker)

dict

In [33]:
from sklearn.metrics import jaccard_similarity_score


def get_jaccard_similarity(bag_of_words_matrix):
    
    
    # TODO: Implement
    jaccard_similarities = []
    bag_of_words_matrix = np.array(bag_of_words_matrix, dtype=bool)
    
    for i in range(len(bag_of_words_matrix)-1):
            u = bag_of_words_matrix[i]
            v = bag_of_words_matrix[i+1]
            jaccard_similarities.append(jaccard_similarity_score(u,v))    
    
    return jaccard_similarities


In [34]:
file_dates = {
    ticker: [ten_k['file_date'] for ten_k in ten_ks]
    for ticker, ten_ks in ten_ks_by_ticker.items()}  

jaccard_similarities = {
    ticker: {
        sentiment_name: get_jaccard_similarity(sentiment_values)
        for sentiment_name, sentiment_values in ten_k_sentiments.items()}
    for ticker, ten_k_sentiments in sentiment_bow_ten_ks.items()}













In [35]:
jaccard_similarities_df_dict = {'date': [], 'ticker': [], 'sentiment': [], 'value': []}


for ticker, ten_k_sentiments in jaccard_similarities.items():
    for sentiment_name, sentiment_values in ten_k_sentiments.items():
        for sentiment_values, sentiment_value in enumerate(sentiment_values):
            jaccard_similarities_df_dict['ticker'].append(ticker)
            jaccard_similarities_df_dict['sentiment'].append(sentiment_name)
            jaccard_similarities_df_dict['value'].append(sentiment_value)
            jaccard_similarities_df_dict['date'].append(file_dates[ticker][1:][sentiment_values])

jaccard_similarities_df = pd.DataFrame(jaccard_similarities_df_dict)
jaccard_similarities_df['date'] = pd.DatetimeIndex(jaccard_similarities_df['date']).year
jaccard_similarities_df['date'] = pd.to_datetime(jaccard_similarities_df['date'], format='%Y')


jaccard_similarities_df

Unnamed: 0,date,ticker,sentiment,value
0,2018-01-01,AAPL,negative,0.990119
1,2017-01-01,AAPL,negative,0.984848
2,2016-01-01,AAPL,negative,0.990119
3,2015-01-01,AAPL,negative,0.988801
4,2014-01-01,AAPL,negative,0.992754
5,2013-01-01,AAPL,negative,0.994071
6,2012-01-01,AAPL,negative,0.991436
7,2011-01-01,AAPL,negative,0.990119
8,2010-01-01,AAPL,negative,0.970356
9,2010-01-01,AAPL,negative,1.000000


In [90]:
jaccard_similarities_df.groupby(['date','ticker']).max()['value']

date        ticker
1994-01-01  AIG       0.991632
            AXP       1.000000
            BAC       0.993412
            C         1.000000
            JPM       1.000000
                        ...   
2019-01-01  GS        0.987952
            JPM       0.987448
            MA        0.991632
            MS        0.991632
            WFC       1.000000
Name: value, Length: 237, dtype: float64

In [36]:
data = pd.DataFrame()
columns = ['date','ticker']

In [37]:
data = jaccard_similarities_df.loc[:,columns]

In [38]:
data.drop_duplicates(inplace=True)

In [39]:
data.reset_index(inplace=True)

In [40]:
data.drop('index',axis=1,inplace=True)

In [43]:
data

Unnamed: 0,date,ticker,sentiment
0,2018-01-01,AAPL,uncertainty
1,2017-01-01,AAPL,constraining
2,2016-01-01,AAPL,uncertainty
3,2015-01-01,AAPL,uncertainty
4,2014-01-01,AAPL,uncertainty
5,2013-01-01,AAPL,constraining
6,2012-01-01,AAPL,litigious
7,2011-01-01,AAPL,negative
8,2010-01-01,AAPL,negative
9,2009-01-01,AAPL,uncertainty


In [42]:
for i in range(230):
    df = jaccard_similarities_df[jaccard_similarities_df['date'] == data.loc[i,'date']]
    df = df[df['ticker'] == data.loc[i,'ticker']]
    a = df['value'].idxmax()
    data.loc[i,'sentiment'] = df.loc[a,'sentiment']
    
        

In [55]:
data.to_csv("Sentiment1.csv")