 ### Objectives
 * Number of words in report
 * Number of hyperlinks in the report
 * Number of unique domains referenced in the hyperlinks (i.e. Disney.com counts as one domain, regardless of the page accessed)
 * The number of unique entities (using the Stanford NER tool)
 * The % of characters in the text that are numeric (i.e., (# of numeric characters/# of all alphanumeric characters) * 100)

In [59]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
import validators
from urllib.parse import urlparse
import numpy as np
import pandas as pd
from alive_progress import alive_bar
import os
import unicodedata
import time

In [60]:
PATH_TO_JAR='/Users/alekseyvalouev/Desktop/Internship/disclosure-analysis/stanford-ner/stanford-ner.jar'
PATH_TO_MODEL = '/Users/alekseyvalouev/Desktop/Internship/disclosure-analysis/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz'

In [43]:
tagger = StanfordNERTagger(model_filename=PATH_TO_MODEL,path_to_jar=PATH_TO_JAR, encoding='utf-8')

In [44]:
fin_terms = pd.read_csv("financial_statement_terms.csv")
sec_terms = pd.read_csv("SEC_filing_terms.csv")

In [45]:
list_fin_terms = [i[1:] if i[0] == " " else i for i in list(fin_terms['Term'])]
list_sec_terms = [i[1:] if i[0] == " " else i for i in list(sec_terms['Term'])]

In [46]:
file = open("test.txt", "r")
data = file.read()

words = nltk.word_tokenize(data) 

In [47]:
tagged = tagger.tag(words)

In [48]:
real_entities =  [i for i in tagged if i[1] != "O"]

print(real_entities)

[('Some', 'ORGANIZATION'), ('Company', 'ORGANIZATION'), ('Inc.', 'ORGANIZATION'), ('SomeOtherCompany', 'ORGANIZATION'), ('Inc', 'ORGANIZATION'), ('$', 'MONEY'), ('23.00', 'MONEY'), ('10', 'PERCENT'), ('%', 'PERCENT'), ('$', 'MONEY'), ('12', 'MONEY')]


In [79]:
def get_unique_entities(inp):
    values = []
    out = []
    for i in inp:
        if i[0] not in values:
            out.append(i)
            values.append(i[0])
            
    return out

def count_terms(terms, line):
    out = []
    for i in terms:
        if (len(re.findall("[^a-zA-Z\d]" + i.lower() + "[^a-zA-Z\d]", line.lower())) > 0):
            print(re.findall("[^a-zA-Z\d]" + i.lower() + "[^a-zA-Z\d]", line.lower()))
            out.extend(re.findall("[^a-zA-Z\d]" + i.lower() + "[^a-zA-Z\d]", line.lower()))
    print(out)
    return sum([len(re.findall("[^a-zA-Z\d]" + i.lower() + "[^a-zA-Z\d]", line.lower())) for i in terms])

In [93]:
def get_entity_count(line):
    tagged = tagger.tag(nltk.word_tokenize(line))
    entities = [i for i in tagged if i[1] != "O"]
    percent_entities = [i for i in tagged if (i[1] == "PERCENT" and i[0] != "%")]
    dollar_entities = [i for i in tagged if (i[1] == "MONEY" and i[0] != "$")]
    return len(get_unique_entities(entities)), len(entities), len(percent_entities), len(dollar_entities)

def get_word_count(line):
    pattern = re.compile("^[a-zA-Z.,?!'()\"-/)]+$")
    #return len([i for i in line.split(" ") if (pattern.match(i) != None)])
    return len(line.split(" "))

def get_hyperlink_count(line):
    words = line.split(" ")
    i = 0
    domains = []
    for word in words:
        if (validators.url(word)):
            domain = urlparse(word).netloc
            if domain not in domains:
                domains.append(domain)
            i+=1
    return i, len(domains)

def get_numeric_percent(line):
    numbers = sum(c.isdigit() for c in line)
    try:
        return np.round((numbers / len(line))*100, 2)
    except ZeroDivisionError:
        return 0
    
def get_term_count(line):
    return count_terms(list_fin_terms, line), count_terms(list_sec_terms, line)

In [51]:
def get_new_row(filename, line):
    line = line.replace(u'\xa0', u' ')
    line = line.replace(u'\n', u' ')
    unique_ner, ner, percent_ner, dollar_ner = get_entity_count(line)
    wordcount = get_word_count(line)
    hcount, dcount = get_hyperlink_count(line)
    numeric = get_numeric_percent(line)
    count_fin, count_sec = get_term_count(line)
    return {'Filename': filename, 'Wordcount' : wordcount, 'Hyperlinks' : hcount, 
            'Domains' : dcount, 'Unique NER Entities' : unique_ner, 'Total NER Entities': ner, 
            '$ NER' : dollar_ner, '% NER' : percent_ner, '% Numeric Characters' : numeric,
            'Financial Terms' : count_fin, 'SEC Terms' : count_sec}

In [68]:
df = pd.DataFrame(columns = ['Filename', 'Wordcount', 'Hyperlinks', 'Domains', 'Unique NER Entities', 'Total NER Entities', '$ NER', '% NER', '% Numeric Characters', 'Financial Terms', 'SEC Terms'])

In [69]:
# assign directory
directory = 'No Disclaimer Reports'

file_count = len(os.listdir(directory))
 
# progress bar
with alive_bar(file_count, force_tty=True) as bar:
    # loop through files
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            with open(f, "r", encoding='utf-8') as f:
                data = f.read()
                add = get_new_row(filename, data)
                df = pd.concat([df, pd.DataFrame(add, index=[0])]).reset_index(drop=True)
                 
        bar()

|⚠︎                                       | (!) 2/2476 [0%] in 5.1s (0.39/s)                                             


KeyboardInterrupt: 

In [70]:
df

Unnamed: 0,Filename,Wordcount,Hyperlinks,Domains,Unique NER Entities,Total NER Entities,$ NER,% NER,% Numeric Characters,Financial Terms,SEC Terms
0,300301_Aurelius_BOFI_2015_6.p.txt,2314,30,11,88,181,2,2,2.89,52,12
1,74_Bronte Capital_UTA_2010_6.p.txt,1363,24,6,51,104,24,10,2.72,10,3


In [58]:
df.to_csv("out.csv", index=False)

In [95]:
with open("Short Selling Reports-- Text/2_Muddy Waters_DGW_2011.p.txt", "r") as f:
    data = f.read()
    print(data)

Disclaimer:
Use of Muddy Waters LLC’s research is at your own risk.  You should do your own research and due diligence before making any investment decision 
with respect to securities covered herein.  You should assume that as of the publication date of any report, Muddy Waters, LLC (possibly along with or 
through our members, partners, affiliates, employees, and/or consultants) along with our clients and/or investors has a short position in the stock (and/or 
options of the stock) covered herein, and therefore stands to realize significant gains in the event that the price of stock declines.  Following publication 
of any report, we intend to continue transacting in the securities covered therein, and we may be long, short, or neutral at any time hereafter regardless 
of our initial recommendation.   This is not an offer to sell or a solicitation of an offer to buy any security, nor shall any security be offered or sold to any 
person, in any jurisdiction in which such offer would b

In [90]:
pattern = re.compile("^[a-zA-Z.,?!'()\"-/)]+$")
print(pattern.match("K-a"))

<re.Match object; span=(0, 3), match='K-a'>
