 ### Objectives
 * Number of words in report
 * Number of hyperlinks in the report
 * Number of unique domains referenced in the hyperlinks (i.e. Disney.com counts as one domain, regardless of the page accessed)
 * The number of unique entities (using the Stanford NER tool)
 * The % of characters in the text that are numeric (i.e., (# of numeric characters/# of all alphanumeric characters) * 100)

In [25]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
import validators
from urllib.parse import urlparse
import numpy as np
import pandas as pd
from alive_progress import alive_bar
import os
import unicodedata
import time

In [26]:
PATH_TO_JAR='/Users/alekseyvalouev/Desktop/Internship/disclosure-analysis/stanford-ner/stanford-ner.jar'
PATH_TO_MODEL = '/Users/alekseyvalouev/Desktop/Internship/disclosure-analysis/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'

In [27]:
tagger = StanfordNERTagger(model_filename=PATH_TO_MODEL,path_to_jar=PATH_TO_JAR, encoding='utf-8')

In [28]:
file = open("test.txt", "r")
data = file.read()

words = nltk.word_tokenize(data) 

In [29]:
tagged = tagger.tag(words)

In [30]:
real_entities =  [i for i in tagged if i[1] != "O"]

In [31]:
def get_unique_entities(inp):
    values = []
    out = []
    for i in inp:
        if i[0] not in values:
            out.append(i)
            values.append(i[0])
            
    return out

In [32]:
def get_entity_count(line):
    entities = [i for i in tagger.tag(nltk.word_tokenize(line)) if i[1] != "O"]
    return len(get_unique_entities(entities)), len(entities)

def get_word_count(line):
    return len(line.split(" "))

def get_hyperlink_count(line):
    words = line.split(" ")
    i = 0
    domains = []
    for word in words:
        if (validators.url(word)):
            domain = urlparse(word).netloc
            if domain not in domains:
                domains.append(domain)
            i+=1
    return i, len(domains)

def get_numeric_percent(line):
    numbers = sum(c.isdigit() for c in line)
    try:
        return np.round((numbers / len(line))*100, 2)
    except ZeroDivisionError:
        return 0


In [33]:
def get_new_row(filename, line):
    line = line.replace(u'\xa0', u' ')
    line = line.replace(u'\n', u' ')
    unique_ner, ner = get_entity_count(line)
    wordcount = get_word_count(line)
    hcount, dcount = get_hyperlink_count(line)
    numeric = get_numeric_percent(line)
    return {'Filename': filename, 'Wordcount' : wordcount, 'Hyperlinks' : hcount, 'Domains' : dcount, 'Unique NER Entities' : unique_ner, 'Total NER Entities': ner, '% Numeric Characters' : numeric}

In [34]:
df = pd.DataFrame(columns = ['Filename', 'Wordcount', 'Hyperlinks', 'Domains', 'Unique NER Entities', 'Total NER Entities', '% Numeric Characters'])

In [35]:
# assign directory
directory = 'No Disclaimer Reports'

file_count = len(os.listdir(directory))
 
# progress bar
with alive_bar(file_count, force_tty=True) as bar:
    # loop through files
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            with open(f, "r", encoding='utf-8') as f:
                data = f.read()
                add = get_new_row(filename, data)
                df = pd.concat([df, pd.DataFrame(add, index=[0])]).reset_index(drop=True)
                 
        bar()

|████████████████████████████████████████| 2476/2476 [100%] in 1:37:36.7 (0.42/s)                                       


In [39]:
df

Unnamed: 0,Filename,Wordcount,Hyperlinks,Domains,Unique NER Entities,Total NER Entities,% Numeric Characters
0,300301_Aurelius_BOFI_2015_6.p.txt,2314,30,11,38,138,2.89
1,74_Bronte Capital_UTA_2010_6.p.txt,1363,24,6,12,28,2.72
2,472_Trinity Research Group_VNET_2014_1.p.txt,34847,428,48,502,2613,5.49
3,277_Asensio_OLED_2013_14.p.txt,1592,49,4,39,93,2.18
4,91044_Quintessential_ON_2019.p.txt,4638,0,0,41,138,1.93
...,...,...,...,...,...,...,...
2470,307002_WhiteDiamondResearch_AXDX_2019_1.p.txt,6361,70,16,73,171,2.96
2471,331_Richard Pearson_UNIS_2013_1.p.txt,7560,83,12,92,321,2.44
2472,359_Absaroka Capital_GEDU_2011_1.p.txt,8166,3,1,151,836,2.87
2473,630_Mithra Forensic Research_VIPS_2015_3.p.txt,3818,30,3,25,82,2.90


In [38]:
df.to_csv("out.csv", index=False)