# Blackcoffer Internship Selection Task | Level 1

# Loading the Data

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install rich
!pip install html2text
!pip install syllapy

Collecting rich
  Downloading rich-10.6.0-py3-none-any.whl (208 kB)
[?25l[K     |█▋                              | 10 kB 24.9 MB/s eta 0:00:01[K     |███▏                            | 20 kB 27.8 MB/s eta 0:00:01[K     |████▊                           | 30 kB 12.7 MB/s eta 0:00:01[K     |██████▎                         | 40 kB 10.1 MB/s eta 0:00:01[K     |███████▉                        | 51 kB 5.3 MB/s eta 0:00:01[K     |█████████▍                      | 61 kB 5.7 MB/s eta 0:00:01[K     |███████████                     | 71 kB 5.7 MB/s eta 0:00:01[K     |████████████▋                   | 81 kB 5.9 MB/s eta 0:00:01[K     |██████████████▏                 | 92 kB 6.0 MB/s eta 0:00:01[K     |███████████████▊                | 102 kB 6.3 MB/s eta 0:00:01[K     |█████████████████▎              | 112 kB 6.3 MB/s eta 0:00:01[K     |██████████████████▉             | 122 kB 6.3 MB/s eta 0:00:01[K     |████████████████████▍           | 133 kB 6.3 MB/s eta 0:00:01[K    

In [3]:
# Variables for loading data from gdrive
root_dir = "Dataset"
prj_dir = "Blackcoffer-task-1"

In [13]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import syllapy                   # for syllable count
import functools

import html2text
from lxml import html
from lxml import etree
import re
from rich.console import Console
from rich.style import Style
from html2text import html2text

nltk.download('punkt')
nltk.download('stopwords')
cachedStopWords = stopwords.words("english")
warning = Style(bold=True, color="white", bgcolor="magenta")
console = Console()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
report_links_df = pd.read_excel("/content/gdrive/MyDrive/{}/{}/cik_list.xlsx".format(root_dir, prj_dir))
report_links_df.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


In [None]:
report_links_df.shape

(152, 6)

In [None]:
report_links_df['link'] = 'https://www.sec.gov/Archives/' + report_links_df['SECFNAME']
report_links_df.head(3)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,link
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,https://www.sec.gov/Archives/edgar/data/3662/0...
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,https://www.sec.gov/Archives/edgar/data/3662/0...
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,https://www.sec.gov/Archives/edgar/data/3662/0...


Get the constraining and uncertainity words.

In [14]:
constraining_df = pd.read_excel("/content/gdrive/MyDrive/{}/{}/constraining_dictionary.xlsx".format(root_dir, prj_dir))
uncertainity_df = pd.read_excel("/content/gdrive/MyDrive/{}/{}/uncertainty_dictionary.xlsx".format(root_dir, prj_dir))

In [15]:
print("Constraining words: ", constraining_df.shape)
print("Uncertainity words: ", uncertainity_df.shape)

Constraining words:  (184, 1)
Uncertainity words:  (297, 1)


In [16]:
print("Every constraining word is uppercase: ", (constraining_df['Word'].str.isupper()).all())
print("Every uncertainity word is uppercase: ", (uncertainity_df['Word'].str.isupper()).all())

Every constraining word is uppercase:  True
Every uncertainity word is uppercase:  True


In [17]:
constraining_words = constraining_df['Word']
uncertainity_words = uncertainity_df['Word']

# Text Analysis

## 1. Sentiment Analysis

### 1.1 Cleaning Using stop words

In [18]:
stop_words = pd.read_table("/content/gdrive/MyDrive/{}/{}/StopWords_GenericLong.txt".format(root_dir, prj_dir),
                           header=None)[0]
stop_words

0               a
1             a's
2            able
3           about
4           above
          ...    
566         yours
567      yourself
568    yourselves
569             z
570          zero
Name: 0, Length: 571, dtype: object

In [19]:
stop_words.shape

(571,)

In [20]:
stop_words.str.islower().all()

True

In [21]:
# convert to uppercase to compare it 
stop_words = stop_words.str.upper()

### 1.2 Creating Dictionary of Positive and Negative Words
Convert to series for easier masking.

In [22]:
# Negative words series 
negative_words = pd.read_excel("/content/gdrive/MyDrive/{}/{}/LoughranMcDonald_SentimentWordLists_2018.xlsx".format(root_dir, prj_dir), 
                             sheet_name='Negative',
                             header=None)[0]
negative_words

0            ABANDON
1          ABANDONED
2         ABANDONING
3        ABANDONMENT
4       ABANDONMENTS
            ...     
2350      WRONGDOING
2351     WRONGDOINGS
2352        WRONGFUL
2353      WRONGFULLY
2354         WRONGLY
Name: 0, Length: 2355, dtype: object

In [23]:
negative_words = negative_words[ negative_words != stop_words.any() ]
negative_words

0            ABANDON
1          ABANDONED
2         ABANDONING
3        ABANDONMENT
4       ABANDONMENTS
            ...     
2350      WRONGDOING
2351     WRONGDOINGS
2352        WRONGFUL
2353      WRONGFULLY
2354         WRONGLY
Name: 0, Length: 2355, dtype: object

In [24]:
# Positive words series
positive_words = pd.read_excel("/content/gdrive/MyDrive/{}/{}/LoughranMcDonald_SentimentWordLists_2018.xlsx".format(root_dir, prj_dir), 
                             sheet_name='Positive',
                             header=None)[0]
positive_words

0            ABLE
1       ABUNDANCE
2        ABUNDANT
3       ACCLAIMED
4      ACCOMPLISH
          ...    
349           WIN
350        WINNER
351       WINNERS
352       WINNING
353        WORTHY
Name: 0, Length: 354, dtype: object

In [25]:
positive_words = positive_words[ positive_words != stop_words.any()]
positive_words

0            ABLE
1       ABUNDANCE
2        ABUNDANT
3       ACCLAIMED
4      ACCOMPLISH
          ...    
349           WIN
350        WINNER
351       WINNERS
352       WINNING
353        WORTHY
Name: 0, Length: 354, dtype: object

### 1.3 Extracting Derived variables
Downloading the text  SEC/EDGAR financial reports.

In [None]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
text_data = pd.Series([],dtype=pd.StringDtype())

**This download loop download sequentially all the text but can also skip if the request rate limit exceeds. The skipped text is downloaded in next iteration of outermost `while` loop until all the texts are downloaded**

I used `Selenium` for scrapping and `rich` for logging with headless driver to similute as much as real web browser and avoid detection of scrapping.

In [None]:
import urllib3
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

chrome = webdriver.Chrome('chromedriver',options=chrome_options)

while text_data.shape[0] != report_links_df.shape[0]:
  for index, link in enumerate(report_links_df['link']):
    if index not in text_data.index.values:
      with console.status("Downloading File {} of {}".format(text_data.shape[0] + 1, report_links_df.shape[0]), spinner="dots"):
        chrome = webdriver.Chrome('chromedriver',options=chrome_options)
        chrome.get(link)    
        page_source = chrome.page_source      

      if 'Request Rate Threshold Exceeded' in page_source:
        continue                                                       # go on to request for another file
      try:
        data = chrome.find_element_by_tag_name('pre').text
      except NoSuchElementException:
        continue
        
      text_data = text_data.append(pd.Series(data, index=[index]), ignore_index=False)
      console.log("Downloaded File {}".format(index))
    
report_links_df = report_links_df.assign(
    text = text_data
)
console.status("Successfully Downloaded files")

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [None]:
text_data.index.is_unique

True

**Save this DataFrame for further use without scrapping the text over the internet**

In [None]:
# with open('/content/gdrive/MyDrive/{}/{}/report_links_df.json'.format(root_dir, prj_dir), 'w') as f:
#     f.write(report_df.to_json())

**Parsing the SGML document**

In [26]:
PRE_EB = "-----BEGIN PRIVACY-ENHANCED MESSAGE-----"
POST_EB = "-----END PRIVACY-ENHANCED MESSAGE-----"

def unpack_pem(pem_string, index, continue_parsing):
    """Takes a PEM encapsulated message and index of link in DataFrame
    and returns a tuple
    consisting of the header and encapsulated text.  
    """
    if not pem_string.startswith(PRE_EB):
      if index in continue_parsing:
        return None, pem_string.strip()
        raise ValueError("Invalid PEM encoding in File {}; must start with [{}]; found [{}];".format(
                          index, PRE_EB, pem_string.strip()[:len(PRE_EB)]
        ))
    if not pem_string.strip().endswith(POST_EB):
        raise ValueError("Invalid PEM encoding in File {}; must end with [{}]; found [{}]".format(
                          index, POST_EB, pem_string.strip()[-len(POST_EB):]
        ))
    msg = pem_string.strip()[len(PRE_EB):-len(POST_EB)]
    header, encapsulated_text = msg.split('\n\n', 1)
    return (header, encapsulated_text)

In [27]:
replace = {"|": " ", "\n": " ",  "---": " ", "**": " ", 
           "#####": " ", "_": " ", "(": " ", ")": " "}
replace = dict((re.escape(k), v) for k, v in replace.items()) 
pattern = re.compile("|".join(replace.keys()))
  
def sgml_parser(data, index, continue_parsing,
                html_parser=False):
  """
  Input the string downloaded by provided link and returns parsed data
  ### Returns:
      `metadata`: Metadata containing 'type', 'sequence' and 'filename'
      `data`: The text data in uppercase
  ### Parameters:
      `data`: plain string 
      `index`: index in the DataFrame for error reporting
      `continue_parsing`: check `extract_data()` function
      `html_parser`(default `False`): set `True` if encapsulated text is not plain 
              text but html and parsable, OTHERWISE omit it by listing in 
              `skip_parsing` list of `extract_data()` function.
  """
  header, encapsulated_text = unpack_pem(data, index, continue_parsing)

  # Now parse the SGML
  root = html.fromstring(encapsulated_text)
  document = root.xpath('//document')[0]
  metadata = {}
  metadata['type'] = document.xpath('//type')[0].text.strip()
  metadata['sequence'] = document.xpath('//sequence')[0].text.strip()
  try:
    metadata['filename'] = (root.xpath('//sec-document')[0]
                            .text.split(':')[0]
                            .strip())
  except IndexError:
    metadata['filename'] = (root.xpath('//ims-document')[0]   # Some are IMS document
                            .text.split(':')[0]
                            .strip())
  
  if '/' in metadata['filename']:                              # Sometimes filename itself is a link in the downloaded doc
    metadata['filename'] = metadata['filename'].split('/')[-1] # this can raise ValueError in `extract_data()` func
                                                               # as it is required for confirming valid file.
  
  if html_parser:                                              # Somefiles have html instead of plain text
    node = document.xpath("//text")[0]                         # setting `html_parser` var will parse the document
    html_text = node.text + ''.join(etree.tostring(e, encoding="unicode") for e in node)
    data = html2text(html_text)
    # data = pattern.sub(lambda m: replace[re.escape(m.group(0))], str(data)).upper()
    # return metadata, " ".join(data)
    return metadata, pattern.sub(lambda m: replace[re.escape(m.group(0))], data).upper()
  else:
    inner_html = document.xpath('//text')[0]
    try:
      inner_html.xpath('//page')[0]                            # Some docs don't have <PAGE> tag which will raise error 
      data = []                                                # and then the data will be processed in Except block                                  
      for page in inner_html.xpath('//page'):
          data.append(page.text.strip())
    except IndexError:
      data = [inner_html.text.strip()]
  data = " ".join(data).replace("\n", "")
  return metadata, pattern.sub(lambda m: replace[re.escape(m.group(0))], data).upper()
  # data = pattern.sub(lambda m: replace[re.escape(m.group(0))], str(data)).upper()
  # return metadata, data#" ".join(data)

1. Some files have filename a link to the document itself instead of the filename.
2. Some files have html data in itself containing forms which is parsed; Others which looked to be corrupted data at the end (like link on row `63`, '64' in original dataframe) are ignored and not parsed.

In [28]:
def extract_data(df, skip_parsing=[], continue_parsing=[]):
  """
  Extract data by calling `sgml_parse`
  ### Returns 
      A dataframe with extracted data.
  
  ### Parameters: 
      `df`: A DataFrame containing 'text' column
      `skip_parsing` list for rows to skip parsing
      `continue_parsing` list for rows to continue parsing 
          whose data isn't in correct format but still parsable
  """
  extracted_data = pd.Series([], dtype=str)
  # tokens = pd.Series([], dtype='object')

  for index, row in enumerate(df['text']):
    if index not in skip_parsing:
      with console.status("Reading File {} of {}".format(index + 1, df.shape[0]), spinner="dots"):
        metadata, data = sgml_parser(row, index, continue_parsing)
        if data is '':                                               # if content is html text; then it will not be parsed 
          metadata, data = sgml_parser(row, index,                   # and thus data is empty str
                                       continue_parsing, html_parser=True) 
        
        if df['link'][index].split('/')[-1] == metadata['filename']: # Check if the file is exactly what we are looking for    
          extracted_data = extracted_data.append(                    # by comparing file name ( in "link" column) with metadata
                          pd.Series(data, index=[index], dtype=str)  # extract the tokens and append token and data
          ) 
        else:                                                        # OTHERWISE warn
          raise ValueError("UNCOMFIRMED FILE: Should be [{}] | Found [{}]".format(df['link'][index].split('/')[-1],
                                                                                  metadata['filename']))
          
        console.log("File Processed: {} [{}] ".format(index, metadata['filename']))
  df = df.assign(
      extracted_data = extracted_data
  )
  return df

Let's load the saved data and continue.

In [76]:
report_df = pd.read_json('/content/gdrive/MyDrive/{}/{}/report_links_df.json'.format(root_dir, prj_dir), lines=False)
report_df.head(3)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt


> **NOTE**: If you are using downloaded text and NOT the saved dataFrame; 
> **REPLACE THE `report_df` VARIABLE WITH `report_links_df**`.

In [77]:
report_df = extract_data(report_df, skip_parsing=[63, 64],
                  continue_parsing=[105])

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [78]:
report_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 0 to 151
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CIK             152 non-null    int64 
 1   CONAME          152 non-null    object
 2   FYRMO           152 non-null    int64 
 3   FDATE           152 non-null    int64 
 4   FORM            152 non-null    object
 5   link            152 non-null    object
 6   text            152 non-null    object
 7   SECFNAME        152 non-null    object
 8   extracted_data  150 non-null    object
dtypes: int64(3), object(6)
memory usage: 16.9+ KB


In [79]:
report_df = report_df.dropna().reset_index()
report_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           150 non-null    int64 
 1   CIK             150 non-null    int64 
 2   CONAME          150 non-null    object
 3   FYRMO           150 non-null    int64 
 4   FDATE           150 non-null    int64 
 5   FORM            150 non-null    object
 6   link            150 non-null    object
 7   text            150 non-null    object
 8   SECFNAME        150 non-null    object
 9   extracted_data  150 non-null    object
dtypes: int64(4), object(6)
memory usage: 11.8+ KB


The Sentiment analysis needs to be done on **cleaned** text and so the text is tokenized cleaned if not found in stopwords.

In [80]:
@functools.lru_cache(maxsize=None)                              # to speed up cleaning of text by caching
def cleaned_text_tokens(data):
  return [word for word in nltk.word_tokenize(data) if not word in cachedStopWords]

tokens = lambda data: pd.Series(cleaned_text_tokens(data), dtype=str)
tokens_count = lambda data: pd.Series(cleaned_text_tokens(data), dtype=str).count()

positive_word_counter = lambda line: (tokens(line) == positive_words.any()).sum()
negative_word_counter = lambda line: (tokens(line) == negative_words.any()).sum()
uncertainity_word_counter = lambda line: (tokens(line) == uncertainity_words.any()).sum()
constraining_word_counter = lambda line: (tokens(line) == constraining_words.any()).sum()

In [81]:
report_df = report_df.assign(
    POSITIVE_SCORE = report_df['extracted_data'].astype(str).map(positive_word_counter),
    NEGATIVE_SCORE = report_df['extracted_data'].astype(str).map(negative_word_counter),
    UNCERTAINITY_SCORE = report_df['extracted_data'].astype(str).map(uncertainity_word_counter),
    CONSTRAINING_SCORE = report_df['extracted_data'].astype(str).map(constraining_word_counter)
)

In [82]:
report_df = report_df.assign(
    POLARITY_SCORE = (report_df.POSITIVE_SCORE - report_df.NEGATIVE_SCORE) / (report_df.POSITIVE_SCORE + report_df.NEGATIVE_SCORE + 0.000001),
)

In [83]:
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0


## 2. Analysis of Readability


### Average Sentence Length

In [84]:
word_counter = lambda line: len(re.findall(r'\w+', line))
avg_sent_length_counter = lambda line: word_counter(line) / len(line.split("."))

report_df = report_df.assign(
    AVERAGE_SENTENCE_LENGTH = report_df['extracted_data'].astype(str).map(avg_sent_length_counter)
)

### Percentage of Complex words

In [85]:
worder = lambda line: re.findall(r'\w+', line)          # Create words list from single string input
cmplx_word_counter = lambda line: len([word for word in worder(line) if syllapy.count(word) > 2])
prct_cmplx_word = lambda line: cmplx_word_counter(line) / word_counter(line)

report_df = report_df.assign(
    PERCENTAGE_OF_COMPLEX_WORDS = report_df['extracted_data'].astype(str).map(prct_cmplx_word)
)

### Fog index

In [87]:
report_df = report_df.assign(
    FOG_INDEX = 0.4 * (report_df.AVERAGE_SENTENCE_LENGTH + report_df.PERCENTAGE_OF_COMPLEX_WORDS)
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042


## 4. Complex Word Count


In [88]:
report_df = report_df.assign(
    COMPLEX_WORD_COUNT = report_df['extracted_data'].astype(str).map(cmplx_word_counter)
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145


## 5. Word Count

In [89]:
report_df = report_df.assign(
    WORD_COUNT = report_df['extracted_data'].astype(str).map(word_counter)
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT,WORD_COUNT
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183,149854
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615,101098
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145,769


# Positive Word Proportion

In [90]:
report_df = report_df.assign(
    POSITIVE_WORD_PROPOTION = report_df.POSITIVE_SCORE / report_df.WORD_COUNT
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT,WORD_COUNT,POSITIVE_WORD_PROPOTION
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183,149854,2e-05
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615,101098,3e-05
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145,769,0.0


# Negative Word Proportion

In [91]:
report_df = report_df.assign(
    NEGATIVE_WORD_PROPORTION = report_df.NEGATIVE_SCORE / report_df.WORD_COUNT
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT,WORD_COUNT,POSITIVE_WORD_PROPOTION,NEGATIVE_WORD_PROPORTION
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183,149854,2e-05,0.0
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615,101098,3e-05,0.0
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145,769,0.0,0.0


# Uncertainity Word Proportion

In [92]:
report_df = report_df.assign(
    UNCERTAINITY_WORD_PROPORTION = report_df.UNCERTAINITY_SCORE / report_df.WORD_COUNT
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT,WORD_COUNT,POSITIVE_WORD_PROPOTION,NEGATIVE_WORD_PROPORTION,UNCERTAINITY_WORD_PROPORTION
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183,149854,2e-05,0.0,0.0
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615,101098,3e-05,0.0,0.0
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145,769,0.0,0.0,0.0


# Constraining Word Proportion

In [93]:
report_df = report_df.assign(
    CONSTRAINING_WORD_PROPORTION = report_df.CONSTRAINING_SCORE / report_df.WORD_COUNT
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT,WORD_COUNT,POSITIVE_WORD_PROPOTION,NEGATIVE_WORD_PROPORTION,UNCERTAINITY_WORD_PROPORTION,CONSTRAINING_WORD_PROPORTION
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183,149854,2e-05,0.0,0.0,0.0
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615,101098,3e-05,0.0,0.0,0.0
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145,769,0.0,0.0,0.0,0.0


# Constraining Words over the whole report

In [94]:
nltk_tokens = lambda data: pd.Series(nltk.word_tokenize(data.upper()), dtype=str) # these tokens don't have any stopwords removed
constraining_whole_report_word_counter = lambda line: (nltk_tokens(line) == constraining_words.any()).sum()

report_df = report_df.assign(
    CONSTRAINING_WORDS_WHOLE_REPORT = report_df['text'].astype(str).map(constraining_whole_report_word_counter)
)
report_df.head(3)

Unnamed: 0,index,CIK,CONAME,FYRMO,FDATE,FORM,link,text,SECFNAME,extracted_data,POSITIVE_SCORE,NEGATIVE_SCORE,UNCERTAINITY_SCORE,CONSTRAINING_SCORE,POLARITY_SCORE,AVERAGE_SENTENCE_LENGTH,PERCENTAGE_OF_COMPLEX_WORDS,FOG_INDEX,COMPLEX_WORD_COUNT,WORD_COUNT,POSITIVE_WORD_PROPOTION,NEGATIVE_WORD_PROPORTION,UNCERTAINITY_WORD_PROPORTION,CONSTRAINING_WORD_PROPORTION,CONSTRAINING_WORDS_WHOLE_REPORT
0,0,3662,SUNBEAM CORP/FL/,199803,889142400000,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-000413.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,9.088671,0.248128,3.734719,37183,149854,2e-05,0.0,0.0,0.0,0
1,1,3662,SUNBEAM CORP/FL/,199805,895190400000,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950170-98-001001.txt,SUNBEAM CORPORATION AND SUBSIDIARIES ...,3,0,0,0,1.0,14.656132,0.233585,5.955887,23615,101098,3e-05,0.0,0.0,0.0,0
2,2,3662,SUNBEAM CORP/FL/,199808,902966400000,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,edgar/data/3662/0000950172-98-000783.txt,SECURITIES AND EXCHANGE COMMISSION ...,0,0,0,0,0.0,36.619048,0.188557,14.723042,145,769,0.0,0.0,0.0,0.0,0


# File Saving and info check.

In [98]:
report_df.drop(['text', 'link', 'extracted_data'], axis=1, inplace=True) # Drop the nonessential cols

In [100]:
# rearrage the cols
report_df = report_df[['index', 'CIK', 'CONAME', 'FYRMO', 'FDATE', 'FORM', 'SECFNAME',
                       'POSITIVE_SCORE', 'NEGATIVE_SCORE', 'AVERAGE_SENTENCE_LENGTH',
                       'PERCENTAGE_OF_COMPLEX_WORDS', 'FOG_INDEX', 'COMPLEX_WORD_COUNT',
                       'WORD_COUNT', 'UNCERTAINITY_SCORE', 'CONSTRAINING_SCORE', 
                       'POSITIVE_WORD_PROPOTION', 'NEGATIVE_WORD_PROPORTION',
                       'UNCERTAINITY_WORD_PROPORTION', 'CONSTRAINING_WORD_PROPORTION', 'CONSTRAINING_WORDS_WHOLE_REPORT']]

In [105]:
# lowercase the created cols 
for col_name in report_df.columns[7:]:
  report_df.rename({col_name: col_name.lower()}, axis=1, inplace=True)

In [106]:
report_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   index                            150 non-null    int64  
 1   CIK                              150 non-null    int64  
 2   CONAME                           150 non-null    object 
 3   FYRMO                            150 non-null    int64  
 4   FDATE                            150 non-null    int64  
 5   FORM                             150 non-null    object 
 6   SECFNAME                         150 non-null    object 
 7   positive_score                   150 non-null    int64  
 8   negative_score                   150 non-null    int64  
 9   average_sentence_length          150 non-null    float64
 10  percentage_of_complex_words      150 non-null    float64
 11  fog_index                        150 non-null    float64
 12  complex_word_count    

In [107]:
with open('/content/gdrive/MyDrive/{}/{}/final_report_df.csv'.format(root_dir, prj_dir), 'w') as f:
    f.write(report_df.to_csv())