In [2]:
import re

def extract_header_info(file_path):
    header_info = {}
    patterns = {
        'cik': r'CENTRAL INDEX KEY:\s+(\d+)',
        'conformed_period_report': r'CONFORMED PERIOD OF REPORT:\s+(\d+)',
        'filing_date': r'FILED AS OF DATE:\s+(\d+)',
        'company_name': r'COMPANY CONFORMED NAME:\s+([^\n\r]+)',
        'sic': r'STANDARD INDUSTRIAL CLASSIFICATION:\s+([^\n\r]+)',
        'form_type': r'FORM TYPE:\s+([^\n\r]+)'

    }

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
        for key, pattern in patterns.items():
            match = re.search(pattern, content)
            if match:
                header_info[key] = match.group(1).strip()
            else:
                header_info[key] = 'Not found'
    header_info['file_path'] = file_path
    return header_info


In [4]:
root_path = '/data/workspace_files/10X_cleaned'
dict_arr = []
for yr in [2016, 2017]:
    for qtr in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        path = f'{root_path}/{str(yr)}/{qtr}'
        files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
        for fl in files:
            header = extract_header_info(f'{path}/{fl}')
            dict_arr.append(header)
        print(f'Done for {str(yr)} - {qtr}')

Done for 2016 - QTR1
Done for 2016 - QTR2
Done for 2016 - QTR3
Done for 2016 - QTR4
Done for 2017 - QTR1
Done for 2017 - QTR2
Done for 2017 - QTR3
Done for 2017 - QTR4


In [2]:
!pip install pyarrow
import pyarrow.parquet as pq
import pandas as pd
#df = pd.DataFrame(dict_arr)

#df.to_parquet('/data/workspace_files/10X_cleaned/Master_dictionary.parquet')

Collecting pyarrow
  Downloading pyarrow-15.0.0-cp38-cp38-manylinux_2_28_x86_64.whl (38.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/38.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/38.4 MB[0m [31m76.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/38.4 MB[0m [31m81.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/38.4 MB[0m [31m81.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/38.4 MB[0m [31m84.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.7/38.4 MB[0m [31m84.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m17.7/38.4 MB[0m

In [4]:
df = pq.read_table('/data/workspace_files/10X_cleaned/Master_dictionary.parquet').to_pandas()
df

Unnamed: 0,cik,conformed_period_report,filing_date,company_name,sic,form_type,file_path
0,0001553817,20160324,20160325,WFRBS COMMERCIAL MORTGAGE TRUST 2012-C8,ASSET-BACKED SECURITIES [6189],10-K,/data/workspace_files/10X_cleaned/2016/QTR1/20...
1,0000008177,20151231,20160329,ATLANTIC AMERICAN CORP,LIFE INSURANCE [6311],10-K,/data/workspace_files/10X_cleaned/2016/QTR1/20...
2,0001574085,20151231,20160315,"Ashford Hospitality Prime, Inc.",REAL ESTATE INVESTMENT TRUSTS [6798],10-K,/data/workspace_files/10X_cleaned/2016/QTR1/20...
3,0001305253,20151231,20160330,"Eiger BioPharmaceuticals, Inc.",BIOLOGICAL PRODUCTS (NO DIAGNOSTIC SUBSTANCES)...,10-K,/data/workspace_files/10X_cleaned/2016/QTR1/20...
4,0001318268,20151231,20160330,Madison Technologies Inc.,"MINING, QUARRYING OF NONMETALLIC MINERALS (NO ...",10-K,/data/workspace_files/10X_cleaned/2016/QTR1/20...
...,...,...,...,...,...,...,...
59036,0001103601,20170930,20171122,WGL HOLDINGS INC,NATURAL GAS DISTRIBUTION [4924],10-K,/data/workspace_files/10X_cleaned/2017/QTR4/20...
59037,0000733269,20170930,20171103,ACXIOM CORP,SERVICES-COMPUTER PROCESSING & DATA PREPARATIO...,10-Q,/data/workspace_files/10X_cleaned/2017/QTR4/20...
59038,0001014739,20170930,20171102,"BioScrip, Inc.",SERVICES-HOME HEALTH CARE SERVICES [8082],10-Q,/data/workspace_files/10X_cleaned/2017/QTR4/20...
59039,0001394319,20170930,20171107,"Tracon Pharmaceuticals, Inc.",BIOLOGICAL PRODUCTS (NO DIAGNOSTIC SUBSTANCES)...,10-Q,/data/workspace_files/10X_cleaned/2017/QTR4/20...


In [1]:
!pip install nltk

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download nltk resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [PorterStemmer().stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[nltk_data] Downloading package punkt to /home/datalore/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Read the text files
with open('/data/workspace_files/10X_cleaned/2016/QTR2/20160512_10-Q_edgar_data_8177_0001140361-16-064724.txt', 'r', encoding='utf-8') as file:
    text1 = file.read()

with open('/data/workspace_files/10X_cleaned/2017/QTR2/20170512_10-Q_edgar_data_8177_0001140361-17-019978.txt', 'r', encoding='utf-8') as file:
    text2 = file.read()

# Preprocess text (optional, depending

# Tokenize the text into words
tokens1 = preprocess_text(text1)
tokens2 = preprocess_text(text2)

In [3]:
# Calculate edit distance
edit_distance = nltk.edit_distance(tokens1, tokens2)

print(f"Edit Distance: {edit_distance}")

Edit Distance: 4570


In [6]:
len(tokens1)

9238

In [7]:
tokens1

['header',
 'filestat',
 'filenam',
 '16',
 '064724',
 'txt',
 'filenam',
 'grossfiles',
 '5153583',
 'grossfiles',
 'netfiles',
 '83918',
 'netfiles',
 '236144',
 '2466500',
 '1523410',
 '667056',
 '9',
 'filestat',
 'sec',
 'header',
 '0001140361',
 '16',
 '064724',
 'hdr',
 'sgml',
 '20160512',
 'accept',
 'datetim',
 '20160512113818',
 'access',
 'number',
 '0001140361',
 '16',
 '064724',
 'conform',
 'submiss',
 'type',
 '10',
 'q',
 'public',
 'document',
 'count',
 '51',
 'conform',
 'period',
 'report',
 '20160331',
 'file',
 'date',
 '20160512',
 'date',
 'chang',
 '20160512',
 'filer',
 'compani',
 'data',
 'compani',
 'conform',
 'name',
 'atlant',
 'american',
 'corp',
 'central',
 'index',
 'key',
 '0000008177',
 'standard',
 'industri',
 'classif',
 'life',
 'insur',
 '6311',
 'ir',
 'number',
 '581027114',
 'state',
 'incorpor',
 'ga',
 'fiscal',
 'year',
 'end',
 '1231',
 'file',
 'valu',
 'form',
 'type',
 '10',
 'q',
 'sec',
 'act',
 '1934',
 'act',
 'sec',
 'file',
 

In [1]:
import difflib


# Preprocess text (optional, depending on your needs)
text1 = 'we expect prices to rise'
text2 = 'prices are expected to rise'

# Tokenize the text into lines
lines1 = text1.splitlines()
lines2 = text2.splitlines()

# Compute the differences between the two texts
differ = difflib.Differ()
diff = list(differ.compare(lines1, lines2))

# Count the number of added and deleted words
added_words = sum(line.split()[1:] for line in diff if line.startswith('+'))
deleted_words = sum(line.split()[1:] for line in diff if line.startswith('-'))

# Calculate the total count of changes, additions, and deletions
total_changes = len(added_words) + len(deleted_words)

# Calculate the average size of the old and new documents
average_size = (len(lines1) + len(lines2)) / 2

# Normalize the total count by the average size
similarity_measure = total_changes / average_size

print(f"Similarity Measure: {similarity_measure}")

TypeError: TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [2]:
diff

['- we expect prices to rise', '+ prices are expected to rise']