In [15]:
#Author: Anthony Melendez

#!/usr/bin/env python
# coding: utf-8
from requests_html import HTMLSession
import pandas as pd 
from bs4 import BeautifulSoup
import re
import unicodedata
from io import StringIO
import time
import requests
import numpy as np

In [20]:
# ---------------------------------------------------------------------------
# Scrape the "Management's Discussion and Analysis" (MD&A) section out of the
# 10-K / 10-Q filings listed in EDGAR's quarterly crawler.idx index files.
#
# Results:
#   * AllData  - one DataFrame with every processed filing from all
#                quarters, including an 'MDA_text' column (NaN = no match).
#   * one text file per filing whose MD&A section was found, in OUTPUT_DIR.
# ---------------------------------------------------------------------------

# ---- Configuration --------------------------------------------------------
start_year = 2019
end_year = 2019
start_qtr = 1
end_qtr = 1

# Debug limit: number of filings processed per quarter.
# Set to None to process every 10-K/10-Q listed in crawler.idx.
# (600 returns 126 filled text files)
MAX_FILINGS = 10

# Folder the MD&A text files are written to (must already exist)
OUTPUT_DIR = 'C:/Users/antho/OneDrive/Desktop/MSBA 502 - Analytical Programming'

# Pause between requests, in seconds, to avoid being blocked out of the site
REQUEST_DELAY = 0.1

# Raw string so that \s and \d reach the regex engine instead of being treated
# as (invalid) string escapes. Compiled once because it is reused per filing.
# NOTE(review): '.' does not match newlines here (no re.DOTALL) and '.+' is
# greedy - confirm this is the intended extent of the captured section.
MDA_PATTERN = re.compile(r'Item\s\d+\.\sManagement.s\sDiscussion\sand\sAnalysis.+Item\s\d\.')

# Characters that cannot appear in a file name (path separators, Windows
# reserved characters); e.g. the company name "8X8 INC /DE/" contains '/'.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

# ---- Scrape ---------------------------------------------------------------
# One HTML session is enough for the whole run.
# NOTE(review): SEC's EDGAR access guidelines ask automated clients to send a
# descriptive User-Agent header - consider setting session.headers accordingly.
session = HTMLSession()

# One DataFrame per processed quarter; concatenated once after the loops
quarterly_frames = []

# NOTE(review): the same start_qtr..end_qtr window is applied to every year,
# so a range such as 2018 Q3 -> 2019 Q1 yields no quarters at all - confirm
# whether a continuous span across years is wanted.
for year in range(start_year, end_year + 1):

    for qtr in range(start_qtr, end_qtr + 1):

        # Scrape the crawler.idx file for the specified year and quarter
        fwf_file = session.get(f'https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{qtr}/crawler.idx').text

        # Store the fixed width file as a dataframe with the respective column sizes
        URLs = pd.read_fwf(StringIO(fwf_file), colspecs=[(0,62),(62,74),(74,86),(86,98),(98,177)])

        # Rename the columns
        URLs.columns = ['Company_Name','Form_type','CIK','Date_Filed','URL']

        # Drop the description rows at the top of the index file
        URLs = URLs.drop(URLs.index[0:4])

        # Keep only the rows whose form type is 10-Q or 10-K
        URLs = URLs.loc[URLs.Form_type.isin(['10-Q','10-K'])]

        # Reset the index and drop the old index column
        URLs = URLs.reset_index(drop=True)

        # Optionally restrict the run to the first MAX_FILINGS filings
        if MAX_FILINGS is not None:
            URLs = URLs.head(MAX_FILINGS)

        # Work on an independent copy so adding a column below does not
        # trigger SettingWithCopyWarning on a slice of the filtered frame
        URLs = URLs.copy()

        # Holds the MD&A text (or NaN) of each 10-K/10-Q document, in row order
        financials_text = []

        for URL in URLs.URL:

            # Parse the filing's index page
            soup = BeautifulSoup(session.get(URL).text, features='lxml')

            # The document link is in the first table, second row, third column
            link = soup.find_all('table')[0].find_all('tr')[1].find_all('td')[2].a.get('href')

            # Add the URL header to the document link
            financial_statement_URL = "https://www.sec.gov" + link

            # Scrape the 10-K/10-Q document itself
            financials_soup = BeautifulSoup(session.get(financial_statement_URL).text, features='lxml')

            # Normalize to unicode (NFKD)
            financials = unicodedata.normalize("NFKD", financials_soup.text)

            # Keep the first match of the MD&A pattern, otherwise NaN
            mda_match = MDA_PATTERN.search(financials)
            financials_text.append(mda_match.group(0) if mda_match else np.nan)

            # Sleep briefly to avoid being blocked out of the site
            time.sleep(REQUEST_DELAY)

        # Append the text to the dataframe as a 6th column
        URLs['MDA_text'] = financials_text

        # For every filing with extracted text, write it to a text file on disk
        for filing in URLs.loc[URLs['MDA_text'].notna()].itertuples(index=False):
            safe_name = UNSAFE_FILENAME_CHARS.sub('_', str(filing.Company_Name))
            with open(f'{OUTPUT_DIR}/Y{year}_Q{qtr}_{safe_name}.txt', 'w', encoding='utf8') as file:
                file.write(filing.MDA_text)

        # Keep this quarter's data so it can be merged with the other quarters/years
        quarterly_frames.append(URLs)

        # Sleep briefly to avoid being blocked out of the site
        time.sleep(REQUEST_DELAY)

# Merge the data from all quarters/years (pd.concat rejects an empty list)
AllData = pd.concat(quarterly_frames, ignore_index=True) if quarterly_frames else pd.DataFrame()
                

                             Company_Name Form_type      CIK  Date_Filed  \
0                   1 800 FLOWERS COM INC      10-Q  1084869  2019-02-08   
1  1347 Property Insurance Holdings, Inc.      10-K  1591890  2019-03-20   
2                1ST CONSTITUTION BANCORP      10-K  1141807  2019-03-15   
3                         1ST SOURCE CORP      10-K    34782  2019-02-22   
4             1st FRANKLIN FINANCIAL CORP      10-K    38723  2019-03-29   
5                22nd Century Group, Inc.      10-K  1347858  2019-03-06   
6                                2U, Inc.      10-K  1459417  2019-02-26   
7                    3AM TECHNOLOGIES INC      10-Q  1667615  2019-01-22   
8                         3D SYSTEMS CORP      10-K   910638  2019-02-28   
9                                   3M CO      10-K    66740  2019-02-07   

                                                 URL  \
0  https://www.sec.gov/Archives/edgar/data/108486...   
1  https://www.sec.gov/Archives/edgar/data/159189..

In [42]:
# Show the filings for which an MD&A section was extracted.
# The extracted text is stored in the 'MDA_text' column, so filter on that.
print(URLs.loc[URLs['MDA_text'].notna()])

                                Company_Name Form_type      CIK  Date_Filed  \
2                   1ST CONSTITUTION BANCORP      10-K  1141807  2019-03-15   
3                            1ST SOURCE CORP      10-K    34782  2019-02-22   
8                            3D SYSTEMS CORP      10-K   910638  2019-02-28   
12                              8X8 INC /DE/      10-Q  1023731  2019-01-30   
18                               AARON'S INC      10-K   706688  2019-02-14   
..                                       ...       ...      ...         ...   
580  Argo Group International Holdings, Ltd.      10-K  1091748  2019-02-26   
585          Armada Hoffler Properties, Inc.      10-K  1569187  2019-02-28   
592                             Ashford Inc.      10-K  1604738  2019-03-08   
594               Assertio Therapeutics, Inc      10-K  1005201  2019-03-11   
598                       Athene Holding Ltd      10-K  1527469  2019-02-27   

                                                   