In [1]:
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import *
import requests

In [2]:
def init_spark():
    """Return the SparkSession for this notebook.

    The session is obtained through ``getOrCreate`` on a builder that carries
    one config entry (``spark.some.config.option`` = ``some-value``).

    Returns:
        The ``SparkSession`` handed back by ``getOrCreate``.
    """
    builder = SparkSession.builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

# Data Preparation

### Dataset 1: NASDAQ & NYSE Stocks Histories

In [3]:
# Obtain the SparkSession (via init_spark) that the cells below use.
spark = init_spark()

22/04/05 22:55:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/05 22:55:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Ticker symbols to process. Each one is used as the per-company CSV file name
# when loading stock histories and as the symbol passed to get_sec_filings.
# NOTE(review): 'HP' may have been meant as 'HPQ' (HP Inc.) — confirm the intended ticker.
companies = ['NVDA', 'GOOG', 'AAPL', 'AMZN', 'FB', 'MSFT', 'TXN',
             'AVGO', 'CSCO', 'ADBE', 'ORCL', 'CRM', 'QCOM', 'NFLX', 
             'BKNG', 'EXPE', 'INTC', 'INTU', 'FTNT', 'IBM', 'EBAY', 
             'HP', 'ADSK', 'EA', 'AMD']

In [5]:
# Column layout for the combined stock-history frame; it is used to create the
# empty seed DataFrame that the per-company CSVs are unioned onto.
# NOTE(review): the CSV reads do not pass this schema, so the loaded columns may
# not end up with the types declared here — confirm before relying on them.
schema = StructType([StructField('stock', StringType(), True),
                     StructField('date', DateType(), True),
                     StructField('volume', FloatType(), True),
                     StructField('open', FloatType(), True),
                     StructField('close', FloatType(), True),
                     StructField('high', FloatType(), True),
                     StructField('low', FloatType(), True),
                     StructField('adjclose', FloatType(), True)])

In [10]:
# Seed with an empty frame built from `schema`, then append each company's
# history (tagged with its ticker in a `stock` column) by column name.
stock_histories = spark.createDataFrame([], schema)
for company in companies:
    history_path = f'data/raw/nasdaq-and-nyse-stocks-histories/full_history/{company}.csv'
    df = spark.read.csv(history_path, header=True).withColumn("stock", lit(company))
    stock_histories = stock_histories.unionByName(df)

In [11]:
# Total number of rows across all loaded companies.
stock_histories.count()

                                                                                

164765

In [12]:
# Row count per ticker; show(25) is enough to print one row for each of the 25 tickers.
stock_histories.groupby("stock").count().show(25)

+-----+-----+
|stock|count|
+-----+-----+
|   EA| 7340|
| AAPL| 9556|
| CSCO| 7236|
| QCOM| 6775|
| EXPE| 3347|
| INTC| 9745|
|   HP| 9597|
| INTU| 6461|
| GOOG| 3579|
|   FB| 1627|
| NFLX| 4143|
|  IBM|12323|
| ADSK| 8408|
|  TXN|11711|
| AVGO| 2329|
| EBAY| 5062|
|  AMD| 9745|
| ORCL| 8232|
| AMZN| 5405|
| MSFT| 8231|
|  CRM| 3619|
| ADBE| 8125|
| BKNG| 4933|
| FTNT| 2256|
| NVDA| 4980|
+-----+-----+



In [13]:
# Remove the `adjclose` column from the combined frame, then preview the first 10 rows.
stock_histories = stock_histories.drop("adjclose")
stock_histories.show(10)

+-----+----------+--------+------------------+------------------+------------------+------------------+
|stock|      date|  volume|              open|             close|              high|               low|
+-----+----------+--------+------------------+------------------+------------------+------------------+
| NVDA|2018-11-02|11320900|217.72999572753906| 214.9199981689453|             222.0| 210.2100067138672|
| NVDA|2018-11-01|14163200| 212.3000030517578|218.11000061035156|218.49000549316406|207.19000244140625|
| NVDA|2018-10-31|18644300|209.64999389648438| 210.8300018310547|212.58999633789062|204.00999450683594|
| NVDA|2018-10-30|20179800| 186.5500030517578|             203.0|203.39999389648438| 185.6199951171875|
| NVDA|2018-10-29|18950400|203.99000549316406| 185.6199951171875| 204.1300048828125|176.00999450683594|
| NVDA|2018-10-26|16619600|198.30999755859375| 198.2899932861328|204.83999633789062| 193.1199951171875|
| NVDA|2018-10-25|23793000|195.47000122070312|207.83999633789062

In [14]:
# Optional date filter, currently disabled: with it commented out, the row
# counts in the following cells are the same as the earlier ones.
# stock_histories = stock_histories.filter(stock_histories.date > "2010-01-01")

In [15]:
# Re-check the total row count after the (currently disabled) date filter cell.
stock_histories.count()

164765

In [16]:
# Re-check the per-ticker row counts after the (currently disabled) date filter cell.
stock_histories.groupby("stock").count().show(25)

+-----+-----+
|stock|count|
+-----+-----+
|   EA| 7340|
| AAPL| 9556|
| CSCO| 7236|
| QCOM| 6775|
| EXPE| 3347|
| INTC| 9745|
|   HP| 9597|
| INTU| 6461|
| GOOG| 3579|
|   FB| 1627|
| NFLX| 4143|
|  IBM|12323|
| ADSK| 8408|
|  TXN|11711|
| AVGO| 2329|
| EBAY| 5062|
|  AMD| 9745|
| ORCL| 8232|
| AMZN| 5405|
| MSFT| 8231|
|  CRM| 3619|
| ADBE| 8125|
| BKNG| 4933|
| FTNT| 2256|
| NVDA| 4980|
+-----+-----+



In [17]:
# Write the combined histories as CSV with a header row. repartition(1) collapses
# the data to a single partition before writing. mode("overwrite") lets the
# notebook be re-run: the default save mode raises if the target path already exists.
stock_histories.repartition(1).write.mode("overwrite").option("header",True)\
                                .csv('data/stock_histories.csv')

[Stage 53:>                                                         (0 + 1) / 1]                                                                                

### Dataset 2: SEC filings

In [None]:
# Maps each canonical income-statement data point to the label variants that
# are accepted for it. A reported line item is matched by exact label, so every
# variant must be its own list element (adjacent string literals without a
# comma are silently concatenated into one string).
report_keys = {
    'Total operating expenses': ['Total operating expenses', 
                                 'Total costs and expenses', 
                                 'Costs and expenses', 
                                 'Operating expenses'],
    'Gross profit': ['Gross profit', 'Gross margin', 'GROSS MARGIN', 'Total gross profit'],
    'Income from operations': ['Income from operations', 'Operating income', 'OPERATING INCOME',
                               'Income from continuing operations', 'Operating income (loss)'],
    'Net income': ['Net income', 'NET INCOME', 'Net income (loss)'],
    'Cost of revenue': ['Cost of revenue', 'Cost of revenues', 'Cost of sales',
                        'Total cost of sales', 'Total cost of revenues', 'Cost of net revenues']
}

In [None]:
def filter_financial_report(report, label_aliases=None):
    """Pick the tracked income-statement values out of one filing's report.

    Args:
        report: Mapping for a single filing. Its ``'ic'`` entry is read as a
            list of line items, each a dict with ``'label'`` and ``'value'``.
        label_aliases: Optional mapping of canonical data-point name to the
            list of labels accepted for it. Defaults to the module-level
            ``report_keys``.

    Returns:
        Dict of canonical name -> reported value for every data point that has
        a line item with an accepted label. When several line items match the
        same data point, the last one in the report wins. The dict is empty
        when the report has no ``'ic'`` section.
    """
    if label_aliases is None:
        label_aliases = report_keys
    # A report without an 'ic' section yields no data points instead of a KeyError.
    line_items = report.get('ic') or []
    relevant_data_points = {}
    for canonical_name, accepted_labels in label_aliases.items():
        for item in line_items:
            if item.get('label') in accepted_labels:
                relevant_data_points[canonical_name] = item.get('value')
    return relevant_data_points

In [None]:
def validity_check(report, data_point):
    """Return ``report[data_point]`` as an ``int``, or ``None`` if unusable.

    Args:
        report: Mapping of data-point name -> raw reported value.
        data_point: Name of the data point to read.

    Returns:
        ``int(report[data_point])`` when the key is present and the value can
        be converted; ``None`` when the key is absent or the conversion fails.
    """
    if data_point not in report:
        return None
    try:
        return int(report[data_point])
    # Only conversion failures mean "no usable value"; anything else
    # (e.g. KeyboardInterrupt) must not be swallowed.
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
def clean_financial_statements(response):
    """Flatten an API response into one record per filing.

    Args:
        response: Object whose ``json()`` returns a dict with a ``'data'``
            list; every element provides ``'report'``, ``'symbol'``,
            ``'year'`` and ``'quarter'``.

    Returns:
        List of dicts with the keys ``symbol``, ``year``, ``quarter``,
        ``total_operating_expenses``, ``gross_profit``, ``operating_income``,
        ``net_income`` and ``cost_of_revenue``. Each financial field holds
        whatever ``validity_check`` returns for that data point.
    """
    # Output column -> canonical data-point name, in output order.
    column_sources = (
        ('total_operating_expenses', 'Total operating expenses'),
        ('gross_profit', 'Gross profit'),
        ('operating_income', 'Income from operations'),
        ('net_income', 'Net income'),
        ('cost_of_revenue', 'Cost of revenue'),
    )
    records = []
    for filing in response.json()['data']:
        report = filter_financial_report(filing['report'])
        record = {
            'symbol': filing['symbol'],
            'year': filing['year'],
            'quarter': filing['quarter'],
        }
        for column, data_point in column_sources:
            record[column] = validity_check(report, data_point)
        records.append(record)
    return records

In [None]:
def get_sec_filings(symbol):
    """Fetch and clean the quarterly reported financials for one ticker.

    The API token is read from the ``FINNHUB_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Args:
        symbol: Ticker symbol to query.

    Returns:
        The list of records produced by ``clean_financial_statements`` for the
        response.

    Raises:
        RuntimeError: If ``FINNHUB_API_KEY`` is not set.
        requests.HTTPError: If the API answers with an error status.
    """
    import os  # local import keeps this cell runnable on its own

    token = os.environ.get('FINNHUB_API_KEY')
    if not token:
        raise RuntimeError(
            'FINNHUB_API_KEY is not set; export your Finnhub API token before fetching filings'
        )
    response = requests.get(
        'https://finnhub.io/api/v1/stock/financials-reported',
        # Passing params lets requests build and encode the query string.
        params={'symbol': symbol, 'freq': 'quarterly'},
        headers={'X-Finnhub-Token': token},
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Fail with the HTTP error instead of a KeyError on the missing 'data' key later on.
    response.raise_for_status()
    return clean_financial_statements(response)

In [None]:
# Fetch and clean the filings for every ticker in `companies`, accumulating
# all records in one flat list.
all_sec_filings = []
for company in companies:
    sec_filings = get_sec_filings(company)
    all_sec_filings.extend(sec_filings)
    print(f'Completed extracting {company}')  # progress line, one per ticker

In [None]:
# Inspect the first cleaned filing record.
all_sec_filings[0]

In [None]:
# Column layout for the SEC-filings frame; field names match the keys of the
# records built by clean_financial_statements.
# Note: this rebinds `schema`, replacing the stock-history schema defined earlier.
schema = StructType([StructField("symbol", StringType(), True),
                     StructField("year", IntegerType(), True),
                     StructField("quarter", IntegerType(), True),
                     StructField("total_operating_expenses", LongType(), True),
                     StructField("gross_profit", LongType(), True),
                     StructField("operating_income", LongType(), True),
                     StructField("net_income", LongType(), True),
                     StructField("cost_of_revenue", LongType(), True)])

In [None]:
# Build a Spark DataFrame from the collected filing records using the explicit schema.
# Note: this rebinds `sec_filings`, which the extraction loop used for per-company lists.
sec_filings = spark.createDataFrame(all_sec_filings, schema=schema)

In [None]:
# Total number of filing records.
sec_filings.count()

In [None]:
# Number of filing records per symbol (up to 25 rows shown).
sec_filings.groupby('symbol').count().show(25)

In [None]:
# Preview the first 10 filing records.
sec_filings.show(10)

In [None]:
# Write the filings as CSV with a header row. repartition(1) collapses the data
# to a single partition before writing. mode("overwrite") lets the notebook be
# re-run: the default save mode raises if the target path already exists.
sec_filings.repartition(1).write.mode("overwrite").option("header",True)\
                                .csv('data/sec_filings.csv')