<a href="https://colab.research.google.com/github/anagh07/stock_price_predictor/blob/colab/data_preparation_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import *
import requests

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 63.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=1194d77502c77090472daf6f5d047230c8a20846bbeca1816fbd731253c9adb9
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
def init_spark():
    """Return a SparkSession, reusing the active one if it already exists.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()``.
    """
    # The config key/value pair below is passed through unchanged from the original setup.
    builder = SparkSession.builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

# Data Preparation

### Dataset 1: NASDAQ & NYSE Stock Histories

In [3]:
# Create (or reuse) the SparkSession that the cells below use for all DataFrame work.
spark = init_spark()

In [4]:
# Ticker symbols whose price histories and SEC filings are collected below (25 entries).
# NOTE(review): confirm 'HP' is the intended symbol — Hewlett-Packard trades as 'HPQ'.
companies = [
    'NVDA', 'GOOG', 'AAPL', 'AMZN', 'FB',
    'MSFT', 'TXN', 'AVGO', 'CSCO', 'ADBE',
    'ORCL', 'CRM', 'QCOM', 'NFLX', 'BKNG',
    'EXPE', 'INTC', 'INTU', 'FTNT', 'IBM',
    'EBAY', 'HP', 'ADSK', 'EA', 'AMD',
]

In [5]:
# Column layout for the combined stock-history frame: the ticker symbol, the
# trading date, and six float-typed numeric columns. Every field is nullable.
_stock_float_columns = ['volume', 'open', 'close', 'high', 'low', 'adjclose']
schema = StructType(
    [
        StructField('stock', StringType(), True),
        StructField('date', DateType(), True),
    ]
    + [StructField(column, FloatType(), True) for column in _stock_float_columns]
)

In [6]:
# Mount Google Drive at /content/drive; the dataset paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd drive/MyDrive/stocks_kaggle_dataset1/
%cd full_history/

/content/drive/MyDrive/stocks_kaggle_dataset1
/content/drive/MyDrive/stocks_kaggle_dataset1/full_history


In [8]:
# Start from an empty frame carrying the declared column layout, then append each
# ticker's CSV, tagging every row with the ticker it came from.
# NOTE(review): the CSVs are read without a schema or inferSchema, so their columns
# load as strings — confirm the unioned frame ends up with the intended types.
stock_histories = spark.createDataFrame([], schema)
for company in companies:
    csv_path = f'/content/drive/MyDrive/stocks_kaggle_dataset1/full_history/{company}.csv'
    df = spark.read.csv(csv_path, header=True).withColumn("stock", lit(company))
    stock_histories = stock_histories.unionByName(df)

In [9]:
# Total number of rows across all loaded tickers.
stock_histories.count()

164765

In [None]:
# Row count per ticker; show(25) prints up to 25 rows, matching the 25 entries in `companies`.
stock_histories.groupby("stock").count().show(25)

+-----+-----+
|stock|count|
+-----+-----+
| NVDA| 4980|
| GOOG| 3579|
| AAPL| 9556|
| AMZN| 5405|
|   FB| 1627|
| MSFT| 8231|
|  TXN|11711|
| AVGO| 2329|
| CSCO| 7236|
| ADBE| 8125|
| ORCL| 8232|
|  CRM| 3619|
| QCOM| 6775|
| NFLX| 4143|
| BKNG| 4933|
| EXPE| 3347|
| INTC| 9745|
| INTU| 6461|
| FTNT| 2256|
|  IBM|12323|
| EBAY| 5062|
|   HP| 9597|
| ADSK| 8408|
|   EA| 7340|
|  AMD| 9745|
+-----+-----+



In [10]:
# Remove the `adjclose` column from the combined frame, then preview the first 10 rows.
stock_histories = stock_histories.drop("adjclose")
stock_histories.show(10)

+-----+----------+--------+------------------+------------------+------------------+------------------+
|stock|      date|  volume|              open|             close|              high|               low|
+-----+----------+--------+------------------+------------------+------------------+------------------+
| NVDA|2018-11-02|11320900|217.72999572753906| 214.9199981689453|             222.0| 210.2100067138672|
| NVDA|2018-11-01|14163200| 212.3000030517578|218.11000061035156|218.49000549316406|207.19000244140625|
| NVDA|2018-10-31|18644300|209.64999389648438| 210.8300018310547|212.58999633789062|204.00999450683594|
| NVDA|2018-10-30|20179800| 186.5500030517578|             203.0|203.39999389648438| 185.6199951171875|
| NVDA|2018-10-29|18950400|203.99000549316406| 185.6199951171875| 204.1300048828125|176.00999450683594|
| NVDA|2018-10-26|16619600|198.30999755859375| 198.2899932861328|204.83999633789062| 193.1199951171875|
| NVDA|2018-10-25|23793000|195.47000122070312|207.83999633789062

In [None]:
# Optional: keep only rows dated after 2010-01-01. Left disabled, so the full history is retained.
# stock_histories = stock_histories.filter(stock_histories.date > "2010-01-01")

In [11]:
# Total row count again, after `adjclose` was dropped (dropping a column does not remove rows).
stock_histories.count()

164765

In [None]:
# Per-ticker row counts again, for comparison with the earlier breakdown.
stock_histories.groupby("stock").count().show(25)

+-----+-----+
|stock|count|
+-----+-----+
| NVDA| 4980|
| GOOG| 3579|
| AAPL| 9556|
| AMZN| 5405|
|   FB| 1627|
| MSFT| 8231|
|  TXN|11711|
| AVGO| 2329|
| CSCO| 7236|
| ADBE| 8125|
| ORCL| 8232|
|  CRM| 3619|
| QCOM| 6775|
| NFLX| 4143|
| BKNG| 4933|
| EXPE| 3347|
| INTC| 9745|
| INTU| 6461|
| FTNT| 2256|
|  IBM|12323|
| EBAY| 5062|
|   HP| 9597|
| ADSK| 8408|
|   EA| 7340|
|  AMD| 9745|
+-----+-----+



In [12]:
# Collapse to a single partition before writing, and include a header row.
# NOTE(review): Spark's default save mode errors if the target path already
# exists, so re-running this cell may fail — confirm whether that is intended.
(
    stock_histories
    .repartition(1)
    .write
    .option("header", True)
    .csv('/content/drive/MyDrive/data/stock_histories.csv')
)

### Dataset 2: SEC filings

In [None]:
# Maps each canonical income-statement metric to the label variants accepted for
# it; filter_financial_report matches a filing entry's 'label' against these lists.
report_keys = {
    'Total operating expenses': ['Total operating expenses',
                                 'Total costs and expenses',
                                 'Costs and expenses',
                                 'Operating expenses'],
    'Gross profit': ['Gross profit', 'Gross margin', 'GROSS MARGIN', 'Total gross profit'],
    # The comma after 'OPERATING INCOME' matters: without it Python concatenates the
    # two adjacent literals into one string and neither label can ever match.
    'Income from operations': ['Income from operations', 'Operating income', 'OPERATING INCOME',
                               'Income from continuing operations', 'Operating income (loss)'],
    'Net income': ['Net income', 'NET INCOME', 'Net income (loss)'],
    'Cost of revenue': ['Cost of revenue', 'Cost of revenues', 'Cost of sales',
                        'Total cost of sales', 'Total cost of revenues', 'Cost of net revenues']
}

In [None]:
def filter_financial_report(report):
    """Pick the income-statement values of interest out of one filing report.

    Scans the entries under ``report['ic']`` and, for each metric in the
    module-level ``report_keys`` mapping, keeps the ``value`` of an entry whose
    ``label`` is one of that metric's accepted variants. When several entries
    match the same metric, the last one in the list wins.

    Args:
        report: mapping with an ``'ic'`` list of dicts that carry ``'label'``
            and ``'value'`` keys.

    Returns:
        Dict keyed by canonical metric name; metrics with no matching entry
        are absent.
    """
    selected = {}
    for metric, accepted_labels in report_keys.items():
        matches = [entry['value'] for entry in report['ic'] if entry['label'] in accepted_labels]
        if matches:
            selected[metric] = matches[-1]
    return selected

In [None]:
def validity_check(report, data_point):
    """Return ``report[data_point]`` as an int, or None when it is unusable.

    Args:
        report: dict of metric name -> raw value.
        data_point: metric name to look up.

    Returns:
        The value converted with ``int()``, or None if the key is missing or
        the value cannot be converted (e.g. None, non-numeric text, NaN, inf).
    """
    if data_point not in report:
        return None
    try:
        return int(report[data_point])
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no value"; unlike a bare
        # except, this does not swallow KeyboardInterrupt or SystemExit.
        return None

In [None]:
def clean_financial_statements(response):
    """Flatten an API response into one record per filing.

    Args:
        response: object whose ``.json()`` returns a dict with a ``'data'`` list;
            each item carries ``'symbol'``, ``'year'``, ``'quarter'`` and ``'report'``.

    Returns:
        List of dicts with the filing's symbol/year/quarter plus five metric
        fields, each an int or None as produced by ``validity_check``.
    """
    # (output column, canonical metric name used by filter_financial_report)
    metric_columns = (
        ('total_operating_expenses', 'Total operating expenses'),
        ('gross_profit', 'Gross profit'),
        ('operating_income', 'Income from operations'),
        ('net_income', 'Net income'),
        ('cost_of_revenue', 'Cost of revenue'),
    )
    statements = []
    for filing in response.json()['data']:
        report = filter_financial_report(filing['report'])
        row = {field: filing[field] for field in ('symbol', 'year', 'quarter')}
        row.update((column, validity_check(report, metric)) for column, metric in metric_columns)
        statements.append(row)
    return statements

In [None]:
def get_sec_filings(symbol):
    """Fetch and clean the quarterly as-reported financials for one ticker from Finnhub.

    The API token is read from the ``FINNHUB_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Args:
        symbol: ticker symbol to query.

    Returns:
        The list of cleaned filing records produced by ``clean_financial_statements``.

    Raises:
        RuntimeError: if ``FINNHUB_API_KEY`` is not set.
        requests.HTTPError: if the API answers with an error status.
    """
    import os  # local import keeps this cell self-contained

    api_key = os.environ.get('FINNHUB_API_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the FINNHUB_API_KEY environment variable to your Finnhub API token '
            'before calling get_sec_filings.'
        )
    response = requests.get(
        'https://finnhub.io/api/v1/stock/financials-reported',
        params={'symbol': symbol, 'freq': 'quarterly'},
        headers={'X-Finnhub-Token': api_key},
        timeout=30,  # avoid hanging the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors (e.g. rate limiting) instead of parsing an error payload.
    response.raise_for_status()
    return clean_financial_statements(response)

In [None]:
# Collect the cleaned filing records of every ticker into one flat list,
# printing a progress line after each ticker completes.
all_sec_filings = []
for company in companies:
    all_sec_filings += get_sec_filings(company)
    print(f'Completed extracting {company}')

In [None]:
# Peek at the first cleaned filing record to sanity-check its fields.
all_sec_filings[0]

In [None]:
# Column layout for the SEC-filings frame: filing identity (symbol, year, quarter)
# followed by five long-typed metric columns. Every field is nullable.
# Note: this rebinds the name `schema` that the stock-history cells used earlier.
_sec_metric_columns = [
    "total_operating_expenses",
    "gross_profit",
    "operating_income",
    "net_income",
    "cost_of_revenue",
]
schema = StructType(
    [
        StructField("symbol", StringType(), True),
        StructField("year", IntegerType(), True),
        StructField("quarter", IntegerType(), True),
    ]
    + [StructField(column, LongType(), True) for column in _sec_metric_columns]
)

In [None]:
# Build a Spark DataFrame from the list of per-filing dicts, using the SEC-filings schema.
sec_filings = spark.createDataFrame(all_sec_filings, schema=schema)

In [None]:
# Total number of filing records across all tickers.
sec_filings.count()

In [None]:
# Filing count per ticker; show(25) prints up to 25 rows, matching the 25 entries in `companies`.
sec_filings.groupby('symbol').count().show(25)

In [None]:
# Preview the first 10 filing rows.
sec_filings.show(10)

In [None]:
# Collapse to a single partition before writing, and include a header row.
# The target is an absolute Drive path, alongside stock_histories.csv: the previous
# relative 'data/...' path resolved against whatever directory an earlier %cd left
# the kernel in, so the output location depended on execution history.
(
    sec_filings
    .repartition(1)
    .write
    .option("header", True)
    .csv('/content/drive/MyDrive/data/sec_filings.csv')
)