In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
import env

# import pandas_datareader.data as web
# import yfinance as yf

## Acquire Train Dataset Provided by Jane Street Market Prediction

In [None]:
%%time

# Acuqire the first 10_000 rows of the train dataset
df_train_sample = pd.read_csv("Database/train.csv", nrows=10000)

# Print the shape of the sample
df_train_sample.shape

In [None]:
# Show the two leading rows of the sample
df_train_sample.head(n=2)

In [None]:
# Show the two trailing rows of the sample
df_train_sample.tail(n=2)

In [None]:
# Display the dtype pandas inferred for each column of the sample
df_train_sample.dtypes

In [None]:
# Check whether the train dataset has a column named 'action' (evaluates to True/False)
"action" in df_train_sample.columns

**Takeaways**
1. `date` represents the day of the trade. 
2. `weight` and `resp` together represent a return on the trade.
3. `ts_id` represents a time ordering.
4. `resp` value as well as several other `resp_{1,2,3,4}` values represent returns over different time horizons. 
5. However, the target variable `action` is not provided in the training dataset. 

### How to define the target variable `action`?
1. Simply use `resp`: perform the trade if it is positive.
2. Use the product of `weight` and `resp`: perform the trade if it is positive. 

In [None]:
%%time

# Acquire the entire train dataset
df_train = pd.read_csv("Database/train.csv")

# Print the memory usage
df_train.info()

**Takeaways**: To save memory, cast the float64 columns to float32. float32 keeps only about 7 significant decimal digits, so confirm that this precision is sufficient for the features.

In [None]:
%%time

# Create an iterator containing the names of the float64 columns
colsf64 = df_train.select_dtypes(include='float64').columns
colsf64

In [None]:
# Map every float64 column label to the float32 target dtype
mapperf32 = dict.fromkeys(colsf64, np.float32)

In [None]:
%%time

# Cast the float64 columns to float32
df_train = df_train.astype(mapperf32)

# Print the memory usage 
df_train.info()

In [None]:
%%time

# Create an iterator containing the names of the int64 columns
colsi64 = df_train.select_dtypes(include='int64').columns
colsi64

In [None]:
# Show the five trailing rows of the train dataset
df_train.tail(n=5)

In [None]:
# Can a narrower integer type hold ts_id exactly?
# For integers the question is range rather than precision: a value is stored
# exactly as long as it lies between the dtype's minimum and maximum.
# The limits are compared via np.iinfo instead of constructing np.int16(2390490),
# because building an out-of-range NumPy integer wraps around with a deprecation
# warning on NumPy 1.24+ and raises OverflowError on NumPy 2.

ts_id_max = 2390490  # presumably the largest ts_id in the train dataset -- TODO confirm

# One (dtype name, bytes per value, holds ts_id_max?) tuple per candidate dtype
int_dtype_report = []
print('dtype\t bytes\t holds', ts_id_max)
for int_dtype in (np.int64, np.int32, np.int16):
    limits = np.iinfo(int_dtype)
    fits = limits.min <= ts_id_max <= limits.max
    width = np.dtype(int_dtype).itemsize
    int_dtype_report.append((np.dtype(int_dtype).name, width, fits))
    print(np.dtype(int_dtype).name, '\t', width, '\t', fits)

**Takeaways**: Using int32 doesn't change the precision of the values in column ts_id.

In [None]:
# Map every int64 column label to the int32 target dtype
mapperi32 = dict.fromkeys(colsi64, np.int32)

In [None]:
%%time

# Cast the int64 columns to int32
df_train = df_train.astype(mapperi32)

# Print the memory usage 
df_train.info()

**Takeaways**: The memory usage drops from 2.5 GB to 1.2 GB after changing the data types. 

## Acquire data from Polygon API

In [19]:
# Make the HTTP request

polygon = env.polygon  # Polygon API key, read from the local env module
url = 'https://api.polygon.io/v2/aggs/ticker/AAPL/range/1/day/2021-01-14/2021-01-15'

# Send the key as the `apiKey` query parameter instead of concatenating it into
# `url`: the request is the same, but displaying `url` no longer reveals the key.
# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, params={'apiKey': polygon}, timeout=30)

# Stop here on an HTTP error status rather than failing later on a missing key
response.raise_for_status()
response

<Response [200]>

**Break Down Polygon API**
1. Base url: 'https://api.polygon.io'
2. API version: '/v2'
3. Relative URL for the specific API

In [20]:
# Decode the JSON body, then show its 'results' entry (a list of dictionaries)
payload = response.json()
payload['results']

[{'v': 91382447.0,
  'vw': 129.7361,
  'o': 130.8,
  'c': 128.91,
  'h': 131,
  'l': 128.76,
  't': 1610600400000}]

## Acquire the stock data from Yahoo finance
- yf.Ticker().history()
- yf.download
- pandas data reader

**Through `yf.Ticker().history()`**

In [None]:
# Get the Walmart stock information
# NOTE(review): `yf` is only bound if `import yfinance as yf` has run; that
# import is commented out in the imports cell, so this fails on a fresh kernel.

wmt = yf.Ticker("WMT") # wmt is a Ticker object, not a dictionary
wmt # Display the Ticker object

In [None]:
# Fetch the market data for the most recent 5-day period and preview it

hist = wmt.history(period='5d')
hist.head(n=5)

**Takeaways**
- The dtype of the index is datetime.
- The result doesn't include the adjusted close price.
- `yf.Ticker` doesn't support fetching data for multiple tickers.

**Fetching data for multiple tickers using `yf.download()`**

In [None]:
# Create a string of multiple tickers
tickers = 'AAPL WMT TSLA GE AMZN DB'

# Acquire the adjusted closing price.
# auto_adjust=False is passed explicitly because the 'Adj Close' column is only
# returned when prices are not auto-adjusted, and the default of this option
# differs between yfinance releases. start/end are passed by keyword so the
# date range does not depend on positional parameter order.

data = yf.download(
    tickers,
    start='2020-12-01',
    end='2020-12-04',
    auto_adjust=False,
)['Adj Close']
data

**Takeaways**
- The order of the stocks changes to alphabetical order.
- The last day in the downloaded data is the day before the end date passed to `yf.download`.  

**Use pandas datareader to read stock data from yahoo finance**

In [None]:
# Tickers of the stocks of interest
stocks = ['AAPL', 'WMT', 'TSLA', 'GE', 'AMZN', 'DB']

# Date range of the query
start_date, end_date = '2020-12-01', '2020-12-03'

# Query Yahoo through pandas-datareader and keep only the 'Adj Close' prices
data = web.DataReader(
    stocks,
    data_source='yahoo',
    start=start_date,
    end=end_date,
)['Adj Close']
data

In [None]:
# Put the columns in the same order as `stocks`.
# The columns are selected by ticker label first: assigning `stocks` directly
# to `data.columns` relabels by position, which would silently attach the wrong
# ticker to a price series if the returned column order differed from `stocks`.
# A ticker missing from the result now raises a KeyError instead.

data = data[stocks]

# Reassign the (now identically ordered) labels as a plain list, as before
data.columns = stocks
data

**Takeaways**
- It takes a little longer than `yf.download`.
- The order of the stocks remains the same in the dataframe. 

**Build the helper function to fetch the data**

### Acquire metadata about the anonymized features

In [None]:
# Read the feature metadata file and summarize its columns
df_features = pd.read_csv(filepath_or_buffer="Database/features.csv")
df_features.info()

In [None]:
# Show the five leading rows of the feature metadata
df_features.head(n=5)