In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
import re
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Getting data from Web API

Set parameters

In [3]:
#https://dstock.vndirect.com.vn/lich-su-gia/TCB
ticker = 'TCB'
start_date = '2020-01-01'
end_date = dt.datetime.strftime(dt.datetime.today(), '%Y-%m-%d')
delta = dt.datetime.strptime(end_date, '%Y-%m-%d') - dt.datetime.strptime(start_date, '%Y-%m-%d')

Web API

In [4]:
price_api = f'https://finfo-api.vndirect.com.vn/v4/stock_prices?sort=date&q=code:{ticker}~date:gte:{start_date}~date:lte:{end_date}&size={delta.days+1}&page=1'

Get data API

In [5]:
HEADERS = {'User-Agent': 'Mozilla'}
res = requests.get(price_api, headers=HEADERS)
res

<Response [200]>

In [6]:
res.text[:500]

'{"data":[{"code":"TCB","date":"2023-06-30","time":"15:06:02","floor":"HOSE","type":"STOCK","basicPrice":32.65,"ceilingPrice":34.9,"floorPrice":30.4,"open":32.65,"high":32.8,"low":32.35,"close":32.35,"average":32.54,"adOpen":32.65,"adHigh":32.8,"adLow":32.35,"adClose":32.35,"adAverage":32.54,"nmVolume":2448500.0,"nmValue":7.967971E10,"ptVolume":314749.0,"ptValue":1.03624178E10,"change":-0.3,"adChange":-0.3,"pctChange":-0.9188},{"code":"TCB","date":"2023-06-29","time":"15:06:02","floor":"HOSE","ty'

In [7]:
data = res.json()['data']  
data[:500]

[{'code': 'TCB',
  'date': '2023-06-30',
  'time': '15:06:02',
  'floor': 'HOSE',
  'type': 'STOCK',
  'basicPrice': 32.65,
  'ceilingPrice': 34.9,
  'floorPrice': 30.4,
  'open': 32.65,
  'high': 32.8,
  'low': 32.35,
  'close': 32.35,
  'average': 32.54,
  'adOpen': 32.65,
  'adHigh': 32.8,
  'adLow': 32.35,
  'adClose': 32.35,
  'adAverage': 32.54,
  'nmVolume': 2448500.0,
  'nmValue': 79679710000.0,
  'ptVolume': 314749.0,
  'ptValue': 10362417800.0,
  'change': -0.3,
  'adChange': -0.3,
  'pctChange': -0.9188},
 {'code': 'TCB',
  'date': '2023-06-29',
  'time': '15:06:02',
  'floor': 'HOSE',
  'type': 'STOCK',
  'basicPrice': 33.3,
  'ceilingPrice': 35.6,
  'floorPrice': 31.0,
  'open': 33.3,
  'high': 33.4,
  'low': 32.65,
  'close': 32.65,
  'average': 32.86,
  'adOpen': 33.3,
  'adHigh': 33.4,
  'adLow': 32.65,
  'adClose': 32.65,
  'adAverage': 32.86,
  'nmVolume': 3538900.0,
  'nmValue': 116299680000.0,
  'ptVolume': 373283.0,
  'ptValue': 12345292600.0,
  'change': -0.65,
  

Transform Json to Pandas DataFrame

In [8]:
data = pd.DataFrame(data)
data.head()

Unnamed: 0,code,date,time,floor,type,basicPrice,ceilingPrice,floorPrice,open,high,...,adLow,adClose,adAverage,nmVolume,nmValue,ptVolume,ptValue,change,adChange,pctChange
0,TCB,2023-06-30,15:06:02,HOSE,STOCK,32.65,34.9,30.4,32.65,32.8,...,32.35,32.35,32.54,2448500.0,79679710000.0,314749.0,10362420000.0,-0.3,-0.3,-0.9188
1,TCB,2023-06-29,15:06:02,HOSE,STOCK,33.3,35.6,31.0,33.3,33.4,...,32.65,32.65,32.86,3538900.0,116299700000.0,373283.0,12345290000.0,-0.65,-0.65,-1.952
2,TCB,2023-06-28,15:06:02,HOSE,STOCK,32.95,35.25,30.65,32.95,33.3,...,32.75,33.3,32.99,4303400.0,141951100000.0,351432.0,11426110000.0,0.35,0.35,1.0622
3,TCB,2023-06-27,15:06:03,HOSE,STOCK,33.3,35.6,31.0,33.3,33.4,...,32.9,32.95,33.08,3665500.0,121267800000.0,348583.0,11465970000.0,-0.35,-0.35,-1.0511
4,TCB,2023-06-26,15:06:02,HOSE,STOCK,32.9,35.2,30.6,32.9,33.3,...,32.55,33.3,32.85,4044900.0,132887100000.0,251432.0,8096110000.0,0.4,0.4,1.2158


In [9]:
data.columns

Index(['code', 'date', 'time', 'floor', 'type', 'basicPrice', 'ceilingPrice',
       'floorPrice', 'open', 'high', 'low', 'close', 'average', 'adOpen',
       'adHigh', 'adLow', 'adClose', 'adAverage', 'nmVolume', 'nmValue',
       'ptVolume', 'ptValue', 'change', 'adChange', 'pctChange'],
      dtype='object')

In [10]:
data_clean = data[['code', 'date', 'floor', 'close','nmVolume', 'nmValue', 'change', 'pctChange']]
data_clean['date'] = pd.to_datetime(data_clean['date'])
data_clean.set_index('code',inplace=True)
data_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['date'] = pd.to_datetime(data_clean['date'])


Unnamed: 0_level_0,date,floor,close,nmVolume,nmValue,change,pctChange
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TCB,2023-06-30,HOSE,32.35,2448500.0,7.967971e+10,-0.30,-0.9188
TCB,2023-06-29,HOSE,32.65,3538900.0,1.162997e+11,-0.65,-1.9520
TCB,2023-06-28,HOSE,33.30,4303400.0,1.419511e+11,0.35,1.0622
TCB,2023-06-27,HOSE,32.95,3665500.0,1.212678e+11,-0.35,-1.0511
TCB,2023-06-26,HOSE,33.30,4044900.0,1.328871e+11,0.40,1.2158
...,...,...,...,...,...,...,...
TCB,2020-01-08,HOSE,22.75,1710390.0,3.910131e+10,-0.45,-1.9397
TCB,2020-01-07,HOSE,23.20,1043300.0,2.411261e+10,0.15,0.6508
TCB,2020-01-06,HOSE,23.05,1162990.0,2.701257e+10,-0.60,-2.5370
TCB,2020-01-03,HOSE,23.65,756700.0,1.798952e+10,-0.15,-0.6303


In [11]:
data_clean.describe()

Unnamed: 0,close,nmVolume,nmValue,change,pctChange
count,872.0,872.0,872.0,872.0,872.0
mean,34.497248,8892899.0,348080000000.0,0.010321,0.064785
std,11.878072,7997155.0,376917800000.0,0.803531,2.360123
min,14.9,546010.0,9507343000.0,-4.0,-6.9966
25%,23.275,2950662.0,76085320000.0,-0.3,-0.93965
50%,32.675,6060250.0,207767000000.0,0.0,0.0
75%,48.5,13009400.0,497726200000.0,0.4,1.2646
max,58.0,58235500.0,3115112000000.0,3.7,6.9565


In [12]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 872 entries, TCB to TCB
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       872 non-null    datetime64[ns]
 1   floor      872 non-null    object        
 2   close      872 non-null    float64       
 3   nmVolume   872 non-null    float64       
 4   nmValue    872 non-null    float64       
 5   change     872 non-null    float64       
 6   pctChange  872 non-null    float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 54.5+ KB


In [13]:
data_clean.isnull().sum()

date         0
floor        0
close        0
nmVolume     0
nmValue      0
change       0
pctChange    0
dtype: int64

In [14]:
def make_graph(stock_data):
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing = .05) #, subplot_titles=("Historical Price", "Historical Volume")
    fig.add_trace(go.Scatter(x=data_clean.date, y=data_clean.close.astype("float"), name="Price"), row=1, col=1)
    fig.add_trace(go.Bar(x=data_clean.date, y=data_clean.nmVolume.astype("float"), name="Volume"), row=2, col=1)
#     fig.update_xaxes(title_text="Date", rangeslider= {'visible':False},row=1, col=1)
    fig.update_xaxes(title_text="Date",rangeslider= {'visible':True}, row=2, col=1)
#     fig.update_yaxes(title_text="Price", row=1, col=1)
#     fig.update_yaxes(title_text="Volume", row=2, col=1)
    fig.update_layout(showlegend=True,
    height=900,
    title='stock',
    xaxis_rangeslider_visible=False)
    fig.show()

In [15]:
make_graph(data_clean)

## Working with AWS S3

In [16]:
import boto3
from io import StringIO, BytesIO

In [17]:
# key_file  = pd.read_csv('D:/new_user_credentials.csv')
# ACCESS_KEY = key_file['Access key ID']
# SECRET_KEY = key_file['Secret access key']

In [18]:
# client = boto3.client(
# 's3',
# aws_access_key_id=ACCESS_KEY,
# aws_secret_access_key=SECRET_KEY)

In [19]:
bucket_name = 'stock-data-cuongnm'
# s3 = boto3.resource(
#     's3',
#     region_name='ap-southeast-1',
#     aws_access_key_id=ACCESS_KEY,
#     aws_secret_access_key=SECRET_KEY
# )
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

In [20]:
for obj in bucket.objects.all():
    print(obj.key)

NoCredentialsError: Unable to locate credentials

In [21]:
print(bucket.objects.all())

s3.Bucket.objectsCollection(s3.Bucket(name='stock-data-cuongnm'), s3.ObjectSummary)


### Write file to S3

In [22]:
key = f'{ticker}_stock_data_from_{start_date}_to_{end_date}.csv' #.parquet
key

'TCB_stock_data_from_2020-01-01_to_2023-07-02.csv'

In [23]:
out_buffer = StringIO() #BytesIO , StringIO
data_clean.to_csv(out_buffer) #to_parquet
bucket.put_object(Body=out_buffer.getvalue(), Key=key)

NoCredentialsError: Unable to locate credentials

### Read file from S3

In [53]:
bucket_obj = bucket.objects.filter(Prefix = 'VND')
objects = [obj for obj in bucket_obj]
objects

[s3.ObjectSummary(bucket_name='stock-data-cuongnm', key='VND_stock_data_from_2020-01-01_to_2022-04-10.csv')]

In [55]:
for obj in bucket.objects.all():
    print(obj.key)

TCB_stock_data_from_2020-01-01_to_2022-04-10.csv
VND_stock_data_from_2020-01-01_to_2022-04-10.csv


In [56]:
csv_obj = bucket.Object(key='TCB_stock_data_from_2020-01-01_to_2022-04-10.csv').get().get('Body').read().decode('utf-8')

In [58]:
csv_obj[:100]

'code,date,floor,close,nmVolume,nmValue,change,pctChange\r\nTCB,2022-04-08,HOSE,48.85,7030100.0,3443611'

In [59]:
data = StringIO(csv_obj)
df = pd.read_csv(data,delimiter = ',')

In [61]:
df.head()

Unnamed: 0,code,date,floor,close,nmVolume,nmValue,change,pctChange
0,TCB,2022-04-08,HOSE,48.85,7030100.0,344361100000.0,-0.45,-0.9128
1,TCB,2022-04-07,HOSE,49.3,5971000.0,297645900000.0,-0.6,-1.2024
2,TCB,2022-04-06,HOSE,49.9,9012200.0,443282200000.0,0.85,1.7329
3,TCB,2022-04-05,HOSE,49.05,5803000.0,285601600000.0,-0.6,-1.2085
4,TCB,2022-04-04,HOSE,49.65,9775600.0,488163100000.0,-0.65,-1.2922


Create .py file to schedule with AWS Lambda

In [None]:
import pandas as pd
import requests
import datetime as dt
import re
import json

import boto3
from io import StringIO, BytesIO

def lambda_handler(event, context,ticker = 'TCB',start_date = '2020-01-01',end_date = dt.datetime.strftime(dt.datetime.today(), '%Y-%m-%d')):

    #https://dstock.vndirect.com.vn/lich-su-gia/TCB
    # ticker = 'TCB'
    # start_date = '2020-01-01'
    # end_date = dt.datetime.strftime(dt.datetime.today(), '%Y-%m-%d')
    delta = dt.datetime.strptime(end_date, '%Y-%m-%d') - dt.datetime.strptime(start_date, '%Y-%m-%d')


    price_api = f'https://finfo-api.vndirect.com.vn/v4/stock_prices?sort=date&q=code:{ticker}~date:gte:{start_date}~date:lte:{end_date}&size={delta.days+1}&page=1'

    HEADERS = {'User-Agent': 'Mozilla'}
    res = requests.get(price_api, headers=HEADERS)
    data = res.json()['data']  
    data = pd.DataFrame(data)
    data['date_clean'] = pd.to_datetime(data['date'])

    data_clean = data[['code', 'date_clean', 'floor', 'close','nmVolume', 'nmValue', 'change', 'pctChange']]
    # data_clean['date'] = pd.to_datetime(data_clean['date'])
    data_clean.set_index('code',inplace=True)

    bucket_name = 'stock-data-cuongnm'
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)

    key = f'{ticker}_stock_data_from_{start_date}_to_{end_date}.csv' #.parquet

    out_buffer = StringIO() #BytesIO , StringIO
    data_clean.to_csv(out_buffer) #to_parquet
    bucket.put_object(Body=out_buffer.getvalue(), Key=key)

    return {
        'statusCode': 200,
        'body': json.dumps('file is created in:'+key)
    }



https://ap-southeast-1.console.aws.amazon.com/lambda/home?region=ap-southeast-1#/functions/get_stock_data?tab=code

https://ap-southeast-1.console.aws.amazon.com/events/home?region=ap-southeast-1#/rules

https://s3.console.aws.amazon.com/s3/buckets/stock-data-cuongnm?region=ap-southeast-1&tab=objects