In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
import re
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Getting data from Web API

Set parameters

In [36]:
#https://dstock.vndirect.com.vn/lich-su-gia/TCB
ticker = 'TCB'
start_date = '2020-01-01'
end_date = dt.datetime.strftime(dt.datetime.today(), '%Y-%m-%d')
delta = dt.datetime.strptime(end_date, '%Y-%m-%d') - dt.datetime.strptime(start_date, '%Y-%m-%d')

Web API

In [37]:
price_api = f'https://finfo-api.vndirect.com.vn/v4/stock_prices?sort=date&q=code:{ticker}~date:gte:{start_date}~date:lte:{end_date}&size={delta.days+1}&page=1'

Get data API

In [38]:
HEADERS = {'User-Agent': 'Mozilla'}
res = requests.get(price_api, headers=HEADERS)
res

<Response [200]>

In [39]:
res.text[:500]

'{"data":[{"code":"VND","date":"2022-04-08","time":"15:04:02","floor":"HOSE","type":"STOCK","basicPrice":35.0,"ceilingPrice":37.45,"floorPrice":32.55,"open":35.0,"high":36.0,"low":34.55,"close":35.1,"average":35.16,"adOpen":35.0,"adHigh":36.0,"adLow":34.55,"adClose":35.1,"adAverage":35.16,"nmVolume":1.79893E7,"nmValue":6.3251999E11,"ptVolume":0.0,"ptValue":0.0,"change":0.1,"adChange":0.1,"pctChange":0.2857},{"code":"VND","date":"2022-04-07","time":"15:04:01","floor":"HOSE","type":"STOCK","basicPr'

In [40]:
data = res.json()['data']  
data

[{'code': 'VND',
  'date': '2022-04-08',
  'time': '15:04:02',
  'floor': 'HOSE',
  'type': 'STOCK',
  'basicPrice': 35.0,
  'ceilingPrice': 37.45,
  'floorPrice': 32.55,
  'open': 35.0,
  'high': 36.0,
  'low': 34.55,
  'close': 35.1,
  'average': 35.16,
  'adOpen': 35.0,
  'adHigh': 36.0,
  'adLow': 34.55,
  'adClose': 35.1,
  'adAverage': 35.16,
  'nmVolume': 17989300.0,
  'nmValue': 632519990000.0,
  'ptVolume': 0.0,
  'ptValue': 0.0,
  'change': 0.1,
  'adChange': 0.1,
  'pctChange': 0.2857},
 {'code': 'VND',
  'date': '2022-04-07',
  'time': '15:04:01',
  'floor': 'HOSE',
  'type': 'STOCK',
  'basicPrice': 35.5,
  'ceilingPrice': 37.95,
  'floorPrice': 33.05,
  'open': 35.5,
  'high': 37.0,
  'low': 35.0,
  'close': 35.0,
  'average': 36.02,
  'adOpen': 35.5,
  'adHigh': 37.0,
  'adLow': 35.0,
  'adClose': 35.0,
  'adAverage': 36.02,
  'nmVolume': 23236900.0,
  'nmValue': 836968205000.0,
  'ptVolume': 0.0,
  'ptValue': 0.0,
  'change': -0.5,
  'adChange': -0.5,
  'pctChange': -1.

Transform Json to Pandas DataFrame

In [41]:
data = pd.DataFrame(data)
data.head()

Unnamed: 0,code,date,time,floor,type,basicPrice,ceilingPrice,floorPrice,open,high,...,adLow,adClose,adAverage,nmVolume,nmValue,ptVolume,ptValue,change,adChange,pctChange
0,VND,2022-04-08,15:04:02,HOSE,STOCK,35.0,37.45,32.55,35.0,36.0,...,34.55,35.1,35.16,17989300.0,632520000000.0,0.0,0.0,0.1,0.1,0.2857
1,VND,2022-04-07,15:04:01,HOSE,STOCK,35.5,37.95,33.05,35.5,37.0,...,35.0,35.0,36.02,23236900.0,836968200000.0,0.0,0.0,-0.5,-0.5,-1.4085
2,VND,2022-04-06,15:04:02,HOSE,STOCK,34.6,37.0,32.2,34.0,36.0,...,34.0,35.5,35.23,18821500.0,663019700000.0,0.0,0.0,0.9,0.9,2.6012
3,VND,2022-04-05,15:04:02,HOSE,STOCK,33.9,36.25,31.55,34.1,36.1,...,34.1,34.6,35.2,17035800.0,599641800000.0,0.0,0.0,0.7,0.7,2.0649
4,VND,2022-04-04,15:04:02,HOSE,STOCK,31.7,33.9,29.5,32.4,33.9,...,32.35,33.9,33.55,28162500.0,944774600000.0,0.0,0.0,2.2,2.2,6.9401


In [42]:
data.columns

Index(['code', 'date', 'time', 'floor', 'type', 'basicPrice', 'ceilingPrice',
       'floorPrice', 'open', 'high', 'low', 'close', 'average', 'adOpen',
       'adHigh', 'adLow', 'adClose', 'adAverage', 'nmVolume', 'nmValue',
       'ptVolume', 'ptValue', 'change', 'adChange', 'pctChange'],
      dtype='object')

In [43]:
data_clean = data[['code', 'date', 'floor', 'close','nmVolume', 'nmValue', 'change', 'pctChange']]
data_clean['date'] = pd.to_datetime(data_clean['date'])
data_clean.set_index('code',inplace=True)
data_clean



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,date,floor,close,nmVolume,nmValue,change,pctChange
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
VND,2022-04-08,HOSE,35.10,17989300.0,6.325200e+11,0.10,0.2857
VND,2022-04-07,HOSE,35.00,23236900.0,8.369682e+11,-0.50,-1.4085
VND,2022-04-06,HOSE,35.50,18821500.0,6.630197e+11,0.90,2.6012
VND,2022-04-05,HOSE,34.60,17035800.0,5.996418e+11,0.70,2.0649
VND,2022-04-04,HOSE,33.90,28162500.0,9.447746e+11,2.20,6.9401
...,...,...,...,...,...,...,...
VND,2020-01-08,HOSE,14.10,233880.0,3.267280e+09,-0.20,-1.3986
VND,2020-01-07,HOSE,14.30,68060.0,9.709350e+08,0.00,0.0000
VND,2020-01-06,HOSE,14.30,149130.0,2.127958e+09,-0.10,-0.6944
VND,2020-01-03,HOSE,14.40,163940.0,2.351222e+09,-0.05,-0.3460


In [44]:
data_clean.describe()

Unnamed: 0,close,nmVolume,nmValue,change,pctChange
count,560.0,560.0,560.0,560.0,560.0
mean,33.853661,4482693.0,208152500000.0,0.156518,0.460874
std,22.143207,4342875.0,240808400000.0,1.324064,3.005631
min,10.7,31640.0,438779500.0,-5.0,-9.8947
25%,13.65,809555.0,10590960000.0,-0.2625,-1.15355
50%,28.85,3757900.0,136920700000.0,0.05,0.1734
75%,50.6,6552900.0,333497300000.0,0.6,1.9706
max,85.2,32597700.0,1124438000000.0,5.5,9.9338


In [45]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 560 entries, VND to VND
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       560 non-null    datetime64[ns]
 1   floor      560 non-null    object        
 2   close      560 non-null    float64       
 3   nmVolume   560 non-null    float64       
 4   nmValue    560 non-null    float64       
 5   change     560 non-null    float64       
 6   pctChange  560 non-null    float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 35.0+ KB


In [46]:
data_clean.isnull().sum()

date         0
floor        0
close        0
nmVolume     0
nmValue      0
change       0
pctChange    0
dtype: int64

In [47]:
def make_graph(stock_data):
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing = .05) #, subplot_titles=("Historical Price", "Historical Volume")
    fig.add_trace(go.Scatter(x=data_clean.date, y=data_clean.close.astype("float"), name="Price"), row=1, col=1)
    fig.add_trace(go.Bar(x=data_clean.date, y=data_clean.nmVolume.astype("float"), name="Volume"), row=2, col=1)
#     fig.update_xaxes(title_text="Date", rangeslider= {'visible':False},row=1, col=1)
    fig.update_xaxes(title_text="Date",rangeslider= {'visible':True}, row=2, col=1)
#     fig.update_yaxes(title_text="Price", row=1, col=1)
#     fig.update_yaxes(title_text="Volume", row=2, col=1)
    fig.update_layout(showlegend=True,
    height=900,
    title='stock',
    xaxis_rangeslider_visible=False)
    fig.show()

In [48]:
make_graph(data_clean)

## Working with AWS S3

In [16]:
import boto3
from io import StringIO, BytesIO

In [17]:
# key_file  = pd.read_csv('D:/new_user_credentials.csv')
# ACCESS_KEY = key_file['Access key ID']
# SECRET_KEY = key_file['Secret access key']

In [18]:
# client = boto3.client(
# 's3',
# aws_access_key_id=ACCESS_KEY,
# aws_secret_access_key=SECRET_KEY)

In [27]:
bucket_name = 'stock-data-cuongnm'
# s3 = boto3.resource(
#     's3',
#     region_name='ap-southeast-1',
#     aws_access_key_id=ACCESS_KEY,
#     aws_secret_access_key=SECRET_KEY
# )
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

In [28]:
for obj in bucket.objects.all():
    print(obj.key)

TCB_stock_data_from_2020-01-01_to_2022-04-10.csv


In [21]:
print(bucket.objects.all())

s3.Bucket.objectsCollection(s3.Bucket(name='stock-data-cuongnm'), s3.ObjectSummary)


### Write file to S3

In [49]:
key = f'{ticker}_stock_data_from_{start_date}_to_{end_date}.csv' #.parquet
key

'VND_stock_data_from_2020-01-01_to_2022-04-10.csv'

In [50]:
out_buffer = StringIO() #BytesIO , StringIO
data_clean.to_csv(out_buffer) #to_parquet
bucket.put_object(Body=out_buffer.getvalue(), Key=key)

s3.Object(bucket_name='stock-data-cuongnm', key='VND_stock_data_from_2020-01-01_to_2022-04-10.csv')

### Read file from S3

In [53]:
bucket_obj = bucket.objects.filter(Prefix = 'VND')
objects = [obj for obj in bucket_obj]
objects

[s3.ObjectSummary(bucket_name='stock-data-cuongnm', key='VND_stock_data_from_2020-01-01_to_2022-04-10.csv')]

In [55]:
for obj in bucket.objects.all():
    print(obj.key)

TCB_stock_data_from_2020-01-01_to_2022-04-10.csv
VND_stock_data_from_2020-01-01_to_2022-04-10.csv


In [56]:
csv_obj = bucket.Object(key='TCB_stock_data_from_2020-01-01_to_2022-04-10.csv').get().get('Body').read().decode('utf-8')

In [58]:
csv_obj[:100]

'code,date,floor,close,nmVolume,nmValue,change,pctChange\r\nTCB,2022-04-08,HOSE,48.85,7030100.0,3443611'

In [59]:
data = StringIO(csv_obj)
df = pd.read_csv(data,delimiter = ',')

In [61]:
df.head()

Unnamed: 0,code,date,floor,close,nmVolume,nmValue,change,pctChange
0,TCB,2022-04-08,HOSE,48.85,7030100.0,344361100000.0,-0.45,-0.9128
1,TCB,2022-04-07,HOSE,49.3,5971000.0,297645900000.0,-0.6,-1.2024
2,TCB,2022-04-06,HOSE,49.9,9012200.0,443282200000.0,0.85,1.7329
3,TCB,2022-04-05,HOSE,49.05,5803000.0,285601600000.0,-0.6,-1.2085
4,TCB,2022-04-04,HOSE,49.65,9775600.0,488163100000.0,-0.65,-1.2922
