# Process VN30F1M data

## Process missing data

In [1]:
# Call libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Resolve the absolute path of the `src` folder that sits next to the
# notebook's working directory (i.e. <cwd>/../src).
src_path = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))

# Put `src` at the front of the module search path so the project-local
# imports in later cells resolve. Any existing occurrence is dropped first,
# which keeps the cell idempotent: re-running it no longer piles up
# duplicate entries in sys.path.
sys.path[:] = [entry for entry in sys.path if entry != src_path]
sys.path.insert(0, src_path)

In [2]:
# Load the VN30F1M sample file; it is tab-separated, hence the explicit `sep`.
raw_csv_path = '../sample_data/VN30F1M.csv'
df = pd.read_csv(
    raw_csv_path,
    sep='\t',
)
# Preview the first rows as the cell output
df.head()

Unnamed: 0,Ticker,DTYYYYMMDD,Open,High,Low,Close,Volume
0,HNX:VN301!,20170810,757.9,757.9,745.5,745.9,367.0
1,HNX:VN301!,20170811,746.0,746.4,743.4,745.7,510.0
2,HNX:VN301!,20170814,745.9,748.9,745.7,748.9,707.0
3,HNX:VN301!,20170815,750.3,750.9,748.0,748.0,946.0
4,HNX:VN301!,20170816,747.7,748.4,745.1,748.1,1065.0


In [3]:
# Print column dtypes and non-null counts of the loaded frame before any cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1893 entries, 0 to 1892
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Ticker      1893 non-null   object 
 1   DTYYYYMMDD  1893 non-null   int64  
 2   Open        1893 non-null   float64
 3   High        1893 non-null   float64
 4   Low         1893 non-null   float64
 5   Close       1893 non-null   float64
 6   Volume      1893 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 103.7+ KB


In [5]:
from process.process_missing_data import process_missing_data

# Fill the gaps in the VN30F1M data. Only the file path is passed, so the
# function's default fill method applies (the original note describes it as
# extrapolation - NOTE(review): confirm against process_missing_data).
missing_data_source = '../sample_data/VN30F1M.csv'
new_df = process_missing_data(missing_data_source)

# Re-check row count, dtypes and non-null counts after filling
new_df.info()

Done! Missing data has been filled using the selected method.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979 entries, 0 to 1978
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   DTYYYYMMDD  1979 non-null   datetime64[ns]
 1   Ticker      1979 non-null   object        
 2   Open        1979 non-null   float64       
 3   High        1979 non-null   float64       
 4   Low         1979 non-null   float64       
 5   Close       1979 non-null   float64       
 6   Volume      1979 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 108.4+ KB


  df_filled.iloc[:, 1:] = df_filled.iloc[:, 1:].interpolate(method='linear', limit_direction='both')


In [6]:
from process.process_outliers import detect_outliers, process_outliers

# Both steps operate on the same price column
target_column = 'Close'

# Step 1: flag outliers in the filled frame with the 'second_largest' method
detected_outliers_df = detect_outliers(
    new_df,
    'second_largest',
    column=target_column,
)

# Step 2: handle the flagged values with the 'replace_second_largest' method
processed_outliers_df = process_outliers(
    detected_outliers_df,
    'replace_second_largest',
    column=target_column,
)

In [8]:
# Locate the project root: take the current working directory (where the
# notebook is running) and go up one level.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)

# Output directory <project_root>/data/processed; create it if it is missing.
processed_data_dir = os.path.join(project_root, "data", "processed")
os.makedirs(processed_data_dir, exist_ok=True)

# Backward-compatible alias: this directory used to be exposed under the
# (misleading) name `raw_data_path`, although it holds processed data.
raw_data_path = processed_data_dir

# Write the outlier-processed frame as a UTF-8 CSV without the index column.
# The file name is a plain string literal; the previous f-string had no
# placeholders to format.
file_path = os.path.join(processed_data_dir, "processed_VN30F1M_data.csv")
processed_outliers_df.to_csv(file_path, index=False, encoding='utf-8')