In [9]:
import os
import sys
sys.path.append(os.path.abspath('../src'))  # Add src directory to path

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from utils.config_utils import load_config

# Get database configuration using the correct path to config.yaml
config = load_config("../config.yaml")  # Go up one directory to find config.yaml
db_config = config['database']

# Create SQLAlchemy engine.
# The URL is assembled from its components instead of an f-string: URL.create
# keeps each field separate, so a user name or password containing characters
# such as '@', ':' or '/' cannot be mis-parsed as part of the host or path.
db_url = URL.create(
    drivername="postgresql",
    username=db_config['user'],
    password=db_config['password'],
    host=db_config['host'],
    port=db_config['port'],
    database=db_config['dbname'],
)
engine = create_engine(db_url)

# Read into pandas DataFrames using SQLAlchemy 2.0 syntax.
# The connection is released when the `with` block exits.
with engine.connect() as connection:
    stocks_df = pd.read_sql_query(text("SELECT * FROM stocks_data"), connection)
    indices_df = pd.read_sql_query(text("SELECT * FROM indices_data"), connection)

# Summarise both frames. DataFrame.info() writes its report to stdout and
# returns None, so it is called directly; wrapping it in print() would append
# a stray "None" line after each report.
print("\nStocks DataFrame Info:")
stocks_df.info()

print("\nIndices DataFrame Info:")
indices_df.info()


Stocks DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22440 entries, 0 to 22439
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   22434 non-null  datetime64[ns]
 1   open       22434 non-null  float64       
 2   high       22434 non-null  float64       
 3   low        22434 non-null  float64       
 4   close      22434 non-null  float64       
 5   adj_close  22434 non-null  float64       
 6   volume     22434 non-null  float64       
 7   symbol     22440 non-null  object        
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 1.4+ MB
None

Indices DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14960 entries, 0 to 14959
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   14956 non-null  datetime64[ns]
 1   open       14956 non-null  float64       
 

In [10]:
# Display stocks_df as the cell output; pandas abbreviates a long frame to its
# first and last rows.
stocks_df

Unnamed: 0,datetime,open,high,low,close,adj_close,volume,symbol
0,NaT,,,,,,,AAPL
1,2010-01-04,7.622500,7.660714,7.585000,7.643214,6.454506,493729600.0,AAPL
2,2010-01-05,7.664286,7.699643,7.616071,7.656429,6.465664,601904770.0,AAPL
3,2010-01-06,7.656429,7.686786,7.526786,7.534643,6.362819,552160000.0,AAPL
4,2010-01-07,7.562500,7.571429,7.466071,7.520714,6.351057,477131200.0,AAPL
...,...,...,...,...,...,...,...,...
22435,2024-11-11,148.680000,148.850000,143.570000,145.260000,145.260000,182325600.0,NVDA
22436,2024-11-12,146.780000,149.650000,146.010000,148.290000,148.290000,198634700.0,NVDA
22437,2024-11-13,149.070000,149.330000,145.900000,146.270000,146.270000,191903300.0,NVDA
22438,2024-11-14,147.640000,149.000000,145.550000,146.760000,146.760000,194463300.0,NVDA


In [11]:
# Show the dtype pandas assigned to each column of stocks_df.
stocks_df.dtypes

datetime     datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
adj_close           float64
volume              float64
symbol               object
dtype: object

In [12]:
# Count the missing (NaN/NaT) values in each column of stocks_df.
stocks_df.isna().sum()

datetime     6
open         6
high         6
low          6
close        6
adj_close    6
volume       6
symbol       0
dtype: int64

In [13]:
# Missing-value audit of stocks_df.
# The boolean NaN mask and the per-row "has any NaN" flag are computed once
# and reused by every step below instead of being rebuilt for each report.
na_mask = stocks_df.isna()
rows_with_na = na_mask.any(axis=1)

# 1. Get total count of NaN values in each column
print("NaN count in each column:")
print(na_mask.sum())

# 2. Get percentage of NaN values in each column
print("\nPercentage of NaN values in each column:")
print((na_mask.sum() / len(stocks_df)) * 100)

# 3. Show rows with any NaN values
print("\nSample of rows containing NaN values:")
print(stocks_df[rows_with_na].head())

# 4. Count of rows with NaN values
print("\nTotal rows with any NaN value:")
print(rows_with_na.sum())

# 5. Check NaN values by symbol
# A single groupby pass replaces re-filtering the whole frame once per symbol.
# sort=False keeps symbols in order of first appearance (the order .unique()
# yields); dropna=False keeps rows whose symbol itself is missing, which an
# `== symbol` filter can never match.
print("\nNaN values by symbol:")
for symbol, symbol_mask in na_mask.groupby(stocks_df['symbol'], sort=False, dropna=False):
    print(f"\n{symbol}:")
    print(symbol_mask.sum())

# 6. Detailed info about the DataFrame
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit a trailing "None" line).
print("\nDataFrame Info:")
stocks_df.info()

NaN count in each column:
datetime     6
open         6
high         6
low          6
close        6
adj_close    6
volume       6
symbol       0
dtype: int64

Percentage of NaN values in each column:
datetime     0.026738
open         0.026738
high         0.026738
low          0.026738
close        0.026738
adj_close    0.026738
volume       0.026738
symbol       0.000000
dtype: float64

Sample of rows containing NaN values:
      datetime  open  high  low  close  adj_close  volume symbol
0          NaT   NaN   NaN  NaN    NaN        NaN     NaN   AAPL
3735       NaT   NaN   NaN  NaN    NaN        NaN     NaN   AAPL
7480       NaT   NaN   NaN  NaN    NaN        NaN     NaN   MSFT
11215      NaT   NaN   NaN  NaN    NaN        NaN     NaN   MSFT
14960      NaT   NaN   NaN  NaN    NaN        NaN     NaN   NVDA

Total rows with any NaN value:
6

NaN values by symbol:

AAPL:
datetime     2
open         2
high         2
low          2
close        2
adj_close    2
volume       2
symbol    