In [12]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
URL = "https://api-open.data.gov.sg/v2/real-time/api/relative-humidity"
START_DATE = datetime(2024, 9, 1)
END_DATE = datetime(2024, 9, 30)
DB_CONNECTION_STRING = 'postgresql://postgres:admin@localhost:5432/data_gov_project'

def fetch_humidity_data(current_datetime):
    date_str = current_datetime.strftime("%Y-%m-%d")
    params = {"date": date_str}
    
    try:
        response = requests.get(URL, params=params, timeout=10)
        response.raise_for_status()
        json_data = response.json()
        
        if json_data.get("code") == 0 and "data" in json_data:
            readings = json_data["data"].get("readings", [])
            return [
                {
                    'stationId': data.get("stationId"),
                    'humidity': data.get("value"),
                    'timestamp': reading["timestamp"]
                }
                for reading in readings
                for data in reading["data"]
            ]
        else:
            logging.warning(f"No humidity readings returned for {date_str}. Code: {json_data.get('code')}")
            return []
    except requests.RequestException as e:
        logging.error(f"Failed to fetch data for {date_str}: {str(e)}")
        return []

def process_data(start_date, end_date):
    date_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]
    humidity_data = []
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_date = {executor.submit(fetch_humidity_data, date): date for date in date_range}
        for future in as_completed(future_to_date):
            humidity_data.extend(future.result())
    
    return humidity_data

def main():
    humidity_data = process_data(START_DATE, END_DATE)
    
    if humidity_data:
        df = pd.DataFrame(humidity_data)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.rename(columns={'stationId': 'station_id', 'timestamp': 'humidity_date', 'humidity': 'humidity_readings'})
        
        logging.info(f"Data shape: {df.shape}")
        logging.info(f"\n{df.head()}")
        logging.info(f"\n{df.info()}")
        
        try:
            engine = create_engine(DB_CONNECTION_STRING)
            df.to_sql('humidity', engine, if_exists='append', index=False)
            logging.info("Data successfully loaded into PostgreSQL.")
        except Exception as e:
            logging.error(f"Failed to load data into PostgreSQL: {str(e)}")
    else:
        logging.warning("No humidity data collected.")

if __name__ == "__main__":
    main()

2024-11-26 22:06:28,401 - INFO - Data shape: (41116, 3)
2024-11-26 22:06:28,402 - INFO - 
  station_id  humidity_readings             humidity_date
0       S108               77.9 2024-09-04 23:59:00+08:00
1       S109               74.3 2024-09-04 23:59:00+08:00
2       S117               82.7 2024-09-04 23:59:00+08:00
3       S107               78.5 2024-09-04 23:59:00+08:00
4       S104               89.7 2024-09-04 23:59:00+08:00
2024-11-26 22:06:28,406 - INFO - 
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41116 entries, 0 to 41115
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype                    
---  ------             --------------  -----                    
 0   station_id         41116 non-null  object                   
 1   humidity_readings  41116 non-null  float64                  
 2   humidity_date      41116 non-null  datetime64[ns, UTC+08:00]
dtypes: datetime64[ns, UTC+08:00](1), float64(1), object(1)
memory usage: 963.8+ KB


2024-11-26 22:06:29,152 - INFO - Data successfully loaded into PostgreSQL.
