In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import logging

In [2]:
# Configure basic logging: INFO and above, timestamped.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Project layout. The root is taken to be the parent of the current working
# directory, so every path below depends on where the kernel was started.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
ENV_FILE_PATH = os.path.join(PROJECT_ROOT, '.env')
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

# Excel workbook that tracks the analysis, stored inside the data directory
ANALYSIS_TRACKER_FILENAME = 'log_analysis_tracker.xlsx'
ANALYSIS_TRACKER_PATH = os.path.join(DATA_DIR, ANALYSIS_TRACKER_FILENAME)

# Name of the tracker column that holds the remote log directory paths
LOG_PATH_COLUMN = 'remote_log_directory'

# Echo the resolved paths so a wrong working directory is spotted immediately
print(f"""
PROJECT_ROOT: {PROJECT_ROOT}
DATA_DIR: {DATA_DIR}
ENV_FILE_PATH: {ENV_FILE_PATH}
ANALYSIS_TRACKER_PATH: {ANALYSIS_TRACKER_PATH}
      """)


PROJECT_ROOT: /Users/benkaan/Desktop/projects/remote-log-analysis-automation
DATA_DIR: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/data
ENV_FILE_PATH: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/.env
ANALYSIS_TRACKER_PATH: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/data/log_analysis_tracker.xlsx
      


In [3]:
def load_environment_variables(env_path: str) -> dict:
    """
    Loads the required SSH settings from a specified .env file.

    The file is loaded with ``load_dotenv`` and the required variables are
    then read back from the process environment.

    Args:
        env_path (str): The full path to the .env file.

    Returns:
        dict: A dictionary with the keys ``SSH_HOSTNAME``, ``SSH_USERNAME``
            and ``SSH_PASSWORD``, each mapped to a non-empty string.

    Raises:
        FileNotFoundError: If ``env_path`` does not point to an existing file.
        ValueError: If any required environment variable is missing or empty.
    """
    # isfile rather than exists: a directory at this path is not a usable
    # .env file and must be reported as such instead of being loaded.
    if not os.path.isfile(env_path):
        error_msg = f".env file not found at {env_path}. Please create it based on .env.example"
        logging.error(error_msg)
        raise FileNotFoundError(error_msg)

    # Announce the load before performing it so the log reads in order.
    logging.info(f"Loading environment variables from: {env_path}")
    load_dotenv(dotenv_path=env_path)

    required_vars = ["SSH_HOSTNAME", "SSH_USERNAME", "SSH_PASSWORD"]
    env_vars = {var: os.getenv(var) for var in required_vars}

    # A variable set to an empty string is as unusable as an absent one, so
    # both are rejected here rather than surfacing later as a failed login.
    missing_vars = [var for var, value in env_vars.items() if not value]
    if missing_vars:
        error_msg = f"Missing required environment variables in {env_path}: {', '.join(missing_vars)}."
        logging.error(error_msg)
        raise ValueError(error_msg)

    logging.info("Environemnt variables are loaded successfully.")
    return env_vars

# --- Load the variables ---
try:
    ssh_config = load_environment_variables(ENV_FILE_PATH)
except (FileNotFoundError, ValueError) as config_error:
    # Keep the notebook running; later cells must cope with ssh_config being None.
    logging.error(f"Failed to load configuration: {config_error}")
    ssh_config = None
else:
    # Only the hostname is logged; the credentials stay out of the output.
    logging.info(f"SSH Hostname from .env: {ssh_config.get('SSH_HOSTNAME')}")


2025-05-01 11:22:54,144 - INFO - Loading environment variables from: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/.env
2025-05-01 11:22:54,144 - INFO - Environemnt variables are loaded successfully.
2025-05-01 11:22:54,145 - INFO - SSH Hostname from .env: localhost


In [6]:
def get_analysis_tracker(filename: str, required_col: str) -> pd.DataFrame | None:
    """
    Loads the analysis tracker Excel file into a pandas DataFrame.
    Checks for the required column.

    Args:
        filename (str): The path to the Excel tracker file.
        required_col (str): The name of the column that must exist.

    Returns:
        pd.DataFrame or None: The loaded DataFrame, or None if loading fails.
    """
    if not os.path.exists(filename):
        logging.error(f"Tracker file not found: '{filename}'. Please create it.")
        return None

    logging.info(f"Loading tracker file: {filename}")
    try:
        df = pd.read_excel(filename, header=0)
        logging.info(f"Successfully loaded tracker with shape: {df.shape}")

        # Validate required column exists
        if required_col not in df.columns:
            logging.error(f"Tracker file '{filename}' is missing the required column: '{required_col}'")
            return None

        logging.info(f"Required columns '{required_col}' found in tracker.")
        return df
    except FileNotFoundError:
        logging.error(f"File not found error during pandas read: '{filename}'")
        return None
    except Exception as e:
        logging.error(f"Failed to read tracker file '{filename}': {e}")
        return None

# --- Load the tracker ---
analysis_df = get_analysis_tracker(ANALYSIS_TRACKER_PATH, LOG_PATH_COLUMN)

# get_analysis_tracker signals every failure by returning None.
if analysis_df is None:
    print("\nFailed to load the analysis tracker DataFrame.")
else:
    # Quick sanity check: first rows, then column dtypes and null counts.
    print("\n--- Tracker DataFrame Head ---")
    print(analysis_df.head())
    print("\n--- Tracker DataFrame Info ---")
    analysis_df.info()

2025-05-01 11:51:46,581 - INFO - Loading tracker file: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/data/log_analysis_tracker.xlsx
2025-05-01 11:51:46,598 - INFO - Successfully loaded tracker with shape: (12, 4)
2025-05-01 11:51:46,600 - INFO - Required columns 'remote_log_directory' found in tracker.
2025-05-01 11:51:46,598 - INFO - Successfully loaded tracker with shape: (12, 4)
2025-05-01 11:51:46,600 - INFO - Required columns 'remote_log_directory' found in tracker.



--- Tracker DataFrame Head ---
                            remote_log_directory    project department  \
0    /logs/marketing/campaigns/job_segment_users  marketing  campaigns   
1       /logs/marketing/campaigns/job_update_crm  marketing  campaigns   
2      /logs/marketing/campaigns/job_email_blast  marketing  campaigns   
3      /logs/marketing/analytics/job_web_traffic  marketing  analytics   
4  /logs/marketing/analytics/job_conversion_rate  marketing  analytics   

              job_name  
0    job_segment_users  
1       job_update_crm  
2      job_email_blast  
3      job_web_traffic  
4  job_conversion_rate  

--- Tracker DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   remote_log_directory  12 non-null     object
 1   project               12 non-null     object
 2   department            12 non-null 