In [1]:

import logging
import os
import stat
from datetime import date

import pandas as pd
import paramiko
from dotenv import load_dotenv

# Configure basic logging: INFO and above, one timestamped line per record
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Project layout. The project root is taken to be the parent of the current
# working directory; the data folder and the .env file hang off that root.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
ENV_FILE_PATH = os.path.join(PROJECT_ROOT, '.env')

# Excel tracker workbook, stored inside the data folder
ANALYSIS_TRACKER_FILENAME = 'log_analysis_tracker.xlsx'
ANALYSIS_TRACKER_PATH = os.path.join(DATA_DIR, ANALYSIS_TRACKER_FILENAME)


# Name of the tracker column that holds the remote log directory paths
LOG_PATH_COLUMN = 'remote_log_directory'

# SFTP port used for the Docker container connection
SFTP_PORT = 2222

# Local base directory under which downloaded logs are stored
LOCAL_LOG_STORAGE_BASE = os.path.join(DATA_DIR, 'downloaded_logs')


def load_environment_variables(env_path: str) -> dict:
    """
    Loads required environment variables from a specified .env file.

    Args:
        env_path (str): The full path to the .env file.

    Returns:
        dict: Maps "SSH_HOSTNAME", "SSH_USERNAME" and "SSH_PASSWORD" to the
            values read from the process environment after the file is loaded.

    Raises:
        FileNotFoundError: If the .env file is not found.
        ValueError: If any required environment variable is missing or blank.
    """
    if not os.path.exists(env_path):
        error_msg = f".env file not found at {env_path}. Please create it based on .env.example"
        logging.error(error_msg)
        raise FileNotFoundError(error_msg)

    logging.info(f"Loading environment variables from: {env_path}")
    load_dotenv(dotenv_path=env_path)
    required_vars = ["SSH_HOSTNAME", "SSH_USERNAME", "SSH_PASSWORD"]
    env_vars = {var: os.getenv(var) for var in required_vars}

    # An unfilled placeholder such as `SSH_PASSWORD=` comes back as "" rather
    # than None; report it here instead of failing later at connection time.
    missing_vars = [
        var for var, value in env_vars.items()
        if value is None or not value.strip()
    ]
    if missing_vars:
        error_msg = f"Missing required environment variables in {env_path}: {', '.join(missing_vars)}."
        logging.error(error_msg)
        raise ValueError(error_msg)

    logging.info("Environment variables are loaded successfully.")
    return env_vars

In [2]:
def get_current_date_string() -> str:
    """Formats the current local date compactly as 'YYYYMMDD'."""
    today = date.today()
    return f"{today:%Y%m%d}"

def get_analysis_tracker(filename: str, required_col: str) -> pd.DataFrame | None:
    """Loads the analysis tracker Excel file into a pandas DataFrame."""
    if not os.path.exists(filename):
        logging.error(f"Tracker file not found: '{filename}'. Please create it.")
        return None

    logging.info(f"Loading tracker file: '{filename}'")
    try:
        df = pd.read_excel(filename, header=0)
        logging.info(f"Successfully loaded tracker with shape: {df.shape}")
        if required_col not in df.columns:
            logging.error(f"Tracker file '{filename}' is missing the required column: '{required_col}'")
            return None
        logging.info(f"Required column '{required_col}' found in tracker.")
        return df
    except Exception as e:
        logging.error(f"Failed to read tracker file '{filename}': {e}")
        return None


def create_log_download_directory(base_dir: str, date_string: str) -> str:
    """
    Creates the directory for storing logs downloaded on a specific date.

    Args:
        base_dir (str): Parent directory for all downloaded logs.
        date_string (str): Date label; the directory is named
            "<date_string>_logs".

    Returns:
        str: Path of the (now existing) log directory.

    Raises:
        OSError: If the directory cannot be created, including when a
            non-directory already occupies the target path.
    """
    log_dir = os.path.join(base_dir, f"{date_string}_logs")
    already_present = os.path.isdir(log_dir)
    try:
        # exist_ok=True avoids the check-then-create race and still raises
        # when the path exists but is not a directory.
        os.makedirs(log_dir, exist_ok=True)
    except OSError as e:
        logging.error(f"Failed to create directory {log_dir}: {e}")
        raise  # Re-raise the error if directory creation fails
    if not already_present:
        logging.info(f"Created local log directory: {log_dir}")
    return log_dir

# ## 5. Download Logs Logic

def download_latest_logs(df: pd.DataFrame, ssh_config: dict, local_log_dir: str) -> list:
    """
    Connects via SFTP and downloads the latest log file from each directory
    specified in the DataFrame to the local log directory.

    Args:
        df (pd.DataFrame): DataFrame containing log paths in the LOG_PATH_COLUMN.
        ssh_config (dict): Dictionary with SSH connection details.
        local_log_dir (str): Local directory to save downloaded logs.

    Returns:
        list: A list of remote paths that could not be accessed or processed.
    """
    problematic_remote_paths = []
    downloaded_log_filenames = set() # Track unique log filenames downloaded this run

    logging.info(f"Starting log download process...")

    try:
        # Use context manager for SSH connection
        with paramiko.SSHClient() as ssh_client:
            ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            logging.info(
                f"Attempting SSH connection to: "
                f"{ssh_config.get('SSH_HOSTNAME')}:{SFTP_PORT} "
                f"as user '{ssh_config.get('SSH_USERNAME')}'..."
            )
            ssh_client.connect(
                hostname=ssh_config.get('SSH_HOSTNAME'),
                port=SFTP_PORT,
                username=ssh_config.get('SSH_USERNAME'),
                password=ssh_config.get('SSH_PASSWORD'),
                timeout=10
            )
            logging.info("SSH connection established successfully.")

            # Use context manager for SFTP session
            with ssh_client.open_sftp() as sftp_client:
                logging.info("SFTP session opened successfully.")

                # Iterate through DataFrame rows
                for index, row in df.iterrows():
                    remote_path = str(row[LOG_PATH_COLUMN]) if pd.notna(row[LOG_PATH_COLUMN]) else None

                    if not remote_path:
                        logging.warning(f"Skipping row {index}: Missing or invalid log path.")
                        continue

                    logging.info(f"Processing remote path: {remote_path}")
                    try:
                        # List files in the remote directory
                        remote_files = sftp_client.listdir(remote_path)
                        if not remote_files:
                            logging.warning(f"No log files found in: {remote_path}")
                            # Mark as problematic or handle as needed later
                            if remote_path not in problematic_remote_paths:
                                 problematic_remote_paths.append(remote_path) # Add path with no files
                            continue

                        # Find the latest log file (using max() assumes lexicographical sort works)
                        latest_log_filename = max(remote_files)
                        logging.info(f"Latest log file identified: {latest_log_filename}")

                        # Avoid downloading the same log file multiple times if different rows point to it
                        if latest_log_filename in downloaded_log_filenames:
                            logging.info(f"Skipping duplicate download for: {latest_log_filename}")
                            continue

                        # Construct full paths
                        # Ensure forward slashes for remote path, handle potential trailing slash
                        full_remote_log_path = f"{remote_path.rstrip('/')}/{latest_log_filename}"
                        local_log_path = os.path.join(local_log_dir, latest_log_filename)

                        logging.info(f"Attempting download: '{full_remote_log_path}' -> '{local_log_path}'")
                        sftp_client.get(full_remote_log_path, local_log_path)
                        downloaded_log_filenames.add(latest_log_filename)
                        logging.info(f"Successfully downloaded.")

                    except FileNotFoundError:
                         # Handle case where the remote directory itself doesn't exist
                         logging.error(f"Remote path not found: {remote_path}")
                         if remote_path not in problematic_remote_paths:
                             problematic_remote_paths.append(remote_path)
                    except IOError as io_err:
                        # Log specific SFTP/IO errors (e.g., permission denied)
                        logging.error(f"SFTP Error accessing {remote_path}: {io_err}")
                        if remote_path not in problematic_remote_paths:
                             problematic_remote_paths.append(remote_path)
                    except Exception as e:
                        # Catch other potential errors during file processing for this path
                        logging.error(f"Unexpected error processing {remote_path} or downloading {latest_log_filename if 'latest_log_filename' in locals() else 'unknown file'}: {e}")
                        if remote_path not in problematic_remote_paths:
                             problematic_remote_paths.append(remote_path)

    # Handle connection-level errors outside the loop
    except paramiko.AuthenticationException:
        logging.error("Authentication failed. Check username/password in .env file.")
        raise # Re-raise critical errors
    except paramiko.SSHException as ssh_ex:
        logging.error(f"SSH connection error: {ssh_ex}")
        raise
    except TimeoutError:
        logging.error("Connection timed out.")
        raise
    except EOFError as eof_err:
         logging.error(f"EOFError during SFTP setup: {eof_err}. Check SFTP subsystem/chroot config on server.")
         raise
    except Exception as e:
        logging.exception(f"An unexpected error occurred during connection or SFTP setup: {e}")
        raise

    logging.info("Log download process finished.")
    if problematic_remote_paths:
         unique_problems = sorted(list(set(problematic_remote_paths)))
         logging.warning(f"Could not access or process the following {len(unique_problems)} unique remote paths: {unique_problems}")
    return problematic_remote_paths


# Execute Download

# --- Load Config ---
# If the .env file or a required variable is missing, both values are reset to
# None so the download step below is skipped instead of crashing the cell.
try:
    ssh_config = load_environment_variables(ENV_FILE_PATH)
    analysis_df = get_analysis_tracker(ANALYSIS_TRACKER_PATH, LOG_PATH_COLUMN)
except (FileNotFoundError, ValueError) as config_err:
    logging.error(f"Failed to load configuration: {config_err}")
    ssh_config, analysis_df = None, None

# --- Prepare Local Directory ---
if analysis_df is None or ssh_config is None:
    logging.warning("Configuration or Tracker DataFrame not loaded. Skipping download.")
    print("\nSkipping download process due to configuration errors.")
else:
    try:
        today_date_str = get_current_date_string()
        local_download_dir = create_log_download_directory(LOCAL_LOG_STORAGE_BASE, today_date_str)
        logging.info(f"Local directory for today's logs: {local_download_dir}")

        # --- Run the Download ---
        problem_paths = download_latest_logs(analysis_df, ssh_config, local_download_dir)

        # Report which remote paths, if any, could not be processed.
        print("\n--- Download Summary ---")
        if problem_paths:
            print(f"Encountered issues with {len(problem_paths)} paths:")
            for path in problem_paths:
                print(f"- {path}")
        else:
            print("All paths processed successfully.")

    except Exception as main_err:
        # Any failure while preparing the directory or downloading is logged
        # and echoed to the cell output rather than propagated.
        logging.error(f"Error during main execution: {main_err}")
        print(f"\nERROR: Script execution failed: {main_err}")


2025-05-01 15:45:29,029 - INFO - Loading environment variables from: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/.env
2025-05-01 15:45:29,030 - INFO - Environemnt variables are loaded successfully.
2025-05-01 15:45:29,031 - INFO - Loading tracker file: '/Users/benkaan/Desktop/projects/remote-log-analysis-automation/data/log_analysis_tracker.xlsx'
2025-05-01 15:45:29,145 - INFO - Successfully loaded tracker with shape: (12, 4)
2025-05-01 15:45:29,146 - INFO - Required column 'remote_log_directory' found in tracker.
2025-05-01 15:45:29,146 - INFO - Local directory for today's logs: /Users/benkaan/Desktop/projects/remote-log-analysis-automation/data/downloaded_logs/20250501_logs
2025-05-01 15:45:29,146 - INFO - Starting log download process...
2025-05-01 15:45:29,147 - INFO - Attempting SSH connection to: localhost:2222 as user 'sftpuser'...
2025-05-01 15:45:29,168 - INFO - Connected (version 2.0, client OpenSSH_9.2p1)
2025-05-01 15:45:29,373 - INFO - Authentication (pa


--- Download Summary ---
All paths processed successfully.
