# Power BI Gateway Log Analysis

This notebook analyzes Power BI Gateway logs, specifically focusing on:
- Query Execution Reports
- Query Start Reports

The logs are loaded from the directory given by the `LOG_FILE_PATH` variable in the `.env` file. Each sub-folder of that directory is scanned for report files.

In [2]:
# Import required libraries
import os
import pandas as pd
import json
import glob
from pathlib import Path
from dotenv import load_dotenv
from datetime import datetime
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation and parsing
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Load environment variables from a .env file (if one is found) into
# os.environ; the configuration cell reads LOG_FILE_PATH from there.
load_dotenv()

print("Required libraries imported successfully!")

Required libraries imported successfully!


In [4]:
# Configuration - Load path from .env file
#
# Resolves `log_file_path` (the directory that holds one sub-folder per
# gateway) and `folders` (the sorted names of those sub-folders). When the
# configured directory is unavailable, a small synthetic data set is written
# to disk and used instead so the rest of the notebook can still run.
log_file_path = os.getenv('LOG_FILE_PATH')
if not log_file_path:
    raise ValueError("LOG_FILE_PATH not found in .env file")

print(f"Log file path from .env: {log_file_path}")

# Check that the path is a usable directory and handle gracefully.
# isdir (rather than exists) also rejects a path that points at a regular
# file, which would otherwise make the os.listdir call below raise.
if not os.path.isdir(log_file_path):
    print(f"WARNING: Path does not exist or is not a directory: {log_file_path}")
    print("This could happen if:")
    print("1. The path in .env file is incorrect")
    print("2. The path is on a different machine or network drive")
    print("3. The path needs to be mapped or accessible")

    # A control character inside the path is the signature of a Windows path
    # whose backslash sequences (e.g. "\a", "\t", "\n") were decoded as escapes
    # when the .env value was parsed.
    if any(ord(ch) < 32 for ch in log_file_path):
        print("4. Backslashes in the .env value were read as escape sequences "
              "(use forward slashes or double the backslashes)")

    # Create a sample directory structure for demonstration.
    # abspath("") is the current working directory, so dirname() places the
    # sample folder in the PARENT of the working directory.
    current_dir = os.path.dirname(os.path.abspath(""))
    sample_path = os.path.join(current_dir, "sample_gateway_logs")

    print(f"\nCreating sample directory structure at: {sample_path}")

    # Synthetic report contents
    sample_execution_data = {
        'RequestId': ['req1', 'req2', 'req3'],
        'StartTime': ['2025-09-19 10:00:00', '2025-09-19 10:01:00', '2025-09-19 10:02:00'],
        'EndTime': ['2025-09-19 10:00:05', '2025-09-19 10:01:03', '2025-09-19 10:02:02'],
        'Duration': [5000, 3000, 2000],
        'Status': ['Success', 'Success', 'Failed']
    }

    sample_start_data = {
        'RequestId': ['req1', 'req2', 'req3'],
        'StartTime': ['2025-09-19 10:00:00', '2025-09-19 10:01:00', '2025-09-19 10:02:00'],
        'QueryType': ['DirectQuery', 'Import', 'DirectQuery'],
        'DataSource': ['SQL Server', 'SharePoint', 'SQL Server']
    }

    # Every sample gateway folder receives the same pair of report files.
    sample_reports = {
        "QueryExecutionReport_20250919.csv": sample_execution_data,
        "QueryStartReport_20250919.csv": sample_start_data,
    }
    for gateway_name in ("Gateway1", "Gateway2"):
        gateway_dir = os.path.join(sample_path, gateway_name)
        os.makedirs(gateway_dir, exist_ok=True)
        for report_name, report_rows in sample_reports.items():
            pd.DataFrame(report_rows).to_csv(
                os.path.join(gateway_dir, report_name),
                index=False
            )

    print("Sample files created successfully!")
    log_file_path = sample_path
    print(f"Using sample path: {log_file_path}")

# List all folders in the path (sorted: os.listdir order is arbitrary, and the
# load order downstream follows this list)
folders = sorted(
    entry for entry in os.listdir(log_file_path)
    if os.path.isdir(os.path.join(log_file_path, entry))
)
print(f"Found {len(folders)} folders: {folders}")

Log file path from .env: C:\Usersveekr\OneDrive - Microsoft\Documents\Customers\NFM\OneDrive_1_9-15-2025
This could happen if:
1. The path in .env file is incorrect
2. The path is on a different machine or network drive
3. The path needs to be mapped or accessible

Creating sample directory structure at: c:\Users\aveekr\OneDrive - Microsoft\Documents\Demos\my_code_samples\azuresamples\sample_gateway_logs
Sample files created successfully!
Using sample path: c:\Users\aveekr\OneDrive - Microsoft\Documents\Demos\my_code_samples\azuresamples\sample_gateway_logs
Found 2 folders: ['Gateway1', 'Gateway2']


In [5]:
def _load_report_files(folder_path, patterns, report_label):
    """
    Load every report file in ``folder_path`` matching one of ``patterns``.

    Shared implementation behind load_query_execution_reports and
    load_query_start_reports.

    Args:
        folder_path: Directory to search (not recursive).
        patterns: Iterable of glob patterns, relative to ``folder_path``.
        report_label: Text used in the "Found N ... files" progress message.

    Returns:
        A list with one DataFrame per successfully loaded file, in sorted path
        order. Each frame gets two extra columns: ``source_file`` (file name)
        and ``folder`` (name of the containing folder). Files that fail to
        parse are reported and skipped; an empty list is returned when nothing
        matches.
    """
    # normpath strips a trailing separator first; basename("logs/Gateway1/")
    # on its own is "" and would leave the folder column blank.
    folder_name = os.path.basename(os.path.normpath(folder_path))

    # A set avoids loading a file twice when several patterns match it;
    # sorting makes the load order independent of the filesystem.
    matched_paths = set()
    for pattern in patterns:
        matched_paths.update(glob.glob(os.path.join(folder_path, pattern)))
    report_files = sorted(matched_paths)

    print(f"Found {len(report_files)} {report_label} files in {folder_path}")

    frames = []
    for file_path in report_files:
        file_name = os.path.basename(file_path)
        lowered = file_path.lower()
        try:
            if lowered.endswith('.csv'):
                df = pd.read_csv(file_path)
                file_kind = "CSV"
            elif lowered.endswith('.json'):
                # utf-8-sig reads plain UTF-8 and also tolerates a leading BOM,
                # instead of depending on the platform's locale encoding.
                with open(file_path, 'r', encoding='utf-8-sig') as f:
                    df = pd.json_normalize(json.load(f))
                file_kind = "JSON"
            else:
                # Report instead of dropping silently, so matched-but-unread
                # files are visible in the output.
                # NOTE(review): gateway reports may be written as
                # comma-separated .log files — confirm whether .log should be
                # parsed as CSV here.
                print(f"  Skipped (not .csv or .json): {file_name}")
                continue

            df['source_file'] = file_name
            df['folder'] = folder_name
            frames.append(df)
            print(f"  Loaded {file_kind}: {file_name} ({len(df)} rows)")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"  Error loading {file_path}: {str(e)}")

    return frames


def load_query_execution_reports(folder_path):
    """
    Load Query Execution Report files from a given folder.

    Reads every ``*QueryExecutionReport*`` file with a .csv or .json extension
    found directly inside ``folder_path``.

    Args:
        folder_path: Directory containing the report files.

    Returns:
        List of DataFrames, one per loaded file, each tagged with
        ``source_file`` and ``folder`` columns.
    """
    # Common patterns for query execution report files
    execution_patterns = ['*QueryExecutionReport*']
    return _load_report_files(folder_path, execution_patterns, "query execution")


def load_query_start_reports(folder_path):
    """
    Load Query Start Report files from a given folder.

    Reads every ``*QueryStartReport*`` file with a .csv or .json extension
    found directly inside ``folder_path``.

    Args:
        folder_path: Directory containing the report files.

    Returns:
        List of DataFrames, one per loaded file, each tagged with
        ``source_file`` and ``folder`` columns.
    """
    # Common patterns for query start report files
    start_patterns = ['*QueryStartReport*']
    return _load_report_files(folder_path, start_patterns, "query start")


print("Data loading functions defined successfully!")

Data loading functions defined successfully!


In [None]:
# Main data loading script: walk every gateway folder and collect the
# per-file DataFrames returned by the two loader functions.
all_execution_data = []
all_start_data = []

print("Starting data loading process...")
print(50 * "=")

for folder in folders:
    folder_path = os.path.join(log_file_path, folder)
    print(f"\nProcessing folder: {folder}")
    print(30 * "-")

    # Completed-query reports for this folder
    execution_data = load_query_execution_reports(folder_path)
    all_execution_data += execution_data

    # Query-initiation reports for this folder
    start_data = load_query_start_reports(folder_path)
    all_start_data += start_data

print("\n" + 50 * "=")
print("Data loading summary:")
print(f"Total execution report datasets: {len(all_execution_data)}")
print(f"Total start report datasets: {len(all_start_data)}")

In [None]:
# Combine the per-file frames of each report type into one DataFrame
def _combine_report_frames(frames, report_title, short_name, missing_name):
    """
    Concatenate ``frames`` into a single DataFrame and print a short summary.

    Args:
        frames: List of DataFrames carrying ``source_file`` and ``folder`` columns.
        report_title: Heading text, e.g. "Query Execution Report".
        short_name: Word used in the preview caption, e.g. "execution".
        missing_name: Phrase used when ``frames`` is empty.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when ``frames`` is empty.
    """
    if not frames:
        print(f"\nNo {missing_name} data found")
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    file_count = combined['source_file'].nunique()
    folder_count = combined['folder'].nunique()

    print(f"\nCombined {report_title}:")
    print(f"Total rows: {len(combined)}")
    print(f"Columns: {list(combined.columns)}")
    print(f"Data sources: {file_count} files from {folder_count} folders")

    # Display first few rows
    print(f"\nFirst 5 rows of {short_name} data:")
    display(combined.head())
    return combined


execution_df = _combine_report_frames(
    all_execution_data, "Query Execution Report", "execution", "query execution report"
)
start_df = _combine_report_frames(
    all_start_data, "Query Start Report", "start", "query start report"
)

In [None]:
# Optional: write the combined frames to timestamped CSV files (relative
# file names, so they land in the current working directory)
save_to_csv = True  # Set to False if you don't want to save

if save_to_csv:
    # One timestamp shared by both files so a pair from the same run lines up
    timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

    if not execution_df.empty:
        execution_filename = "combined_query_execution_report_" + timestamp + ".csv"
        execution_df.to_csv(execution_filename, index=False)
        print("Query execution data saved to: " + execution_filename)

    if not start_df.empty:
        start_filename = "combined_query_start_report_" + timestamp + ".csv"
        start_df.to_csv(start_filename, index=False)
        print("Query start data saved to: " + start_filename)

print(
    "\nData loading complete! You can now analyze the data using the variables:",
    "- execution_df: Contains all query execution report data",
    "- start_df: Contains all query start report data",
    sep="\n",
)