Step 1.1: Set Up the Environment

In [None]:
# Install necessary packages
!pip install pyspark
!pip install s3fs
!pip install minio
!pip install pyhive

In [None]:
# Import libraries
import os

from minio import Minio
from pyhive import trino
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

In [None]:
# Initialize a Hive-enabled Spark session that talks to MinIO through the S3A connector.
#
# Credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment
# variables so real secrets never have to be written into the notebook. The
# fallbacks are the values this notebook used before, so an unconfigured local
# run behaves exactly as it did.
# NOTE(review): the S3AFileSystem class named below must be on Spark's classpath
# (presumably provided by the hadoop-aws package) - confirm for your installation.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("RaygunErrorTraceAnalysis")
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Path-style addressing (endpoint/bucket/key) is requested for the MinIO endpoint.
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .enableHiveSupport()
    .getOrCreate()
)

Step 1.2: Upload Multiple JSON Files to MinIO

In [None]:
# Local directory path containing the JSON files to upload
local_directory = "./data/raygun"

# MinIO bucket that receives the uploaded files
bucket_name = "raygun-analysis-bucket"

# S3A glob matching every JSON object in the bucket.
# Derived from bucket_name so the upload target and the read path cannot drift apart.
json_files_path = f"s3a://{bucket_name}/*.json"

In [None]:
# Initialize the MinIO client used to create the bucket and upload files.
#
# Credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment
# variables; the fallbacks are the values this notebook used before, so an
# unconfigured local run behaves exactly as it did.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

minio_client = Minio(
    'localhost:9000',
    access_key=minio_access_key,
    secret_key=minio_secret_key,
    secure=False,  # plain HTTP: the endpoint above is a local, non-TLS server
)

In [None]:
# Create the bucket if it doesn't exist
if not minio_client.bucket_exists(bucket_name):
    minio_client.make_bucket(bucket_name)

# Collect the JSON files to upload.
# - Only regular files are kept, so a directory that happens to be named
#   "*.json" is not handed to fput_object.
# - Names are sorted because os.listdir returns entries in arbitrary order;
#   sorting makes the upload log reproducible between runs.
json_filenames = sorted(
    entry
    for entry in os.listdir(local_directory)
    if entry.endswith(".json")
    and os.path.isfile(os.path.join(local_directory, entry))
)

# Make an empty upload visible here rather than as a confusing failure in a later cell.
if not json_filenames:
    print(f"No .json files found in {local_directory}")

# Upload each file under its own name as the object key
for filename in json_filenames:
    file_path = os.path.join(local_directory, filename)
    minio_client.fput_object(bucket_name, filename, file_path)
    print(f"Uploaded {filename} to {bucket_name}")

Step 2: Read Multiple JSON Files from MinIO

In [None]:
# Load every object matched by json_files_path into a single DataFrame.
# NOTE(review): the reader is used with its default options - confirm the
# uploaded files are in the layout those defaults expect.
json_reader = spark.read.format("json")
df = json_reader.load(json_files_path)

# Print the inferred schema, then preview rows without truncating long values
df.printSchema()
df.show(truncate=False)

Step 3: Process JSON Data

In [None]:
# Flatten the nested structure into one row per stack-trace entry.
# explode() emits one output row per element of Error.StackTrace, so a record
# whose stack trace is null or empty contributes no rows.

# (nested source field, flat output column name)
error_field_aliases = [
    ("Error.Message", "ErrorMessage"),
    ("Error.ClassName", "ErrorClassName"),
    ("Error.FileName", "ErrorFileName"),
]
request_field_aliases = [
    ("Request.HostName", "RequestHostName"),
    ("Request.Url", "RequestUrl"),
    ("Request.HttpMethod", "RequestHttpMethod"),
    ("Request.IpAddress", "RequestIpAddress"),
]
# Request fields selected as-is, without an alias
request_passthrough_fields = ["Request.QueryString", "Request.Headers", "Request.Data"]

error_columns = [col(source).alias(target) for source, target in error_field_aliases]
stack_trace_column = explode(col("Error.StackTrace")).alias("StackTrace")
request_columns = [col(source).alias(target) for source, target in request_field_aliases]
passthrough_columns = [col(source) for source in request_passthrough_fields]

# Column order is kept: error fields, stack trace, machine, then request fields
df_flattened = df.select(
    *error_columns,
    stack_trace_column,
    col("MachineName"),
    *request_columns,
    *passthrough_columns,
)

df_flattened.printSchema()
df_flattened.show(truncate=False)

Step 4: Save Data into Apache Hive

In [None]:
# Persist the flattened rows as a Hive table, replacing any previous contents
df_flattened.write.saveAsTable("raygun_error_traces", mode="overwrite")

# Read a few rows back through Spark SQL to confirm the table was written
verification_query = "SELECT * FROM raygun_error_traces LIMIT 10"
saved_preview = spark.sql(verification_query)
saved_preview.show(truncate=False)

Step 5: Query with Trino

In [None]:
# Connect to Trino through PyHive's DB-API module (`from pyhive import trino`).
# PyHive names the login keyword `username`; passing `user` is rejected when
# the cursor is created.
conn = trino.connect(
    host='localhost',
    port=8080,
    username='trino',
    catalog='hive',
    schema='default',
)

# Create a cursor object using the cursor() method
cursor = conn.cursor()

# Query the table written by the Hive step
cursor.execute("SELECT * FROM raygun_error_traces LIMIT 10")

# Fetch all result rows (at most 10 because of the LIMIT)
rows = cursor.fetchall()

# Display the data
for row in rows:
    print(row)