In [None]:
!pip install trino



In [None]:
import trino
from sklearn.ensemble import IsolationForest
import pandas as pd
from datetime import datetime

# Define connection parameters for Trino
trino_host = '18.188.112.55'
trino_port = 8080

# Trino connection function
def connect_trino():
    return trino.dbapi.connect(
        host=trino_host,
        port=trino_port,
        user="admin"
    )

# Query Trino for logs and metrics data
def query_trino_for_anomalies():
    conn = connect_trino()
    start_date = '2024-11-01'
    end_date = '2024-11-22'

    query_logs = f"""
    SELECT * FROM iceberg.apm_logs.logs
    WHERE createdTime >= DATE '{start_date}' AND createdTime < DATE '{end_date}'
    """
    query_metrics = f"""
    SELECT * FROM iceberg.apm_metrics.metrics
    WHERE createdTime >= DATE '{start_date}' AND createdTime < DATE '{end_date}'
    """

    logs_df = pd.read_sql(query_logs, conn)
    metrics_df = pd.read_sql(query_metrics, conn)

    logs_df['createdtime'] = pd.to_datetime(logs_df['createdtime'])
    metrics_df['createdtime'] = pd.to_datetime(metrics_df['createdtime'])
    return logs_df, metrics_df

# Data preparation and anomaly detection
logs_df, metrics_df = query_trino_for_anomalies()


# Map severity to numeric levels
log_levels = {'INFO': 0, 'WARNING': 1, 'ERROR': 2}
logs_df['severity_numeric'] = logs_df['severitytext'].map(log_levels)
logs_df['error_flag'] = logs_df['severitytext'].apply(lambda x: 1 if x == 'ERROR' else 0)
logs_df.set_index('createdtime', inplace=True)

# Aggregate logs and metrics data
logs_features = logs_df.resample('60Min').agg({'error_flag': 'sum'}).rename(columns={'error_flag': 'error_count'}).fillna(0)
metrics_df.set_index('createdtime', inplace=True)
metrics_features = metrics_df[['cpuusage', 'memoryusage']].resample('120Min').mean().fillna(0)




print(logs_df.head())
print(metrics_df.head())


  logs_df = pd.read_sql(query_logs, conn)
  metrics_df = pd.read_sql(query_metrics, conn)


                            servicename language  \
createdtime                                        
2024-11-07 11:24:26.179823  order-srv-1     java   
2024-11-07 11:30:17.035322  order-srv-1     java   
2024-11-07 11:34:20.490535  order-srv-1     java   
2024-11-07 11:34:20.492752  order-srv-1     java   
2024-11-07 11:34:20.500229  order-srv-1     java   

                                                     scopename  \
createdtime                                                      
2024-11-07 11:24:26.179823  com.zaga.OrderProj.OrderController   
2024-11-07 11:30:17.035322  com.zaga.OrderProj.OrderController   
2024-11-07 11:34:20.490535  com.zaga.OrderProj.OrderController   
2024-11-07 11:34:20.492752  com.zaga.OrderProj.OrderController   
2024-11-07 11:34:20.500229  com.zaga.OrderProj.OrderController   

                                                     traceid  \
createdtime                                                    
2024-11-07 11:24:26.179823  2f74cb14f60a7a02

In [None]:
print("Columns in logs_df:", logs_df.columns)

Columns in logs_df: Index(['servicename', 'language', 'scopename', 'traceid', 'spanid',
       'timeunixnano', 'observedtimeunixnano', 'severitytext',
       'severitynumber', 'body', 'attributes', 'severity_numeric',
       'error_flag'],
      dtype='object')


In [None]:
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Combine log data for embedding
logs_df['embedding_input'] = (
    logs_df['servicename'].fillna('') + " " +
    logs_df['severitytext'].fillna('') + " " +
    logs_df['body'].fillna('') + " " +
    logs_df['attributes'].fillna('')
)


# Combine metrics data for embedding
metrics_df['embedding_input'] = (
    metrics_df['servicename'].fillna('') + " CPU usage: " +
    metrics_df['cpuusage'].astype(str) + " Memory usage: " +
    metrics_df['memoryusage'].astype(str)
)

# Generate embeddings
log_embeddings = embedding_model.encode(logs_df['embedding_input'].tolist(), convert_to_tensor=True)
metric_embeddings = embedding_model.encode(metrics_df['embedding_input'].tolist(), convert_to_tensor=True)

# Save embeddings for future use (optional)
import numpy as np
np.save("log_embeddings.npy", log_embeddings)
np.save("metric_embeddings.npy", metric_embeddings)


In [None]:
import torch
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# ------------------ Anomaly Detection for Metrics Data ------------------

# Extract relevant features from metrics
metrics_features = metrics_df[['cpuusage', 'memoryusage']]

# Train Isolation Forest for Metrics Data
metrics_isolation_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
metrics_anomalies = metrics_isolation_forest.fit_predict(metrics_features)

# Mark anomalies in the metrics dataframe
metrics_df['anomaly'] = (metrics_anomalies == -1).astype(int)

# ------------------ Anomaly Detection for Logs Data ------------------

# Prepare log data features for anomaly detection
log_features = logs_df[['severity_numeric', 'error_flag']].copy()

# Encode categorical variables (servicename and language) if needed
log_features['servicename_encoded'] = LabelEncoder().fit_transform(logs_df['servicename'].fillna(''))
log_features['language_encoded'] = LabelEncoder().fit_transform(logs_df['language'].fillna(''))

# Convert log_embeddings (tensor) to a NumPy array and then to a DataFrame
log_embeddings_np = log_embeddings.detach().numpy()
log_embeddings_df = pd.DataFrame(log_embeddings_np)

# Ensure unique index to avoid reindexing issues during concatenation
log_features.reset_index(drop=True, inplace=True)
log_embeddings_df.reset_index(drop=True, inplace=True)

# Combine log features and embeddings
log_features_combined = pd.concat(
    [log_features[['severity_numeric', 'error_flag', 'servicename_encoded', 'language_encoded']],
     log_embeddings_df], axis=1
)
# Ensure all column names are strings
log_features_combined.columns = log_features_combined.columns.astype(str)


# ------------------ Data Preprocessing and Imputation ------------------
# Use SimpleImputer to fill NaN values with the mean or median (or another strategy)
imputer = SimpleImputer(strategy='mean')
log_features_combined_imputed = imputer.fit_transform(log_features_combined)

# Verify if there are any NaN values remaining
print("After imputation, any NaNs left in the log features:", pd.isna(log_features_combined_imputed).sum())

# ------------------ Anomaly Detection with IsolationForest ------------------

# Train Isolation Forest for Logs Data
log_isolation_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)

# Use a small subset for testing
log_features_subset = log_features_combined.sample(100, random_state=42)

# Ensure there are no NaN values left in the subset for anomaly detection
log_features_subset_imputed = imputer.transform(log_features_subset)

# Fit the IsolationForest model on the small subset
log_anomalies = log_isolation_forest.fit_predict(log_features_combined_imputed)

# Mark anomalies in the logs dataframe
logs_df['anomaly'] = (log_anomalies == -1).astype(int)

# ------------------ Output Results ------------------

# Print number of anomalies detected in each dataset
print(f"Anomalies in logs data: {logs_df['anomaly'].sum()}")
print(f"Anomalies in metrics data: {metrics_df['anomaly'].sum()}")




After imputation, any NaNs left in the log features: 0
Anomalies in logs data: 350
Anomalies in metrics data: 164


In [None]:
# ------------------ Print Anomalies Detected in Logs Data ------------------
print("Anomalies detected in logs data:")
logs_anomalies = logs_df[logs_df['anomaly'] == 1]
print(logs_anomalies)

# Alternatively, print each anomaly in logs data row by row
for index, row in logs_anomalies.iterrows():
    print(f"Log Anomaly {index}: {row.to_dict()}")

# ------------------ Print Anomalies Detected in Metrics Data ------------------
print("\nAnomalies detected in metrics data:")
metrics_anomalies = metrics_df[metrics_df['anomaly'] == 1]
print(metrics_anomalies)

# Alternatively, print each anomaly in metrics data row by row
for index, row in metrics_anomalies.iterrows():
    print(f"Metrics Anomaly {index}: {row.to_dict()}")


Anomalies detected in logs data:
                             servicename language  \
createdtime                                         
2024-11-07 04:00:26.549925  vendor-srv-1     java   
2024-11-07 04:00:26.552690  vendor-srv-1     java   
2024-11-07 04:03:03.728776  vendor-srv-1     java   
2024-11-07 04:03:03.731733  vendor-srv-1     java   
2024-11-05 12:21:14.468000  vendor-srv-2     java   
...                                  ...      ...   
2024-11-06 10:00:01.727915  vendor-srv-1     java   
2024-11-05 12:16:57.736000   order-srv-2     java   
2024-11-07 04:00:30.552328  vendor-srv-1     java   
2024-11-07 04:00:30.556612  vendor-srv-1     java   
2024-11-05 12:21:09.493000  vendor-srv-2     java   

                                                                    scopename  \
createdtime                                                                     
2024-11-07 04:00:26.549925               com.zaga.VendorProj.VendorController   
2024-11-07 04:00:26.552690        

In [None]:

print(logs_df.shape)

(7042, 15)


In [None]:
!pip install faiss-cpu




In [59]:
# Define the service name for which you want to see anomalies
service_name_to_filter = "order-srv-2"

# ------------------ Filter Logs Anomalies ------------------

# Ensure 'servicename' and 'anomaly' columns exist in logs_df
if 'servicename' in logs_df.columns and 'anomaly' in logs_df.columns:
    service_logs_anomalies = logs_df[(logs_df['servicename'] == service_name_to_filter) & (logs_df['anomaly'] == 1)]
    print(f"Anomalies in logs for service '{service_name_to_filter}':")
    print(service_logs_anomalies)
else:
    print(f"Error: 'servicename' or 'anomaly' column not found in logs_df.")

# ------------------ Filter Metrics Anomalies ------------------

# Ensure 'servicename' and 'anomaly' columns exist in metrics_df
if 'servicename' in metrics_df.columns and 'anomaly' in metrics_df.columns:
    service_metrics_anomalies = metrics_df[(metrics_df['servicename'] == service_name_to_filter) & (metrics_df['anomaly'] == 1)]
    print(f"\nAnomalies in metrics for service '{service_name_to_filter}':")
    print(service_metrics_anomalies)
else:
    print(f"Error: 'servicename' or 'anomaly' column not found in metrics_df.")


Anomalies in logs for service 'order-srv-2':
                            servicename language  \
createdtime                                        
2024-11-05 12:20:41.294000  order-srv-2     java   
2024-11-06 09:59:48.154230  order-srv-2     java   
2024-11-07 08:33:59.484745  order-srv-2     java   
2024-11-05 12:17:06.408000  order-srv-2     java   
2024-11-05 12:20:35.410000  order-srv-2     java   
2024-11-05 12:20:37.361000  order-srv-2     java   
2024-11-05 12:20:32.508000  order-srv-2     java   
2024-11-05 12:20:38.369000  order-srv-2     java   
2024-11-05 12:17:04.310000  order-srv-2     java   
2024-11-05 12:20:39.207000  order-srv-2     java   
2024-11-05 12:17:02.330000  order-srv-2     java   
2024-11-07 11:34:09.224870  order-srv-2     java   
2024-11-05 12:16:59.371000  order-srv-2     java   
2024-11-05 12:16:58.437000  order-srv-2     java   
2024-11-07 08:34:00.487454  order-srv-2     java   
2024-11-05 12:17:01.396000  order-srv-2     java   
2024-11-05 12:20:34

In [56]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

from IPython.display import display, HTML

# Function to display pop-up message
def show_popup(message):
    display(HTML(f"<div style='background-color: #f44336; color: white; padding: 10px; border-radius: 5px;'>"
                 f"<strong>Anomaly Detected!</strong><br>{message}</div>"))

# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Historical anomaly storage (for demonstration, using a set for quick lookup)
# In a real-world scenario, this could be loaded from a database or a file.
historical_anomaly_set = set()  # Store unique anomaly descriptions

# Combine both logs and metrics anomalies into a single list of text descriptions
new_anomalies = []  #

# Add anomalies from metrics
for _, row in metrics_df[metrics_df['anomaly'] == 1].iterrows():
    anomaly_message = f"Metrics anomaly: CPU usage {row['cpuusage']}%, Memory usage {row['memoryusage']}%"

    # Check if this anomaly is new
    if anomaly_message not in historical_anomaly_set:
        new_anomalies.append(anomaly_message)
        show_popup(anomaly_message)
        historical_anomaly_set.add(anomaly_message)

# Add anomalies from logs
for _, row in logs_df[logs_df['anomaly'] == 1].iterrows():
    anomaly_message = f"Log anomaly: Severity {row['severitytext']} (Numeric: {row['severity_numeric']}) with message: {row['body']}, Error flag: {row['error_flag']}"

    # Check if this anomaly is new
    if anomaly_message not in historical_anomaly_set:
        new_anomalies.append(anomaly_message)
        show_popup(anomaly_message)
        historical_anomaly_set.add(anomaly_message)

# Combine all anomalies (historical and new) for embedding
all_anomalies = list(historical_anomaly_set)

# Create embeddings for all anomalies
all_embeddings = embedding_model.encode(all_anomalies)

# Build a FAISS index to store these embeddings for fast retrieval
index = faiss.IndexFlatL2(all_embeddings.shape[1])
index.add(np.array(all_embeddings))

# Now the historical anomalies can be retrieved efficiently from this index


In [57]:
# Retrieve Relevant Anomalies During New Detection
# Function to get relevant anomalies from the historical dataset based on the new anomaly
def get_relevant_anomalies(new_anomaly_description):
    # Encode the new anomaly description into an embedding
    new_anomaly_embedding = embedding_model.encode([new_anomaly_description])

    # Use the FAISS index to find the most similar historical anomaly
    distances, indices = index.search(np.array(new_anomaly_embedding), k=3)

    # Get the actual relevant anomalies based on indices
    relevant_anomalies = [historical_anomalies[idx] for idx in indices[0]]
    return relevant_anomalies

# Example: New anomaly detection (replace with actual detection logic)
new_anomaly_description = "New anomaly detected: CPU usage 95% and Memory usage 80%"

# Get the most similar historical anomalies
relevant_anomalies = get_relevant_anomalies(new_anomaly_description)
print("Relevant Anomalies:", relevant_anomalies)


Relevant Anomalies: ['Metrics anomaly: CPU usage 0.00025002083506958916%, Memory usage 210926854144.0%', 'Metrics anomaly: CPU usage 0.00029296057587678917%, Memory usage 328733581312.0%', 'Log anomaly: Severity INFO (Numeric: 0.0) with message: Init duration for springdoc-openapi is: 138 ms, Error flag: 0']


In [58]:
# Generate Solutions with Hugging Face GPT-2
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Function to generate solution using GPT-2
def generate_solution(anomaly_prompt):
    # Prepare the input prompt for GPT-2
    input_ids = tokenizer.encode(anomaly_prompt, return_tensors="pt")

    # Generate response using GPT-2
    outputs = model.generate(input_ids, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2)

    # Decode and return the solution
    solution = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return solution

# Example: Generate solution for a new anomaly
anomaly_prompt = f"Anomaly detected: {new_anomaly_description}. Based on historical data, provide a possible solution to solve"
solution = generate_solution(anomaly_prompt)

print("Solution/Explanation from GPT-2:", solution)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Solution/Explanation from GPT-2: Anomaly detected: New anomaly detected: CPU usage 95% and Memory usage 80%. Based on historical data, provide a possible solution to solve the problem.

The problem is that the CPU is not able to handle the new data. The CPU can only handle a small amount of data at a time. This is why the system is unable to process the data properly. Therefore, the solution is to use a new CPU. In this case, we will use the following solution:
.NET Framework 4.5.1
, .NET Core 2.0.3
 and .Net Framework 3.6.2
In this example, I will be using the .net framework. I have used the latest version of the framework,
