# Azure VM Reliability Simulator

## 1. Create synthetic data with LLM Scenarios
## 2. Use Gradient Boosting to create a model with Time Series Split
## 3. Survival Analysis with Time_Varying Covariates
## 4. Streamlit Dashboard


### 1. Synthetic Data

In [29]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Initialize GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium') 
model = GPT2LMHeadModel.from_pretrained('gpt2-medium') 

# Generate failure scenario using GPT-2
def generate_llm_failure_scenario():
    """Generate failure scenario using GPT-2"""
    prompt = "Azure VM failure scenario involving:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=100, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Convert LLM description to telemetry signature
def scenario_to_pattern(scenario: str):
    """Convert LLM description to telemetry signature"""
    pattern = {
        'cpu_util': np.random.normal(40, 5),
        'mem_util': np.random.normal(50, 5),
        'disk_io': np.random.exponential(80),
        'net_latency': np.random.gamma(2, 10)
    }
    
    # Pattern adjustments based on keywords
    if 'CPU' in scenario or 'compute' in scenario:
        pattern['cpu_util'] = min(100, pattern['cpu_util'] * 1.8)
    if 'memory' in scenario or 'RAM' in scenario:
        pattern['mem_util'] = min(100, pattern['mem_util'] * 1.7)
    if 'disk' in scenario or 'storage' in scenario:
        pattern['disk_io'] = pattern['disk_io'] * 3.5
    if 'network' in scenario or 'latency' in scenario:
        pattern['net_latency'] = pattern['net_latency'] * 4.0
        
    return pattern

# Generate synthetic data
def generate_azure_vm_data(num_vms=1000, days=90):
    np.random.seed(42)
    vm_ids = [f"vm_{i:04d}" for i in range(num_vms)]
    dates = [datetime.now() - timedelta(days=x) for x in range(days)]
    
    # Pre-generate failure scenarios
    failure_scenarios = [generate_llm_failure_scenario() for _ in range(20)]
    failure_history = {vm: [] for vm in vm_ids}  # Track failure clusters

    data = []
    for vm_id in vm_ids:
        sys_failures = 0  # Failure counter per VM
        for i, date in enumerate(dates):
            # Apply failure history decay (temporal dependency)
            cluster_effect = 0.8 ** sys_failures
            
            if np.random.random() < (0.0008 * cluster_effect):  # Failure event
                scenario = np.random.choice(failure_scenarios)
                pattern = scenario_to_pattern(scenario)
                sys_failures += 1
                failure_history[vm_id].append((date, scenario))
            else:
                # Baseline with failure history influence
                stress_factor = 1 + (0.3 * cluster_effect)
                pattern = {
                    'cpu_util': min(100, np.random.normal(40, 10) * stress_factor),
                    'mem_util': min(100, np.random.normal(50, 10) * stress_factor),
                    'disk_io': np.random.exponential(100) * stress_factor,
                    'net_latency': np.random.gamma(2, 15) * stress_factor
                }
                
            data.append({
                "vm_id": vm_id,
                "timestamp": date,
                "sys_failures": sys_failures,
                **pattern
            })
            
            # Decay failure counter weekly
            if i % 7 == 0:
                sys_failures = max(0, sys_failures - 1)
                
    return pd.DataFrame(data), failure_history

# Generate data
df, failure_history = generate_azure_vm_data()
df.to_csv("azure_vm_telemetry_enhanced.csv", index=False)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

### 2. Gradient Boosting

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
import shap
import matplotlib.pyplot as plt

# Feature engineering
df['hour'] = pd.to_datetime(df['timestamp']).dt.hour
df['is_peak'] = df['hour'].between(8, 18).astype(int)
df['cpu_mem_ratio'] = df['cpu_util'] / (df['mem_util'] + 1e-5)
df['failure'] = ((df['cpu_util'] > 90) & (df['disk_io'] > 250)).astype(int)

# Time-based split
X = df[['cpu_util', 'mem_util', 'disk_io', 'net_latency', 'is_peak', 'cpu_mem_ratio', 'sys_failures']]
y = df['failure']
tss = TimeSeriesSplit(n_splits=3)

for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = XGBClassifier(n_estimators=200, max_depth=7, learning_rate=0.1)
    model.fit(X_train, y_train)
    
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"Fold Accuracy: {acc:.4f}")

# SHAP explainer
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Visualize feature impacts 
# Simple beeswarm plot
plt.figure(figsize=(10, 6))
shap.plots.beeswarm(shap_values, show=False)
plt.title("Feature Impact on Failure Probability")
plt.tight_layout()
plt.savefig('feature_impact.png')


Fold Accuracy: 0.9998
Fold Accuracy: 0.9996
Fold Accuracy: 0.9997


In [32]:
# Waterfall plot for specific VM (e.g., first in test set)
# Reset index to ensure we have access to the original data
X_test = X_test.reset_index(drop=False)

# Get VM ID for the first sample
vm_id = df.loc[X_test.index[0], 'vm_id']

# Plot
plt.figure(figsize=(10, 6))
shap.plots.waterfall(shap_values[1], show=False)
plt.title(f"Prediction Breakdown for VM: {vm_id} ")
plt.tight_layout()
plt.savefig('shap_waterfall.png')

In [33]:
print(f"SHAP values type: {type(shap_values)}")
print(f"SHAP values shape: {shap_values.shape}")
print(f"SHAP values content: {shap_values}")

SHAP values type: <class 'shap._explanation.Explanation'>
SHAP values shape: (22500, 7)
SHAP values content: .values =
array([[-0.9474661 , -0.827411  , -0.01655163, ...,  0.        ,
        -1.6715472 ,  0.        ],
       [-0.91999   , -0.8168504 , -0.93316865, ...,  0.        ,
        -1.0839238 ,  0.        ],
       [-0.8896795 , -0.7002088 , -0.9466994 , ...,  0.        ,
        -1.5178065 ,  0.        ],
       ...,
       [-0.92956054, -0.26610658, -0.8776772 , ...,  0.        ,
        -1.7956102 ,  0.        ],
       [-0.9292432 , -0.2019916 , -0.91630465, ...,  0.        ,
        -1.824682  ,  0.        ],
       [-0.8832913 , -0.77498275, -0.94274163, ...,  0.        ,
        -1.4533677 ,  0.        ]], dtype=float32)

.base_values =
array([-9.267478, -9.267478, -9.267478, ..., -9.267478, -9.267478,
       -9.267478], dtype=float32)

.data =
array([[ 41.17106172,  59.39595677, 250.22362462, ...,   1.        ,
          0.69316258,   0.        ],
       [ 63.07190799,

### 3. Survival Analysis

In [34]:
from lifelines import CoxTimeVaryingFitter

# Prepare survival data
df = df.sort_values(['vm_id', 'timestamp'])
df['start'] = (df['timestamp'] - df['timestamp'].min()).dt.days
df['end'] = df['start'] + 1  # Daily intervals
df['event'] = df['failure']

# CTV model
ctv = CoxTimeVaryingFitter(penalizer=0.1)
ctv.fit(df[['vm_id', 'start', 'end', 'event', 'cpu_util', 'disk_io', 'sys_failures']],
        id_col='vm_id',
        event_col='event',
        start_col='start',
        stop_col='end')

print(ctv.summary)
ctv.plot()
plt.tight_layout()
plt.savefig('survival_plot.png')

                  coef  exp(coef)  se(coef)  coef lower 95%  coef upper 95%  \
covariate                                                                     
cpu_util      0.000629   1.000630  0.000807       -0.000951        0.002210   
disk_io       0.000036   1.000036  0.000081       -0.000123        0.000196   
sys_failures -0.002308   0.997695  0.186746       -0.368324        0.363708   

              exp(coef) lower 95%  exp(coef) upper 95%  cmp to         z  \
covariate                                                                  
cpu_util                 0.999049             1.002213     0.0  0.780311   
disk_io                  0.999877             1.000196     0.0  0.446204   
sys_failures             0.691893             1.438654     0.0 -0.012360   

                     p  -log2(p)  
covariate                         
cpu_util      0.435208  1.200223  
disk_io       0.655450  0.609443  
sys_failures  0.990139  0.014298  



Column sys_failures have very low variance when conditioned on death event present or not. This may harm convergence. This could be a form of 'complete separation'. For example, try the following code:

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'sys_failures'].var())
>>> print(df.loc[~events, 'sys_failures'].var())

A very low variance means that the column sys_failures completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.




In [None]:
# Save artifacts for dashboard
import joblib
import shap
import pandas as pd
import numpy as np
from lifelines import CoxTimeVaryingFitter
import pickle

# 1. Save Model (XGBoost)
joblib.dump(model, 'model.pkl')

# 2. Save Survival Model
with open('survival_model.pkl', 'wb') as file:
    pickle.dump(ctv, file)

# 3. Save SHAP Explainer
# - For Tree-based models
joblib.dump(explainer, 'shap_explainer.joblib')

# - For non-tree models (alternative)
with open('shap_explainer.pkl', 'wb') as f:
    pickle.dump(explainer, f)

# 4. Save Dataset
df.to_csv('vm_telemetry.csv')

# 5. Save Failure History
np.save('failure_history.npy', failure_history, allow_pickle=True)
# Alternative: joblib.dump(failure_history, 'failure_history.joblib')

# 6. Save Feature Names (Critical!)
with open('feature_names.txt', 'w') as f:
    f.write(','.join(X_train.columns.tolist()))