# Project Title: Proactive Container Scaling Prediction

## By: Ashley G

## 1. Introduction
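This notebook joins Kubernetes pod performance metrics with pod resource-allocation data and trains classifiers to flag pods whose event message reports a resource overload (`Killed` or `OOMKilled`). **TODO:** add the project date.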

## 2. Setup and Data Import

In [61]:
# Single seed reused for NumPy's global RNG and for every `random_state`
# argument in this notebook (train/test split and all models).
random_seed = 42


import numpy as np
# Seed NumPy's legacy global random number generator.
np.random.seed(random_seed)
import pandas as pd
from wooldridge import dataWoo
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import sklearn.datasets
from sklearn.inspection import DecisionBoundaryDisplay

### 2.1. Load and Inspect Datasets

In [62]:
# Read the two raw CSV files: performance metrics and resource allocation.
df_perf = pd.read_csv('kubernetes_performance_metrics_dataset.csv')
df_resource = pd.read_csv('kubernetes_resource_allocation_dataset.csv')

# Parse the timestamp strings (month/day/year hour:minute) into datetime values.
df_perf = df_perf.assign(
    timestamp=pd.to_datetime(df_perf['timestamp'], format='%m/%d/%Y %H:%M')
)

In [63]:
# Column names available in the performance-metrics table.
list(df_perf.columns)

['timestamp',
 'pod_name',
 'namespace',
 'cpu_allocation_efficiency',
 'memory_allocation_efficiency',
 'disk_io',
 'network_latency',
 'node_temperature',
 'node_cpu_usage',
 'node_memory_usage',
 'event_type',
 'event_message',
 'scaling_event',
 'pod_lifetime_seconds']

In [64]:
# Column names available in the resource-allocation table.
list(df_resource.columns)

['pod_name',
 'namespace',
 'cpu_request',
 'cpu_limit',
 'memory_request',
 'memory_limit',
 'cpu_usage',
 'memory_usage',
 'node_name',
 'pod_status',
 'restart_count',
 'uptime_seconds',
 'deployment_strategy',
 'scaling_policy',
 'network_bandwidth_usage']

### 2.2 Data Joining

In [None]:
# Select key configuration columns from the resource allocation data.
# 'namespace' is not in this list, so the merged frame keeps only the
# namespace column that df_perf already has.
resource_cols_to_keep = [
    'pod_name', 'memory_usage', 'cpu_limit', 'memory_limit', 'cpu_usage',
    'deployment_strategy', 'scaling_policy', 'cpu_request', 'memory_request'
]

# Left join: keep every performance row and attach the matching pod configuration.
df_raw_merged = pd.merge(
    df_perf,
    df_resource[resource_cols_to_keep],
    on=['pod_name'],
    how='left'
)

# A left join adds rows whenever a pod_name appears more than once in
# df_resource. Fail loudly here rather than silently duplicating observations
# that would then flow into the train/test split.
assert len(df_raw_merged) == len(df_perf), (
    f"Merge changed the row count from {len(df_perf)} to {len(df_raw_merged)}; "
    "pod_name is not a unique key in df_resource."
)

print(f"Total rows after merge: {len(df_raw_merged)}")
print(f"Columns after merge: {df_raw_merged.columns.tolist()}")


Total rows after merge: 15000
Columns after merge: ['timestamp', 'pod_name', 'namespace', 'cpu_allocation_efficiency', 'memory_allocation_efficiency', 'disk_io', 'network_latency', 'node_temperature', 'node_cpu_usage', 'node_memory_usage', 'event_type', 'event_message', 'scaling_event', 'pod_lifetime_seconds', 'memory_usage', 'cpu_limit', 'memory_limit', 'cpu_usage', 'deployment_strategy', 'scaling_policy', 'cpu_request', 'memory_request']

SUCCESS: The critical 'cpu_usage' column is present.


## 3. Data Cleaning and Preprocessing

### 3.1. Identify and Handle Missing Data

In [66]:
# Work on an independent deep copy so the merged frame stays untouched by the cleaning steps.
df_cleaned = df_raw_merged.copy(deep=True)

In [67]:
# Print each column's name, non-null count and dtype for the merged frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   timestamp                     15000 non-null  datetime64[ns]
 1   pod_name                      15000 non-null  object        
 2   namespace                     15000 non-null  object        
 3   cpu_allocation_efficiency     15000 non-null  float64       
 4   memory_allocation_efficiency  15000 non-null  float64       
 5   disk_io                       15000 non-null  float64       
 6   network_latency               15000 non-null  float64       
 7   node_temperature              15000 non-null  float64       
 8   node_cpu_usage                15000 non-null  float64       
 9   node_memory_usage             15000 non-null  float64       
 10  event_type                    15000 non-null  object        
 11  event_message               

In [68]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric and timestamp columns.
df_cleaned.describe()

Unnamed: 0,timestamp,cpu_allocation_efficiency,memory_allocation_efficiency,disk_io,network_latency,node_temperature,node_cpu_usage,node_memory_usage,pod_lifetime_seconds,memory_usage,cpu_limit,memory_limit,cpu_usage,cpu_request,memory_request
count,15000,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,2023-01-01 02:04:30,0.500843,0.501448,499.580661,99.81181,60.087796,49.39082,50.163728,100122.210267,4123.388369,2.250631,4234.132709,1.99421,1.052096,2110.281025
min,2023-01-01 00:00:00,0.000165,7.8e-05,0.090925,0.000564,20.000494,0.022562,0.009563,6.0,0.012189,0.500025,256.212641,0.000521,0.100021,128.119738
25%,2023-01-01 01:02:00,0.252717,0.249897,250.814566,49.599228,40.059461,24.44818,25.184379,50425.75,2099.208569,1.378411,2281.175714,0.989391,0.575561,1127.360048
50%,2023-01-01 02:04:30,0.501555,0.499775,497.133849,99.046093,60.000056,49.266834,50.032371,99422.5,4120.838924,2.251391,4212.613405,1.998167,1.055143,2097.534763
75%,2023-01-01 03:07:00,0.747617,0.75285,747.53538,150.549303,80.126939,73.929835,75.417713,149846.25,6189.240921,3.125031,6211.191332,2.988054,1.527721,3098.478331
max,2023-01-01 04:09:00,0.999957,0.99996,999.919897,199.998813,99.981225,99.996519,99.999938,199968.0,8191.874733,3.9996,8191.858882,3.999931,1.999917,4095.689412
std,,0.288213,0.28983,287.769516,57.929142,23.112204,28.774625,28.940546,57502.260644,2365.471215,1.010891,2282.287735,1.15511,0.548135,1144.993437


In [69]:
# Numeric columns that must be present for a row to be usable downstream.
critical_cols = ['cpu_limit', 'memory_limit', 'cpu_request', 'memory_request', 
    'cpu_usage', 'memory_usage', 
    'node_cpu_usage', 'node_memory_usage', 'disk_io', 'network_latency', 
    'node_temperature', 'pod_lifetime_seconds']

print("Missing values before cleaning (in critical numeric columns):")
print(df_cleaned[critical_cols].isnull().sum())

# Drop every row that is missing a value in ANY of the critical columns.
rows_before = len(df_cleaned)
df_cleaned.dropna(subset=critical_cols, inplace=True)

# The message names the actual criterion (all critical columns), not just
# cpu_limit/memory_limit, so the log matches what dropna() did.
rows_dropped = rows_before - len(df_cleaned)
print(f"\n{rows_dropped} rows dropped due to missing values in the critical numeric columns.")

Missing values before cleaning (in critical numeric columns):
cpu_limit               0
memory_limit            0
cpu_request             0
memory_request          0
cpu_usage               0
memory_usage            0
node_cpu_usage          0
node_memory_usage       0
disk_io                 0
network_latency         0
node_temperature        0
pod_lifetime_seconds    0
dtype: int64

0 rows dropped due to missing cpu_limit or memory_limit.


In [70]:
# Floor every numeric column at 0: any negative value is replaced with 0.
numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
df_cleaned.loc[:, numeric_cols] = df_cleaned[numeric_cols].clip(lower=0)

# Clipping changes values only, so the row count is reported as a sanity check.
print(f"Rows remaining after cleanup: {len(df_cleaned)}")

Rows remaining after cleanup: 15000


### 3.2 Identifying and Handling Duplicates 

In [71]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
df_cleaned.duplicated().sum()

np.int64(0)

In [72]:
# Node and I/O measurements that are floored at zero below.
physical_metrics = [
    'disk_io',
    'network_latency',
    'node_temperature',
    'node_cpu_usage',
    'node_memory_usage',
]

# Replace negative readings with 0. The earlier clip over all numeric columns
# already covers these five, so this is a defensive repeat and not a new transformation.
df_cleaned.loc[:, physical_metrics] = df_cleaned.loc[:, physical_metrics].clip(lower=0)

### 3.3 Binary Classification: Target Definition

In [None]:
# Event messages that count as a resource-overload failure.
critical_messages = ['Killed', 'OOMKilled']

# Binary target: 1 when the row's event message is one of the critical failures, else 0.
is_critical = df_cleaned['event_message'].isin(critical_messages)
target = is_critical.astype(int)

# Store the flag next to the features so rows and labels stay aligned by index.
df_cleaned['Resource_Overload_Flag'] = target

In [74]:
# Class balance of the binary target (1 = event message is 'Killed' or 'OOMKilled').
target.value_counts()

event_message
0    9117
1    5883
Name: count, dtype: int64

## 4. Exploratory Data Analysis (EDA)

## 5. Feature Engineering 

In [75]:
# Start a separate frame for modeling prep so feature engineering does not alter df_cleaned.
df_final = df_cleaned.copy(deep=True)


In [76]:
# Show the columns available before feature engineering.
print(list(df_final.columns))

['timestamp', 'pod_name', 'namespace', 'cpu_allocation_efficiency', 'memory_allocation_efficiency', 'disk_io', 'network_latency', 'node_temperature', 'node_cpu_usage', 'node_memory_usage', 'event_type', 'event_message', 'scaling_event', 'pod_lifetime_seconds', 'memory_usage', 'cpu_limit', 'memory_limit', 'cpu_usage', 'deployment_strategy', 'scaling_policy', 'cpu_request', 'memory_request', 'Resource_Overload_Flag']


### 5.1 Feature Engineering

In [77]:
# Utilization ratio = observed usage divided by the configured limit.
df_final['cpu_utilization_ratio'] = df_final['cpu_usage'].div(df_final['cpu_limit'])
df_final['memory_utilization_ratio'] = df_final['memory_usage'].div(df_final['memory_limit'])


### 5.2 Outlier Treatment (Clipping the key ratio)

In [78]:
# Cap the CPU utilization ratio at its 99th percentile so a few extreme values
# do not act as high-leverage points for the linear model.
# NOTE(review): the percentile is computed on the full dataset, before the
# train/test split, so the cap sees test rows. Confirm this is acceptable.
cpu_ratio_cap = df_final['cpu_utilization_ratio'].quantile(q=0.99)

df_final['cpu_utilization_ratio'] = df_final['cpu_utilization_ratio'].clip(upper=cpu_ratio_cap)

### 5.3 One-Hot Encoding

In [79]:
# Categorical columns to expand into 0/1 indicator columns.
categorical_features = ['namespace', 'deployment_strategy', 'scaling_policy']

# drop_first=True drops one indicator per category. df_final is reassigned,
# and the original categorical columns no longer exist after this cell.
df_final = pd.get_dummies(
    data=df_final,
    columns=categorical_features,
    drop_first=True,
    dtype=int,
)

### 5.4 Final Feature (X) and Target (y) Split

In [80]:
# Numeric predictors: configured limits/requests, engineered utilization
# ratios, allocation-efficiency scores, and node-level measurements.
features_to_keep = [
    'cpu_limit', 'memory_limit', 'cpu_request', 'memory_request',
    'cpu_utilization_ratio', 'memory_utilization_ratio',
    'cpu_allocation_efficiency',
    'memory_allocation_efficiency',
    'node_cpu_usage', 'node_memory_usage', 'disk_io', 'network_latency',
    'node_temperature', 'pod_lifetime_seconds'
]

# Indicator columns created by the one-hot encoding step, matched by name.
encoded_columns = df_final.filter(regex='_policy|_strategy|namespace_').columns.tolist()

# Feature matrix and target vector. `target` shares df_final's index.
X = df_final[features_to_keep + encoded_columns]
y = target

n_rows, n_features = X.shape
print(f"Final number of observations (rows): {n_rows}")
print(f"Final number of features (X columns): {n_features}")


Final number of observations (rows): 15000
Final number of features (X columns): 19


## 6. Model Building and Comparison

### 6.1 Train-Test Split

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_seed, stratify=y
)

print(f"Train set size: {len(X_train)} rows")
print(f"Test set size: {len(X_test)} rows")

Train set size: 10500 rows
Test set size: 4500 rows


### 6.2 Scaling

In [83]:
# Learn the per-feature mean and standard deviation from the training rows only,
# so no information from the test set reaches the scaler.
ss = StandardScaler().fit(X_train)

# Apply the training-set statistics to both partitions.
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

### 6.3 Model Training and Prediction

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
# HistGradientBoostingClassifier has been stable since scikit-learn 1.0; the old
# `sklearn.experimental.enable_hist_gradient_boosting` import is a no-op that
# only emits a warning, so it is no longer imported here.
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Candidate classifiers, keyed by a short name used in the results table.
# Only the logistic regression re-weights classes (class_weight='balanced').
models = {
    "lr": LogisticRegression(random_state=random_seed, solver='liblinear', class_weight='balanced'),
    "dtc": DecisionTreeClassifier(random_state=random_seed),
    "rfc": RandomForestClassifier(random_state=random_seed),
}

# Fit each model and keep its test-set predictions for the evaluation cell.
# results[name] -> {'Y_pred': hard labels, 'Y_proba': P(class 1), 'model': fitted estimator}
results = {}
for name, model in models.items():
    if name == 'lr':
        # Logistic regression is trained on the standardized features.
        fit_X, eval_X = X_train_scaled, X_test_scaled
    else:
        # Tree-based models split on thresholds, so they use the raw features.
        fit_X, eval_X = X_train, X_test

    model.fit(fit_X, y_train)
    Y_pred = model.predict(eval_X)
    # Probability of the positive class, needed for ROC-AUC.
    Y_proba = model.predict_proba(eval_X)[:, 1]

    results[name] = {'Y_pred': Y_pred, 'Y_proba': Y_proba, 'model': model}
    print(f"-> Finished training and predicting with: {name}")

-> Finished training and predicting with: lr
-> Finished training and predicting with: dtc
-> Finished training and predicting with: rfc


### 6.3 Model Evaluation and Comparison

In [None]:
# One row of metrics per fitted model.
comparison_metrics = []

for name, data in results.items():
    # Per-class and overall metrics as a nested dict; undefined ratios count as 0.
    report = classification_report(y_test, data['Y_pred'], output_dict=True, zero_division=0)

    # Metrics for the positive class; falls back to an empty dict (all 0.0) if the key is absent.
    metrics_1 = report.get('1', {})

    row = {
        'Model': name,
        'Accuracy': np.round(report['accuracy'], 4),
        'Precision (Class 1)': np.round(metrics_1.get('precision', 0.0), 4),
        'Recall (Class 1)': np.round(metrics_1.get('recall', 0.0), 4),
        'F1-Score (Class 1)': np.round(metrics_1.get('f1-score', 0.0), 4),
        # ROC-AUC is computed from the predicted probabilities, not the hard labels.
        'ROC-AUC': np.round(roc_auc_score(y_test, data['Y_proba']), 4),
    }
    comparison_metrics.append(row)

# Rank the models from highest to lowest ROC-AUC.
df_metrics = pd.DataFrame(comparison_metrics).sort_values(by='ROC-AUC', ascending=False)

print("\n--- Model Performance Comparison ---")
print(df_metrics.to_markdown(index=False))

# Feature importance for the top-ranked model, reported only when that model
# is one of the tree-based estimators (which expose feature_importances_).
best_model_name = df_metrics.iloc[0]['Model']

if best_model_name in ("rfc", "dtc"):
    best_model = results[best_model_name]['model']
    all_importances = pd.Series(best_model.feature_importances_, index=X.columns)
    feature_importances = all_importances.sort_values(ascending=False).head(10)
    print(f"\nTop 10 Feature Importances for {best_model_name}:")
    print(feature_importances.to_markdown())


--- Model Performance Comparison ---
| Model   |   Accuracy |   Precision (Class 1) |   Recall (Class 1) |   F1-Score (Class 1) |   ROC-AUC |
|:--------|-----------:|----------------------:|-------------------:|---------------------:|----------:|
| rfc     |     0.5949 |                0.3942 |             0.0612 |               0.1059 |    0.5039 |
| dtc     |     0.524  |                0.3965 |             0.4091 |               0.4027 |    0.5036 |
| lr      |     0.5011 |                0.3919 |             0.4929 |               0.4366 |    0.4997 |

Top 10 Feature Importances for rfc:
|                              |         0 |
|:-----------------------------|----------:|
| memory_allocation_efficiency | 0.0697751 |
| network_latency              | 0.0697731 |
| cpu_allocation_efficiency    | 0.0695826 |
| node_memory_usage            | 0.0688667 |
| disk_io                      | 0.0688294 |
| node_cpu_usage               | 0.0687798 |
| pod_lifetime_seconds         | 0.06856

### 6.4 Model Evaluation and Selection
- **Comparison table:** Present a table comparing all models using Accuracy, Precision, Recall, F1-score, and ROC-AUC.
- **Selection:** State which model performs best for your goal (likely optimizing for high Recall to avoid missing a scaling event).

## 7. Results and Recommendations

## 8. Conclusion and Future Work