In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import metrics
import numpy as np

# Load & preprocess data. Set the file paths below to the locations of your Puffer CSV files.

In [None]:
# Locations of the two Puffer CSV exports read by the loading cell below.
# Relative paths are resolved against the notebook's working directory.
video_sent_file = "data/puffer/puffer_sent.csv"
video_acked_file = "data/puffer/puffer_acked.csv"

In [None]:
# Read both CSV files into DataFrames in one pass (sent first, then acked).
video_sent, video_acked = (
    pd.read_csv(csv_path) for csv_path in (video_sent_file, video_acked_file)
)

In [None]:
# Join the sent and acked records on the columns that identify a chunk.
# Columns present in both inputs but not used as keys get a
# '_sent' / '_acked' suffix.
join_keys = ['session_id', 'index', 'video_ts']
merged_data = video_sent.merge(
    video_acked,
    on=join_keys,
    suffixes=('_sent', '_acked'),
)

# Debug aid: list the merged column names. The timestamp columns do not
# carry the names time_sent / time_acked at this point.
print("Columns in merged_data:", merged_data.columns)

In [None]:
# Row count of the merged frame (sanity check on the join).
len(merged_data)

In [None]:
# Give the suffixed timestamp columns short, readable names.
time_column_names = {
    'time (ns GMT)_sent': 'time_sent',
    'time (ns GMT)_acked': 'time_acked',
}
merged_data = merged_data.rename(columns=time_column_names)

### Feature engineering

In [None]:
# Parse both timestamp columns as datetimes; the raw values are interpreted
# as nanosecond counts (unit='ns').
for ts_column in ('time_sent', 'time_acked'):
    merged_data[ts_column] = pd.to_datetime(merged_data[ts_column], unit='ns')

In [None]:
# Download duration in seconds: ack time minus send time.
merged_data['download_duration'] = (merged_data['time_acked'] - merged_data['time_sent']).dt.total_seconds()
# Throughput in Mbps: (size * 8) / duration / 1e6.
merged_data['throughput'] = ((merged_data['size'] * 8) / merged_data['download_duration']) / 1000000
# Bytes per transmission time: size divided by rtt.
merged_data['bytes_per_transmission_time'] = merged_data['size'] / merged_data['rtt']

# A zero duration or zero rtt makes the divisions above produce +/-inf.
# StandardScaler refuses infinite input and dropna() does not treat inf as
# missing, so map those values to NaN and let the missing-value filter
# discard the affected rows.
ratio_columns = ['throughput', 'bytes_per_transmission_time']
merged_data[ratio_columns] = merged_data[ratio_columns].replace([np.inf, -np.inf], np.nan)

In [None]:
# Show the current column names of the merged frame.
merged_data.columns

In [None]:
# Preview the first rows of the merged frame.
merged_data.head()

In [None]:
# Mean and standard deviation of the 'size' column.
size_column = merged_data['size']
print(size_column.mean())
print(size_column.std())

In [None]:
# Model inputs and the regression target.
# NOTE(review): 'throughput' is computed from 'download_duration' (the
# target) and 'size', so together they let the model reconstruct the label.
# Confirm this leakage is intended before trusting the reported errors.
features = [
    "size",
    "rtt",
    "throughput",
    "in_flight",
    "bytes_per_transmission_time",
]
target = "download_duration"

In [None]:
# Standardize the feature columns and keep the fitted scaler so the same
# transformation can be applied to other datasets later.
#
# The column selection reuses `features` instead of repeating the list, so
# the scaled columns cannot drift out of sync with the model inputs.
#
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test-set statistics influence the scaling.
# NOTE(review): this cell overwrites the raw columns; running it a second
# time refits the scaler on already-standardized values.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
merged_data[features] = scaler.fit_transform(merged_data[features])

In [None]:
# Keep only rows that have a value for every model input and for the target.
required_columns = [*features, target]
merged_data = merged_data.dropna(subset=required_columns)

In [None]:
# Preview the first rows of the frame at this stage.
merged_data.head()

### Model training & evaluation

In [None]:
# Training on the full dataset was slow, so work on a reproducible 20% sample.
# NOTE: this rebinds `merged_data`; re-running the cell samples the sample
# again and shrinks the data further.
subset_fraction = 0.2
merged_data = merged_data.sample(frac=subset_fraction, random_state=42)

# Separate model inputs from the target.
X, y = merged_data[features], merged_data[target]

# Hold out 20% of the sampled rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a 50-tree random forest; n_jobs=-1 uses all CPU cores.
rf_model = RandomForestRegressor(
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Predictions on both splits, used for the error metrics below.
y_pred_train, y_pred_test = (
    rf_model.predict(split_inputs) for split_inputs in (X_train, X_test)
)

In [None]:
def _mae_and_rmse(actual, predicted):
    """Return (MAE, RMSE) for a pair of actual / predicted value sequences."""
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return mae, rmse


# Error metrics on the training split and on the held-out split.
train_mae, train_rmse = _mae_and_rmse(y_train, y_pred_train)
test_mae, test_rmse = _mae_and_rmse(y_test, y_pred_test)

print(f"Training MAE: {train_mae}, RMSE: {train_rmse}")
print(f"Testing MAE: {test_mae}, RMSE: {test_rmse}")

In [None]:
# Rank the model inputs by the forest's importance scores, highest first.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importances:")
print(feature_importances)

# 5-fold cross-validation over the whole sampled set. The scorer returns
# negated MAE, so the sign is flipped when reporting.
cv_scores = cross_val_score(
    rf_model,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
print(f"Cross-Validation MAE: {-np.mean(cv_scores)}")

# Debug aid: shape of the frame the model was trained and tested on.
print(f"Dataset size (used for training and testing): {merged_data.shape}")

In [None]:
# Load the self-collected dataset. The path below is a placeholder; point it
# at the real CSV file before running.
self_collected_data = pd.read_csv("/path/to/your_twitch_data.csv")

In [None]:
# Rename the self-collected columns to the feature names the model was
# trained with.
column_aliases = {'dropped_packets': 'in_flight', 'total_tcp_len': 'size'}
self_collected_data = self_collected_data.rename(columns=column_aliases)
self_collected_data.head()

In [None]:
# Model inputs and target taken from the self-collected data.
# `test_features` is modified by a later cell, so take an explicit copy:
# assigning into a bare column selection triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is written to.
test_features = self_collected_data[features].copy()
test_target = self_collected_data[target]

In [None]:
# Per-column counts of problematic values in the self-collected features.
print(test_features.isna().sum())  # missing values
print(test_features.eq(float('inf')).sum())  # positive infinity
print(test_features.eq(float('-inf')).sum())  # negative infinity

# Per-column counts of values with very large magnitude, using an arbitrary
# cut-off of +/-1e10 (adjust as needed).
magnitude_threshold = 1e10
print(test_features.gt(magnitude_threshold).sum())  # above +1e10
print(test_features.lt(-magnitude_threshold).sum())  # below -1e10

# Show the offending rows for 'bytes_per_transmission_time' specifically.
bytes_per_tx_time = test_features['bytes_per_transmission_time']

large_values = test_features[bytes_per_tx_time > magnitude_threshold]
print(large_values)

small_values = test_features[bytes_per_tx_time < -magnitude_threshold]
print(small_values)


In [None]:
# Replace infinite 'bytes_per_transmission_time' values with the column
# median. Both +inf and -inf are handled (the diagnostic cell checks for
# both), and the median is computed with the infinite entries excluded.
# Existing NaN values are left untouched.
non_finite = [np.inf, -np.inf]
bytes_per_tx_time = test_features['bytes_per_transmission_time']
median_value = bytes_per_tx_time.replace(non_finite, np.nan).median()

# assign() returns a new frame, so nothing is written into a slice of
# `self_collected_data` (avoids SettingWithCopyWarning).
test_features = test_features.assign(
    bytes_per_transmission_time=bytes_per_tx_time.replace(non_finite, median_value)
)

In [None]:
# The forest was trained on columns standardized by `scaler`, so the
# self-collected features must go through the same fitted transformation
# before prediction; raw values would be compared against split thresholds
# learned in the standardized space.
# Wrapping the result in a DataFrame keeps the column names the model was
# fitted with.
test_features_scaled = pd.DataFrame(
    scaler.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)
test_pred_new = rf_model.predict(test_features_scaled)

In [None]:
# Error of the model's predictions on the self-collected data.
my_data_mae = mean_absolute_error(test_target, test_pred_new)
my_data_mse = mean_squared_error(test_target, test_pred_new)
my_data_rmse = np.sqrt(my_data_mse)

print(f"My Data MAE: {my_data_mae}, RMSE: {my_data_rmse}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from trustee import RegressionTrustee
from sklearn.tree import plot_tree

# Fit a Trustee explainer with the trained random forest as the expert model.
trustee = RegressionTrustee(expert=rf_model)
trustee.fit(X_train, y_train)

# explain() returns a 4-tuple; only the second element (used here as the
# pruned decision tree) and the fourth (reported as its score) are kept.
_, dt, _, score = trustee.explain()
print(f"Training score of pruned DT: {score}")
dt_y_pred = dt.predict(X_train)

# Plot the tree down to depth 7 on an explicit Axes.
# feature_names is passed as a plain list: scikit-learn 1.3.0 validates this
# parameter and rejects a pandas Index.
fig, ax = plt.subplots(figsize=(25, 20))
plot_tree(dt, feature_names=list(X_train.columns), filled=True, max_depth=7, ax=ax)