In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn.metrics import accuracy_score

# Load dataset and models
df = pd.read_csv("github_repos_filtered.csv")
xgb_model = joblib.load("xgboost_model.pkl")
language_encoder = joblib.load("language_encoder.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")
full_size = len(df)
print(f"Full dataset size: {full_size} rows")

# Define inference function with performance metrics
def measure_performance(df_subset, label):
    start_time = time.time()
    
    # Preprocess features
    descriptions = df_subset['description'].fillna('')
    description_tfidf = vectorizer.transform(descriptions)
    features = pd.DataFrame(description_tfidf.toarray(), columns=[f'tfidf_{i}' for i in range(523)])
    
    languages = ['JavaScript', 'Python', 'TypeScript', 'Jupyter Notebook', 'Java', 'C#', 'Go', 'PHP', 'C++', 'Vue', 'Bicep', 'Kotlin', 'Dart', 'Rust', 'C', 'Ruby']
    for lang in languages:
        features[lang] = (df_subset['language'] == lang).astype(int)
    
    features['year'] = 2025
    features['month'] = 3
    features['day'] = 22
    features['week'] = 12
    features['size'] = df_subset['size']
    features['is_cloud_project'] = df_subset['description'].str.lower().str.contains('cloud').fillna(False).astype(int)
    features['language'] = df_subset['language'].apply(lambda x: language_encoder.transform([x])[0] if x in language_encoder.classes_ else -1)
    
    expected_columns = ['size', 'language'] + languages + ['year', 'month', 'day', 'week', 'is_cloud_project'] + [f'tfidf_{i}' for i in range(523)]
    features = features[expected_columns]
    
    # Predict
    predictions = xgb_model.predict(features)
    if predictions.ndim == 2:
        predictions = (predictions > 0.5).astype(int)
    
    end_time = time.time()
    
    # Calculate metrics
    latency = end_time - start_time
    throughput = len(df_subset) / latency
    dataset_size = len(df_subset)
    
    # Hamming Score (fraction of correct labels)
    true_labels = df_subset[['AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Terraform', 'DevOps']].values
    hamming_score = accuracy_score(true_labels, predictions, normalize=True)
    
    return {
        "label": label,
        "latency": latency,
        "throughput": throughput,
        "dataset_size": dataset_size,
        "hamming_score": hamming_score
    }

# Test different sizes
sizes = [100, 1000, 10000, 25000, 50000, full_size]
results = []

for size in sizes:
    print(f"\nTesting with {size} repositories:")
    if size <= full_size:
        sampled_df = df.sample(n=size, random_state=42)
    else:
        repeat_factor = (size // full_size) + 1
        sampled_df = pd.concat([df] * repeat_factor, ignore_index=True).iloc[:size]
    
    result = measure_performance(sampled_df, f"XGBoost Inference ({size} repos)")
    if result:
        results.append(result)

# Output results
results_df = pd.DataFrame(results)
results_df['latency'] = results_df['latency'].round(2)
results_df['throughput'] = results_df['throughput'].round(2)
results_df['hamming_score'] = results_df['hamming_score'].round(2)
print("\nPerformance Results:")
print(results_df.to_string(index=False))
results_df.to_csv("xgboost_performance_metrics.csv", index=False)
print("\nSaved performance metrics to 'xgboost_performance_metrics.csv'")

Full dataset size: 80525 rows

Testing with 100 repositories:

Testing with 1000 repositories:

Testing with 10000 repositories:

Testing with 25000 repositories:

Testing with 50000 repositories:

Testing with 80525 repositories:

Performance Results:
                          label  latency  throughput  dataset_size  hamming_score
  XGBoost Inference (100 repos)     0.79      126.07           100           1.00
 XGBoost Inference (1000 repos)     1.26      796.42          1000           0.99
XGBoost Inference (10000 repos)     4.58     2182.88         10000           0.99
XGBoost Inference (25000 repos)     9.88     2531.51         25000           0.99
XGBoost Inference (50000 repos)    19.65     2544.24         50000           0.99
XGBoost Inference (80525 repos)    29.46     2733.61         80525           0.99

Saved performance metrics to 'xgboost_performance_metrics.csv'
