In [None]:
import pandas as pd
from prophet import Prophet
import time
import numpy as np
from sklearn.metrics import mean_absolute_error

# Read the repository dataset and parse `created_at` into datetimes
# (format='mixed' lets pandas infer the format of each value individually).
df = pd.read_csv("github_repos_filtered.csv").assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'], format='mixed')
)
full_size = df.shape[0]
print(f"Full dataset size: {full_size} rows")

# Function to measure Prophet model performance
def measure_performance(df: pd.DataFrame, col_name: str = "is_cloud_project", label: str = "Overall Cloud Projects", periods: int = 365):
    """Fit a Prophet model on one project category and report timing and fit error.

    Rows of ``df`` where ``df[col_name] == 1`` are counted per distinct
    ``created_at`` value; those counts are the series Prophet is fitted on.
    The timed section covers the aggregation, the model fit, building the
    future frame and the prediction.

    Args:
        df: Frame with a ``created_at`` column and a ``col_name`` flag column.
        col_name: Name of the flag column; only rows equal to 1 are kept.
        label: Human-readable name used in messages and in the result dict.
        periods: Number of future periods passed to
            ``Prophet.make_future_dataframe``.

    Returns:
        ``None`` when fewer than two distinct ``created_at`` values remain
        after filtering (Prophet cannot be fitted on that). Otherwise a dict
        with the keys ``label``, ``latency`` (seconds), ``throughput``
        (forecast rows per second), ``dataset_size`` (``len(df)``) and
        ``mae`` (mean absolute error of ``yhat`` against the observed counts
        on the historical dates).
    """
    # perf_counter is monotonic and high resolution, unlike the wall clock.
    start_time = time.perf_counter()

    # NOTE(review): counts are taken per exact `created_at` value; this
    # assumes the column is already at the intended granularity (e.g. one
    # value per day) - TODO confirm against the dataset.
    df_trend = (
        df[df[col_name] == 1]
        .groupby('created_at')
        .size()
        .reset_index(name='y')
        .rename(columns={'created_at': 'ds'})
    )
    if df_trend.empty:
        print(f"Skipping {label} - No data available.")
        return None
    if len(df_trend) < 2:
        # Prophet raises ValueError when given fewer than two rows, so skip
        # instead of aborting the whole benchmark on a tiny sample.
        print(f"Skipping {label} - Not enough data to fit a model.")
        return None

    # Tuned Prophet model
    model = Prophet(
        changepoint_prior_scale=0.1,
        yearly_seasonality=True,
        weekly_seasonality=True,
    )
    model.fit(df_trend)
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)
    latency = time.perf_counter() - start_time

    throughput = len(forecast) / latency
    dataset_size = len(df)

    # Pair each observation with its prediction explicitly by date rather
    # than relying on both frames happening to share the same row order.
    fitted = df_trend.merge(forecast[['ds', 'yhat']], on='ds', how='inner')
    mae = mean_absolute_error(fitted['y'], fitted['yhat'])

    return {
        "label": label,
        "latency": latency,
        "throughput": throughput,
        "dataset_size": dataset_size,
        "mae": mae
    }
# Define dataset sizes for performance testing.
# dict.fromkeys drops a repeated entry (full_size may equal one of the presets)
# while keeping the order; an empty dataset yields no sizes at all, which
# avoids dividing by zero in the oversampling branch below.
sizes = list(dict.fromkeys([100, 1000, 10000, 25000, 50000, full_size])) if full_size > 0 else []
results = []

# Test performance across different dataset sizes
for size in sizes:
    print(f"\nTesting with {size} repositories:")
    if size <= full_size:
        sampled_df = df.sample(n=size, random_state=42)
    else:
        # More rows requested than exist: tile the dataset and truncate.
        # concat keeps the datetime dtype of `created_at`, so no re-parsing
        # is needed; .copy() detaches the slice from the concatenated frame.
        repeat_factor = (size // full_size) + 1
        sampled_df = pd.concat([df] * repeat_factor, ignore_index=True).iloc[:size].copy()
    result = measure_performance(sampled_df, "is_cloud_project", f"Overall Cloud Projects ({size} repos)")
    if result is not None:
        results.append(result)

# Display and persist results. When every run was skipped there are no metric
# columns to round, so report that instead of failing with a KeyError.
if results:
    results_df = pd.DataFrame(results).round({'latency': 2, 'throughput': 2, 'mae': 2})
    print("\nPerformance Results:")
    print(results_df.to_string(index=False))
    results_df.to_csv("prophet_performance_metrics.csv", index=False)
    print("\nSaved performance metrics to 'prophet_performance_metrics.csv'")
else:
    results_df = pd.DataFrame(columns=['label', 'latency', 'throughput', 'dataset_size', 'mae'])
    print("\nNo performance results to report - every run was skipped.")

18:43:45 - cmdstanpy - INFO - Chain [1] start processing


Full dataset size: 80525 rows

Testing with 100 repositories:


18:43:46 - cmdstanpy - INFO - Chain [1] done processing
18:43:46 - cmdstanpy - INFO - Chain [1] start processing
18:43:46 - cmdstanpy - INFO - Chain [1] done processing



Testing with 1000 repositories:


18:43:46 - cmdstanpy - INFO - Chain [1] start processing
18:43:46 - cmdstanpy - INFO - Chain [1] done processing



Testing with 10000 repositories:


18:43:47 - cmdstanpy - INFO - Chain [1] start processing



Testing with 25000 repositories:


18:43:47 - cmdstanpy - INFO - Chain [1] done processing
18:43:47 - cmdstanpy - INFO - Chain [1] start processing
18:43:47 - cmdstanpy - INFO - Chain [1] done processing



Testing with 50000 repositories:


18:43:47 - cmdstanpy - INFO - Chain [1] start processing



Testing with 80525 repositories:


18:43:48 - cmdstanpy - INFO - Chain [1] done processing



Performance Results:
                               label  latency  throughput  dataset_size   mae
  Overall Cloud Projects (100 repos)     0.49      930.02           100  0.17
 Overall Cloud Projects (1000 repos)     0.28     3137.45          1000  0.78
Overall Cloud Projects (10000 repos)     0.37     2983.38         10000  2.76
Overall Cloud Projects (25000 repos)     0.40     2710.22         25000  5.19
Overall Cloud Projects (50000 repos)     0.40     2734.28         50000  8.15
Overall Cloud Projects (80525 repos)     0.44     2477.48         80525 11.41

Saved performance metrics to 'prophet_performance_metrics.csv'
