In [1]:
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

In [7]:
# Load the saved model bundle
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code — only load "best_model.pkl" when it comes from a trusted source.
# The path is relative, so it is resolved against the kernel's working directory.
pipeline = joblib.load("best_model.pkl")
# Unused: individual steps can be pulled out of the bundle by name, e.g.
# preprocessor = pipeline['preprocessor']

In [8]:
# Display the loaded bundle's repr to confirm what was deserialized.
pipeline

In [9]:
def transform(df, reference_date=None):
    """Build the model's feature frame from raw repository metadata.

    The input frame is never modified; all work happens on a copy, so the
    cell that calls this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per repository. Must contain the raw columns ``created_at``,
        ``updated_at``, ``pushed_at`` plus every non-derived entry of the
        feature list below. Extra columns (e.g. ``name``) are ignored.
    reference_date : datetime-like, optional
        The "today" used to compute ages and days-since values. Defaults to
        2025-05-01, the fixed snapshot date this function has always used.
        Timezone information, if present, is stripped.

    Returns
    -------
    pandas.DataFrame
        The 21 feature columns, in the fixed order of ``features``. Rows
        with a missing or infinite value in any *feature* column are dropped
        (the original index labels of surviving rows are kept).

    Raises
    ------
    KeyError
        If a required input column is absent from ``df``.
    """
    timestamp_cols = ("created_at", "updated_at", "pushed_at")
    derived = {
        "project_age", "days_since_update", "days_since_push",
        "forks_per_day", "issues_per_day", "update_rate",
    }

    # Selected features (order matters: it is the column order handed to the model)
    features = [
        'forks', 'watchers', 'open_issues',
        'size', 'has_wiki', 'has_projects', 'has_downloads', 'is_fork',
        'archived', 'language', 'license', 'subscribers_count',
        'contributors_count', 'commits_count', 'readme_size',
        'project_age', 'days_since_update', 'days_since_push',
        'forks_per_day', 'issues_per_day', 'update_rate'
    ]

    # Fail early with a readable message instead of a bare KeyError mid-way.
    required = list(timestamp_cols) + [f for f in features if f not in derived]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"transform() is missing required columns: {missing}")

    # Resolve the snapshot date; drop any tz so it can be subtracted from the
    # tz-naive timestamp columns produced below.
    ref = pd.Timestamp(datetime(2025, 5, 1) if reference_date is None else reference_date)
    if ref.tzinfo is not None:
        ref = ref.tz_localize(None)

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()

    # Parse and normalize time-related features
    for col in timestamp_cols:
        out[col] = pd.to_datetime(out[col]).dt.tz_localize(None)
    out["project_age"] = (ref - out["created_at"]).dt.days
    out["days_since_update"] = (ref - out["updated_at"]).dt.days
    out["days_since_push"] = (ref - out["pushed_at"]).dt.days

    # Handle missing values in the categorical columns
    out["license"] = out["license"].fillna("None")
    out["language"] = out["language"].fillna("Unknown")

    # Derived rate-based features (+1 keeps the denominator non-zero for
    # same-day values; a value of -1 day still yields inf, handled below)
    out["forks_per_day"] = out["forks"] / (out["project_age"] + 1)
    out["issues_per_day"] = out["open_issues"] / (out["project_age"] + 1)
    out["update_rate"] = 1 / (1 + out["days_since_update"])

    # Replace inf with NaN, then drop rows that are unusable for the model.
    # Only feature columns are considered: a missing value in an unrelated
    # column (name, description, ...) must not silently discard a repository.
    out = out.replace([np.inf, -np.inf], np.nan).dropna(subset=features)

    return out[features]

In [None]:
# Hand-written example repository used to exercise the prediction path end to end.
sample_repo = {
    # --- identity ---
    "name": "ml-web-app",
    "full_name": "data-scientist/ml-web-app",
    # --- timestamps (ISO-8601, UTC) ---
    "created_at": "2023-05-10T08:00:00Z",
    "updated_at": "2023-11-15T14:25:00Z",
    "pushed_at": "2023-11-15T14:30:00Z",
    # --- categoricals: presumably must be values the model saw in training — verify ---
    "language": "Python",
    "license": "mit",
    # --- activity counters ---
    "forks": 87,
    # Original note: "same as stars, unknown".
    # NOTE(review): if watchers mirrors the star count, this feature leaks the target.
    "watchers": 420,
    "open_issues": 12,
    "size": 3500,
    # --- repository flags ---
    "has_wiki": True,
    "has_projects": False,
    "has_downloads": True,
    "is_fork": False,
    "archived": False,
    # --- community / content metrics ---
    "subscribers_count": 150,
    "readme_size": 1024,
    "commits_count": 85,
    "contributors_count": 12,
}
# Wrap the single record in a list so it becomes a one-row frame.
sample_repo_df = pd.DataFrame(data=[sample_repo])


In [11]:
# Make and show prediction
X_sample = transform(sample_repo_df)

# transform() never returns None: unusable rows are dropped instead, so an
# empty frame is the failure case to guard against (indexing [0] on an empty
# prediction would otherwise raise).
if not X_sample.empty:
    y_pred = pipeline.predict(X_sample)[0]
    model = pipeline['regressor']
    print("\n=== GitHub Stars Prediction ===")
    print(f"Repository: {sample_repo['full_name']}")
    print(f"Language: {sample_repo['language']}")
    print(f"License: {sample_repo['license']}")
    print(f"Created: {sample_repo['created_at']}")
    print(f"\nPredicted Stars: {round(y_pred)}")

    # Class probabilities exist only when the final estimator is a classifier;
    # a plain regressor skips this branch. The call goes through the whole
    # pipeline so the raw features are preprocessed first — calling the final
    # step directly on X_sample would hand it un-encoded columns.
    # NOTE(review): assumes `pipeline` is an sklearn Pipeline — confirm.
    if hasattr(model, 'predict_proba'):
        proba = pipeline.predict_proba(X_sample)[0]
        print(f"Confidence: {max(proba)*100:.1f}%")

    # Show the feature columns that were fed to the model
    print("\nModel was trained with:")
    print(f"Features ({len(X_sample.columns)}): {X_sample.columns}")
else:
    print("Prediction failed due to preprocessing error")


=== GitHub Stars Prediction ===
Repository: data-scientist/ml-web-app
Language: Python
License: mit
Created: 2023-05-10T08:00:00Z

Predicted Stars: 3096812

Model was trained with:
Features (21): Index(['forks', 'watchers', 'open_issues', 'size', 'has_wiki', 'has_projects',
       'has_downloads', 'is_fork', 'archived', 'language', 'license',
       'subscribers_count', 'contributors_count', 'commits_count',
       'readme_size', 'project_age', 'days_since_update', 'days_since_push',
       'forks_per_day', 'issues_per_day', 'update_rate'],
      dtype='object')
