In [41]:
import joblib
import pandas as pd
from datetime import datetime

In [42]:
# 1. Load the saved model bundle and unpack its components.
# joblib.load unpickles the file, so only point this at a bundle you trust.
bundle = joblib.load('github_stars_predictor_bundle.pkl')
model, scaler, encoders, feature_names = (
    bundle[key] for key in ('model', 'scaler', 'encoders', 'feature_names')
)

In [43]:
# 2. Example repository record used to exercise the predictor.
# Field names follow what the preprocessing step and the bundle's
# feature list look up by key.
sample_repo = dict(
    # Identity
    name='ml-web-app',
    full_name='data-scientist/ml-web-app',
    # ISO-8601 timestamps (UTC)
    created_at='2023-05-10T08:00:00Z',
    updated_at='2023-11-15T14:25:00Z',
    pushed_at='2023-11-15T14:30:00Z',
    # Categoricals: each value must appear in the matching encoder's classes_
    language='Python',  # Must be in encoders['language'].classes_
    license='mit',      # Must be in encoders['license'].classes_
    # Numeric activity / size counters
    forks=87,
    watchers=420,
    open_issues=12,
    size=3500,
    # Boolean repository flags
    has_wiki=True,
    has_projects=False,
    has_downloads=True,
    is_fork=False,
    archived=False,
    # Additional counters
    subscribers_count=150,
    readme_size=1024,
    commits_count=85,
    contributors_count=12,
)

In [44]:
# 3. Preprocessing function for new data
def preprocess_new_repo(repo_data, bundle):
    """Transform one raw repository record into a scaled, model-ready array.

    Parameters
    ----------
    repo_data : dict
        Raw repository fields. Must contain 'created_at', 'updated_at',
        'pushed_at', 'language' and 'license', plus every column named in
        bundle['feature_names'] that is not derived inside this function.
    bundle : dict
        Must provide:
        - 'encoders': mapping with 'language' and 'license' entries, each
          exposing ``transform`` and ``classes_``;
        - 'scaler': object exposing ``transform``;
        - 'feature_names': ordered list of columns to feed the scaler.

    Returns
    -------
    Whatever ``bundle['scaler'].transform`` returns for the single-row
    feature frame, or ``None`` when an encoder rejects a categorical value
    with ``ValueError`` (the error and the allowed values are printed).

    Raises
    ------
    KeyError
        If a required key is missing from ``repo_data`` or a name in
        bundle['feature_names'] is not a column of the prepared frame.
    """
    # One-row frame so the same column operations used on a table apply here.
    repo_df = pd.DataFrame([repo_data])

    # Dates: parse as UTC, drop the timezone, and add a '<col>_days' column
    # holding whole days elapsed since 1970-01-01.
    epoch = pd.Timestamp("1970-01-01")
    one_day = pd.Timedelta('1d')
    for col in ['created_at', 'updated_at', 'pushed_at']:
        repo_df[col] = pd.to_datetime(repo_df[col], utc=True).dt.tz_localize(None)
        repo_df[f'{col}_days'] = (repo_df[col] - epoch) // one_day

    # Categoricals: replace the raw string with the encoder's integer code.
    for col in ['language', 'license']:
        encoder = bundle['encoders'][col]
        try:
            repo_df[col] = encoder.transform([repo_data[col]])[0]
        except ValueError as e:
            # Value rejected by the encoder: report the allowed values and
            # signal failure to the caller with None instead of raising.
            print(f"Error: {str(e)}")
            print(f"Allowed {col}s: {list(encoder.classes_)}")
            return None

    # Select and order columns per the bundle's feature list, then fill gaps.
    processed = repo_df[bundle['feature_names']].fillna(0)

    # Use the scaler from the bundle that was passed in, not a notebook-level
    # global: the function then works on a fresh kernel and always scales
    # with the scaler that belongs to the supplied encoders and feature list.
    return bundle['scaler'].transform(processed)


In [45]:
# 4. Run the sample through preprocessing, predict, and print a report
processed_data = preprocess_new_repo(sample_repo, bundle)

if processed_data is None:
    # Preprocessing signalled failure by returning None; nothing to predict on.
    print("Prediction failed due to preprocessing error")
else:
    # predict() is given a single row, so take the first (only) result.
    prediction = model.predict(processed_data)[0]

    # Report header and the identifying fields of the sample repository.
    print(
        "\n=== GitHub Stars Prediction ===",
        f"Repository: {sample_repo['full_name']}",
        f"Language: {sample_repo['language']}",
        f"License: {sample_repo['license']}",
        f"Created: {sample_repo['created_at']}",
        sep="\n",
    )
    # NOTE(review): the recorded run printed 420, identical to
    # sample_repo['watchers']. If `watchers` comes from the GitHub REST API it
    # may simply mirror the star count -- check the model for target leakage.
    print(f"\nPredicted Stars: {round(prediction)}")

    # Only runs when the model object has a predict_proba attribute; the
    # highest per-class probability is reported as the confidence.
    # NOTE(review): scikit-learn regressors generally do not provide it, so
    # this is likely skipped for a star-count regressor -- confirm.
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(processed_data)[0]
        print(f"Confidence: {max(proba)*100:.1f}%")

    # List the categories and features stored in the loaded bundle.
    print(
        "\nModel was trained with:",
        f"Languages: {list(encoders['language'].classes_)}",
        f"Licenses: {list(encoders['license'].classes_)}",
        f"Features ({len(feature_names)}): {feature_names}",
        sep="\n",
    )


=== GitHub Stars Prediction ===
Repository: data-scientist/ml-web-app
Language: Python
License: mit
Created: 2023-05-10T08:00:00Z

Predicted Stars: 420

Model was trained with:
Languages: ['Assembly', 'Astro', 'Batchfile', 'Blade', 'C', 'C#', 'C++', 'CSS', 'Clojure', 'Cuda', 'Dart', 'Dockerfile', 'Elixir', 'Go', 'HTML', 'Haskell', 'Java', 'JavaScript', 'Jinja', 'Julia', 'Jupyter Notebook', 'Kotlin', 'LLVM', 'Lua', 'MDX', 'Makefile', 'Markdown', 'Nunjucks', 'Objective-C', 'PHP', 'PowerShell', 'Python', 'Roff', 'Ruby', 'Rust', 'SCSS', 'Scala', 'Shell', 'Svelte', 'Swift', 'TeX', 'TypeScript', 'V', 'Vim Script', 'Vue', 'Zig', 'nan']
Licenses: ['agpl-3.0', 'apache-2.0', 'bsd-2-clause', 'bsd-3-clause', 'cc-by-4.0', 'cc-by-sa-4.0', 'cc0-1.0', 'gpl-2.0', 'gpl-3.0', 'isc', 'lgpl-3.0', 'mit', 'mpl-2.0', 'nan', 'ofl-1.1', 'other', 'unlicense', 'vim', 'wtfpl', 'zlib']
Features (18): ['forks', 'watchers', 'open_issues', 'size', 'has_wiki', 'has_projects', 'has_downloads', 'is_fork', 'archived', 'l