In [5]:
import requests
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import base64
from datetime import datetime
import numpy as np  # Import NumPy

# Restore the trained classifier together with the fitted language encoder
# and TF-IDF vectorizer, all read from the current working directory.
# NOTE: joblib files are pickle-based, so only load artifacts you trust.
xgb_model, language_encoder, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in (
        "xgboost_model.pkl",
        "language_encoder.pkl",
        "tfidf_vectorizer.pkl",
    )
)

# GitHub repository details (owner and repo name)
owner = "e2eSolutionArchitect"
repo_name = "terraform"

# GitHub API URL to fetch repository details
repo_url = f"https://api.github.com/repos/{owner}/{repo_name}"

# Fetch repository details using GitHub API.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns API errors (rate limit, unknown repo, ...) into an
# explicit HTTPError instead of a confusing KeyError on the lookups below.
repo_response = requests.get(repo_url, timeout=10)
repo_response.raise_for_status()
repo_data = repo_response.json()

# Extract relevant details from the repo.
# 'description' and 'language' can be null in the API payload, so fall back
# to an empty string / 'Unknown' respectively.
repo_title = repo_data['name']
repo_description = repo_data['description'] or ''
repo_language = repo_data['language'] or 'Unknown'
repo_size = repo_data['size']  # Add size feature

# Fetch content from the repository (e.g., README).
# This lookup is deliberately best-effort: if the response carries no
# 'content' field (for example when the README is missing), fall back to ''.
readme_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/README.md"
readme_data = requests.get(readme_url, timeout=10).json()
if readme_data.get('content'):
    # The contents endpoint returns the file body base64-encoded.
    content = base64.b64decode(readme_data['content']).decode('utf-8')
else:
    content = ''

# Get the current date for the temporal features.
# NOTE(review): this is the prediction-time date; confirm the training data
# used the same notion of date for year/month/day/week.
current_date = datetime.now()
year = current_date.year
month = current_date.month
day = current_date.day
week = current_date.isocalendar()[1]  # Week of the year

# Process the description using the TF-IDF vectorizer
description_tfidf = vectorizer.transform([repo_description])

# Name the TF-IDF columns from the actual width of the transformed matrix
# rather than a hard-coded count, so the frame always matches the loaded
# vectorizer's vocabulary size.
tfidf_columns = [f'tfidf_{i}' for i in range(description_tfidf.shape[1])]
tfidf_features = pd.DataFrame(description_tfidf.toarray(), columns=tfidf_columns)

# Ensure the language encoding
if repo_language not in language_encoder.classes_:
    print(f"Warning: Unseen language '{repo_language}' encountered. Using default encoding.")
    repo_language_encoded = -1  # Placeholder for unseen language
else:
    repo_language_encoded = language_encoder.transform([repo_language])[0]

# Languages that get their own one-hot indicator column
languages = ['JavaScript', 'Python', 'TypeScript', 'Jupyter Notebook', 'Java', 'C#', 'Go', 'PHP', 'C++', 'Vue', 'Bicep', 'Kotlin', 'Dart', 'Rust', 'C', 'Ruby']

# Collect every non-TF-IDF feature in one mapping first. Building the frame
# in a single step avoids inserting columns one at a time into an already
# wide DataFrame, which pandas reports as fragmentation.
scalar_features = {
    'size': repo_size,
    'language': repo_language_encoded,
}
for lang in languages:
    scalar_features[lang] = 1 if lang == repo_language else 0
scalar_features['year'] = year
scalar_features['month'] = month
scalar_features['day'] = day
scalar_features['week'] = week
scalar_features['is_cloud_project'] = 1 if 'cloud' in repo_description.lower() else 0

# Ensure the structure is the same as the training data:
# column order is size, language, one-hot languages, temporal features,
# is_cloud_project, then the TF-IDF columns.
expected_columns = ['size', 'language'] + languages + ['year', 'month', 'day', 'week', 'is_cloud_project'] + tfidf_columns
features = pd.concat([pd.DataFrame([scalar_features]), tfidf_features], axis=1)
features = features[expected_columns]

# Tag labels; the i-th model output column is reported under the i-th tag.
tags = ['AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Terraform', 'DevOps']

# Get predictions from the model.
# Only the predict() call sits inside the try, so a failure reported here is
# really a model failure and not a bug in the post-processing below.
try:
    predictions = xgb_model.predict(features)
except Exception as e:  # top-level notebook boundary: report, don't crash the cell
    print(f"Error during prediction: {e}")
else:
    predictions = np.asarray(predictions)

    # If the predictions are probabilities, apply a threshold of 0.5 to classify
    # (already-binary 0/1 outputs pass through this unchanged).
    if predictions.ndim == 2:
        predictions = (predictions > 0.5).astype(int)

    if predictions.ndim != 2 or predictions.shape[1] < len(tags):
        # A 1-D or too-narrow output cannot be mapped onto the tag list; say so
        # explicitly instead of failing with an opaque indexing error.
        print(
            f"Error during prediction: expected at least {len(tags)} outputs per sample, "
            f"got output of shape {predictions.shape}"
        )
    else:
        # Map predictions to appropriate tags (first row = the single repo
        # being scored; zip stops at the end of the tag list).
        predicted_tags = dict(zip(tags, predictions[0]))

        # Output the predicted tags for the repository
        print(f"Predicted tags for repository '{repo_title}':")
        for tag, prediction in predicted_tags.items():
            if prediction == 1:
                print(f"- {tag}")


Predicted tags for repository 'terraform':
- AWS
- Azure
- GCP
- Terraform
