In [None]:
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("github_repos_filtered.csv")

# Target columns of the multi-label problem; every other column that survives
# the preprocessing below is used as a feature.
LABEL_COLUMNS = ['AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Terraform', 'DevOps']

# Upper bound on the TF-IDF vocabulary size.
MAX_TFIDF_FEATURES = 523

# Drop identifier/URL columns and the creation timestamp; none of them are used
# as features. ('created_at' is discarded here, so it is not parsed first.)
df = df.drop(columns=['id', 'name', 'full_name', 'html_url', 'created_at'])

# Encode 'language' as integer codes. Casting to str first means missing values
# are encoded as their own category instead of making the encoder fail.
language_encoder = LabelEncoder()
df['language'] = language_encoder.fit_transform(df['language'].astype(str))

# Vectorize 'description' with TF-IDF (missing descriptions become empty text).
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed later in this cell, so its vocabulary and IDF
# weights also see the held-out descriptions. Consider fitting on the training
# rows only.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
description_tfidf = vectorizer.fit_transform(df['description'].fillna(''))

# max_features is only a cap: a corpus with fewer distinct terms yields fewer
# columns, so the column names are derived from the actual matrix width rather
# than assumed to be exactly 523. Reusing df's index keeps the column-wise
# concat aligned row-for-row even if df does not carry a default RangeIndex.
tfidf_columns = [f'tfidf_{i}' for i in range(description_tfidf.shape[1])]
description_df = pd.DataFrame(
    description_tfidf.toarray(),
    columns=tfidf_columns,
    index=df.index,
)
df = pd.concat([df.drop(columns=['description']), description_df], axis=1)

# Define features (X) and target labels (y)
X = df.drop(columns=LABEL_COLUMNS)
y = df[LABEL_COLUMNS]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap a single XGBoost classifier configuration in MultiOutputClassifier so
# that all label columns of y are handled by one estimator object.
base_estimator = xgb.XGBClassifier(eval_metric='logloss')
xgb_model = MultiOutputClassifier(base_estimator)
xgb_model.fit(X_train, y_train)

# Persist the fitted model together with the preprocessing objects needed to
# transform new data the same way at inference time.
for artifact, filename in (
    (xgb_model, "xgboost_model.pkl"),
    (language_encoder, "language_encoder.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

# Evaluate model performance on the held-out split. Predictions are computed
# once here and reused by every metric below.
y_pred = xgb_model.predict(X_test)

print("Model training complete. Saved as 'xgboost_model.pkl'")

# Accuracy is computed from the predictions already in hand instead of calling
# xgb_model.score(), which would run a second full prediction pass over X_test.
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Micro-averaging pools the decisions of all label columns before computing F1.
print("F1 Score (Micro):", f1_score(y_test, y_pred, average='micro'))

# Classification report with zero division handling: zero_division=1 reports
# 1.0 (without a warning) for any metric whose denominator is zero.
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=y.columns, zero_division=1))

✅ Model training complete. Saved as 'xgboost_model.pkl'
Accuracy Score: 0.9944116733933561
F1 Score (Micro): 0.9971409969711552
Classification Report:
              precision    recall  f1-score   support

         AWS       1.00      1.00      1.00      6223
       Azure       1.00      0.99      1.00      3978
         GCP       1.00      1.00      1.00      2174
      Docker       1.00      1.00      1.00      2102
  Kubernetes       1.00      0.99      0.99       767
   Terraform       1.00      1.00      1.00      1433
      DevOps       1.00      0.99      1.00      1026

   micro avg       1.00      0.99      1.00     17703
   macro avg       1.00      0.99      1.00     17703
weighted avg       1.00      0.99      1.00     17703
 samples avg       1.00      1.00      1.00     17703



In [1]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Re-score the fitted model on the held-out split. This cell reads `xgb_model`,
# `X_test`, `y_test` and `y` from the training cell, so that cell must have been
# executed in the current kernel before this one.
y_pred = xgb_model.predict(X_test)

# Accuracy over the multi-label predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy Score: {accuracy}")

# Micro-averaged F1 over the multi-label predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
print(f"F1 Score (Micro): {f1}")

# Per-label precision / recall / F1 breakdown, one row per target column.
print("Classification Report:")
report_text = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=y.columns,
)
print(report_text)


NameError: name 'xgb_model' is not defined