# ML Scoring Model Training

This notebook trains an XGBoost model using SERP ranking data to predict content quality scores.

## Setup
1. Upload `training_data.csv` to the Colab session's working directory (the notebook reads it by relative path)
2. Run cells sequentially

In [None]:
# Install dependencies
!pip install -q xgboost scikit-learn pandas numpy shap matplotlib seaborn

In [None]:
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

print("Dependencies loaded successfully")

In [None]:
# Load training data
# The CSV is read by relative path, so it must sit in the working directory.
df = pd.read_csv('training_data.csv')

# One print call, newline-separated, reporting row count and column names
print(
    f"Loaded {len(df)} records",
    f"\nColumns: {df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)

# Bare last expression so the notebook renders the preview as a rich table
df.head()

In [None]:
# Data exploration
# Text summary first: target statistics, then per-column missing-value counts
print(f"Target score distribution:", df['target_score'].describe(), sep="\n")
print(f"\nMissing values:", df.isnull().sum(), sep="\n")

# Visualize target distribution
# Two side-by-side panels on one figure, addressed through explicit axes
fig, (ax_hist, ax_rank) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: histogram of the target
ax_hist.hist(df['target_score'], bins=10, edgecolor='black')
ax_hist.set_xlabel('Target Score')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Target Score Distribution')

# Right panel: target plotted against SERP rank
ax_rank.scatter(df['serp_rank'], df['target_score'])
ax_rank.set_xlabel('SERP Rank')
ax_rank.set_ylabel('Target Score')
ax_rank.set_title('SERP Rank vs Target Score')

fig.tight_layout()
plt.show()

In [None]:
# Prepare features and target
# Columns that must never be used as model inputs:
#   - url, keyword, title, serp_rank: non-feature columns
#   - target_score: the value being predicted
#   - predicted_score, prediction_error: derived from the target/model output.
#     If they are present in `df` (e.g. after a partial re-run of the notebook
#     or when a previously exported predictions file is loaded) they would
#     leak the target into the features, so they are excluded defensively.
non_feature_cols = {
    'url', 'keyword', 'serp_rank', 'target_score', 'title',
    'predicted_score', 'prediction_error',
}
feature_cols = [col for col in df.columns if col not in non_feature_cols]
print(f"Feature columns ({len(feature_cols)}): {feature_cols}")

# Missing feature values are replaced with 0 before training
X = df[feature_cols].fillna(0)
y = df['target_score']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature statistics:")
# Bare last expression: rendered as a rich table by the notebook
X.describe()

In [None]:
# Split data
# 80/20 train/test partition; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of both partitions
print(f"Training set: {X_train.shape}", f"Test set: {X_test.shape}", sep="\n")

In [None]:
# Train XGBoost model
# Hyperparameters are collected in one mapping so they can be read at a glance
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=1,
)
model = xgb.XGBRegressor(**xgb_params)

# The held-out split is supplied as the evaluation set; verbose=False keeps
# the per-round evaluation log out of the cell output.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False,
)
print("Model trained successfully")

In [None]:
# Evaluate model
# Predictions are stored in notebook-level names because later cells reuse them
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Same three regression metrics (RMSE, MAE, R²) reported for each split
for heading, actual, predicted in (
    ("Training Metrics:", y_train, y_pred_train),
    ("\nTest Metrics:", y_test, y_pred_test),
):
    print(heading)
    print(f"  RMSE: {np.sqrt(mean_squared_error(actual, predicted)):.4f}")
    print(f"  MAE: {mean_absolute_error(actual, predicted):.4f}")
    print(f"  R²: {r2_score(actual, predicted):.4f}")

# Cross-validation
# 5-fold R² computed over the full feature matrix, not only the training split
cv_scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring='r2',
)
print(f"\nCross-validation R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

In [None]:
# Feature importance
# Pair every feature name with the importance reported by the fitted model,
# sorted so the most important feature comes first.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Important Features:")
print(feature_importance.head(15))

# Plot
top_features = feature_importance.head(15)
plt.figure(figsize=(10, 6))
plt.barh(top_features['feature'], top_features['importance'])
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top, matching the order of the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('Top 15 Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Prediction visualization
# One figure, two panels, drawn through explicit axes objects
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: predicted vs. actual for both splits, with the y = x reference
# line spanning the full range of the target
ax_fit.scatter(y_train, y_pred_train, alpha=0.5, label='Train')
ax_fit.scatter(y_test, y_pred_test, alpha=0.5, label='Test')
target_range = [y.min(), y.max()]
ax_fit.plot(target_range, target_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Score')
ax_fit.set_ylabel('Predicted Score')
ax_fit.set_title('Actual vs Predicted')
ax_fit.legend()

# Right panel: test-set residuals (actual minus predicted) against predictions
residuals = y_test - y_pred_test
ax_resid.scatter(y_pred_test, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Score')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')

fig.tight_layout()
plt.show()

In [None]:
# Export model metadata for the JavaScript scoring model
# NOTE: this cell writes evaluation metrics, per-feature importances and
# sample counts only. It does not extract the tree structure or derive
# linear coefficients from the model.

model_config = {
    'version': '1.0.0-xgboost-trained',
    'createdAt': pd.Timestamp.now().isoformat(),
    'description': 'Trained XGBoost model using SERP ranking data',
    'metrics': {
        'train_rmse': float(np.sqrt(mean_squared_error(y_train, y_pred_train))),
        'test_rmse': float(np.sqrt(mean_squared_error(y_test, y_pred_test))),
        'test_r2': float(r2_score(y_test, y_pred_test)),
        'cv_r2_mean': float(cv_scores.mean()),
        'cv_r2_std': float(cv_scores.std())
    },
    # Cast explicitly to built-in types so JSON serialisation never depends on
    # how pandas boxes the model's NumPy float values.
    'feature_importance': [
        {'feature': str(name), 'importance': float(score)}
        for name, score in zip(
            feature_importance['feature'], feature_importance['importance']
        )
    ],
    # The model was fit on the training split only, so report that row count
    # here; the test and total counts are recorded alongside it.
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'total_samples': len(df),
    'model_type': 'XGBRegressor'
}

# Save model config
with open('trained_model_config.json', 'w', encoding='utf-8') as f:
    json.dump(model_config, f, ensure_ascii=False, indent=2)

print("Model config saved to trained_model_config.json")
print(json.dumps(model_config, ensure_ascii=False, indent=2))

In [None]:
# Save model for later use
# XGBoost's own JSON format is written first.
model.save_model('trained_model.json')
print("Model saved to trained_model.json")

# Also save as pickle for Python use
# (`pickle` is imported with the other dependencies at the top of the notebook.)
# NOTE(review): a pickle is tied to the Python/xgboost versions that wrote it
# and must only be loaded from trusted sources; prefer the JSON file for
# anything long-lived.
with open('trained_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("Model saved to trained_model.pkl")

In [None]:
# Generate predictions for all data
# The predictions go into a separate frame instead of new columns on `df`.
# Mutating `df` would make this cell non-idempotent and would leave
# target-derived columns in the frame that feature preparation reads from.
# Note that X covers every row, so rows used for training get in-sample
# predictions.
predictions_df = df.assign(predicted_score=model.predict(X))
predictions_df = predictions_df.assign(
    prediction_error=predictions_df['target_score'] - predictions_df['predicted_score']
)

print("Predictions added to dataframe")
print(predictions_df[['keyword', 'serp_rank', 'target_score', 'predicted_score', 'prediction_error']].head(10))

# Save predictions
predictions_df.to_csv('predictions.csv', index=False)
print("\nPredictions saved to predictions.csv")

In [None]:
# Summary
# Recap of dataset size, feature count and headline metrics, followed by the
# manual follow-up steps.
print("="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)
# The model was fit on the training split only, so report that count and show
# the full dataset size next to it.
print(f"Training samples: {len(X_train)} (of {len(df)} total records)")
print(f"Features: {len(feature_cols)}")
print("\nBest metrics:")
print(f"  Test R²: {r2_score(y_test, y_pred_test):.4f}")
print(f"  Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.4f}")
print(f"  CV R² (mean): {cv_scores.mean():.4f}")
print("\nNext steps:")
print("  1. Download trained_model_config.json")
print("  2. Update scoring-model.js with new coefficients")
print("  3. Deploy to production")
print("  4. Monitor SERP performance")
print("="*60)