# LeetCode Rating Predictor

Predict missing difficulty ratings for LeetCode problems using Gradient Boosting Regression.

**Approach:**
1. Baseline: TF-IDF + Tags + Metadata → GradientBoostingRegressor
2. Enhanced: Add SentenceTransformer embeddings
3. Compare performance and generate predictions

**Datasets:**
- `zerotrac.json`: 2,405 problems with ratings (PRIMARY)
- `lcid.json`: 3,807 problems with metadata (tags, acRate)
- `merged_problems.json`: 2,913 problems with descriptions

## 1. Imports & Setup

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sentence_transformers import SentenceTransformer
import warnings

# Session-wide configuration for the notebook.
# Suppress all warnings for the rest of the session (no category filter is given).
warnings.filterwarnings('ignore')
# Apply seaborn's 'whitegrid' style to the figures drawn below.
sns.set_style('whitegrid')
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

# Set random seed for reproducibility
# RANDOM_SEED is also passed as random_state to train_test_split and to both
# GradientBoostingRegressor instances further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Imports complete")

## 2. Data Loading

In [None]:
# Load the three raw JSON sources into DataFrames.
def _read_json(path):
    """Parse and return the JSON document stored at *path*.

    The file is decoded explicitly as UTF-8 instead of with the platform
    default encoding, so non-ASCII text loads the same way on every OS.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load zerotrac (has ratings)
zerotrac_data = _read_json('source/zerotrac.json')
df_zerotrac = pd.DataFrame(zerotrac_data)
print(f"Zerotrac: {len(df_zerotrac)} problems with ratings")

# Load lcid (has metadata)
lcid_data = _read_json('source/lcid.json')
# lcid.json is a dict of dicts: each top-level key becomes the 'problem_id'
# column and the fields of its value become the remaining columns.
lcid_rows = [{'problem_id': k, **v} for k, v in lcid_data.items()]
df_lcid = pd.DataFrame(lcid_rows)
print(f"LCID: {len(df_lcid)} problems with metadata")

# Load merged_problems (has descriptions); the records live under 'questions'
merged_data = _read_json('source/merged_problems.json')
df_problems = pd.DataFrame(merged_data['questions'])
print(f"Merged Problems: {len(df_problems)} problems with descriptions")

print("\nData loaded successfully")

## 3. Data Merging & Exploration

In [None]:
# Build a common join key: each source names its slug column differently, so
# copy it into 'slug' after lower-casing and trimming surrounding whitespace.
df_zerotrac['slug'] = df_zerotrac['TitleSlug'].str.lower().str.strip()
df_lcid['slug'] = df_lcid['titleSlug'].str.lower().str.strip()
df_problems['slug'] = df_problems['problem_slug'].str.lower().str.strip()

# Merge step 1: zerotrac + lcid (outer join keeps problems found in only one source)
df_merged = df_zerotrac.merge(df_lcid, on='slug', how='outer', suffixes=('_zt', '_lcid'))
print(f"After zerotrac + lcid merge: {len(df_merged)} problems")

# Merge step 2: + merged_problems
df_merged = df_merged.merge(df_problems, on='slug', how='outer')
print(f"After adding descriptions: {len(df_merged)} problems")

# Check for duplicates.
# An explicit raise is used instead of `assert` so the check still runs under
# `python -O` and the error names some of the offending slugs.
duplicate_mask = df_merged['slug'].duplicated(keep=False)
if duplicate_mask.any():
    duplicate_examples = df_merged.loc[duplicate_mask, 'slug'].unique()[:10].tolist()
    raise ValueError(f"Found duplicate slugs! Examples: {duplicate_examples}")
print("No duplicate slugs")

# Analyze rating coverage.
# These boolean masks are aligned with df_merged's index and are reused by
# later cells to select the training pool and the prediction targets.
has_rating = df_merged['Rating'].notna()
has_description = df_merged['description'].notna()
usable_for_training = has_rating & has_description

print(f"\nData Coverage:")
print(f"  - Problems with ratings: {has_rating.sum()}")
print(f"  - Problems with descriptions: {has_description.sum()}")
print(f"  - Usable for training (both): {usable_for_training.sum()}")
print(f"  - Missing ratings (targets): {(~has_rating & has_description).sum()}")

In [None]:
# EDA: distribution of the known ratings, overall and per difficulty label
rated_rows = df_merged[has_rating]
known_ratings = rated_rows['Rating']

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
hist_ax, box_ax = axes

# Left panel: histogram with the mean marked by a dashed line
hist_ax.hist(known_ratings, bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Rating')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Rating Distribution')
hist_ax.axvline(known_ratings.mean(), color='red', linestyle='--', label='Mean')
hist_ax.legend()

# Right panel: one box per difficulty label; rows without a label are
# grouped under 'Unknown'
df_with_rating = rated_rows.copy()
df_with_rating['difficulty_label'] = df_with_rating['difficulty'].fillna('Unknown')
df_with_rating.boxplot(column='Rating', by='difficulty_label', ax=box_ax)
box_ax.set_xlabel('Difficulty')
box_ax.set_ylabel('Rating')
box_ax.set_title('Rating by Difficulty')
# Blank the figure-level title (presumably the automatic group-by title that
# pandas adds to grouped box plots -- confirm)
plt.suptitle('')

plt.tight_layout()
plt.show()

# Summary statistics of the known ratings
print(f"\nRating Stats:")
print(known_ratings.describe())

## 4. Feature Engineering

In [None]:
# Keep only the rows that have both a rating and a description
df_train_pool = df_merged.loc[usable_for_training].copy()
print(f"Training pool: {len(df_train_pool)} problems\n")

# 4.1 Text features: TF-IDF over the problem descriptions
# NOTE(review): the vectorizer is fitted on the whole pool, i.e. before the
# train/validation split performed later, so validation text contributes to
# the vocabulary and IDF weights.
print("Generating TF-IDF features...")
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_features=500,
)
pool_text = df_train_pool['description'].fillna('')
tfidf_matrix = tfidf.fit_transform(pool_text)
print(f"   TF-IDF shape: {tfidf_matrix.shape}")

# 4.2 Categorical features: multi-hot encoding of the topic tags
print("Encoding topic tags...")
# Extract tags from topicTags (list of dicts with 'name' key)
def extract_tags(tags):
    """Return the tag names held in one ``topicTags`` cell.

    Args:
        tags: Expected to be a list whose items are either dicts carrying a
            ``'name'`` key or plain tag values. Any non-list value (``None``,
            a float NaN for a missing cell, a string, ...) means "no tags".

    Returns:
        list: One entry per usable tag, in input order. Dict items without a
        ``'name'`` key are skipped, because returning the dict itself would
        hand an unhashable value to ``MultiLabelBinarizer``.
    """
    # Check the type first. Calling pd.isna() on a list returns an element-wise
    # array, and using that array in a boolean test raises ValueError as soon
    # as the list has more than one element.
    if not isinstance(tags, list):
        return []

    names = []
    for tag in tags:
        if isinstance(tag, dict):
            if 'name' in tag:
                names.append(tag['name'])
        else:
            names.append(tag)
    return names

# Turn each problem's tag list into a row of 0/1 indicator columns
df_train_pool['tag_list'] = df_train_pool['topicTags'].apply(extract_tags)
mlb = MultiLabelBinarizer()
tags_matrix = mlb.fit_transform(df_train_pool['tag_list'])
print(f"   Tags shape: {tags_matrix.shape} ({len(mlb.classes_)} unique tags)")

# 4.3 Numerical features: only 'acRate' is considered, and only when the
# column exists (totalAccepted is not part of lcid.json, so it is not used)
print("Processing numerical features...")
numerical_features = [col for col in ('acRate',) if col in df_train_pool.columns]

if not numerical_features:
    # Zero-column placeholder so the later horizontal stacking still works
    numerical_matrix = np.zeros((len(df_train_pool), 0))
    scaler = None
    print(f"   No numerical features available")
else:
    scaler = StandardScaler()
    # Missing values are replaced by 0 before standardising
    numeric_values = df_train_pool[numerical_features].fillna(0)
    numerical_matrix = scaler.fit_transform(numeric_values)
    print(f"   Numerical shape: {numerical_matrix.shape}")

print("\nFeature engineering complete")

In [None]:
# Assemble the baseline design matrix and the regression target
from scipy.sparse import hstack, csr_matrix

# Column order: TF-IDF terms, then tag indicators, then numerical features.
# The dense blocks are wrapped as sparse matrices before stacking.
baseline_blocks = [
    tfidf_matrix,
    csr_matrix(tags_matrix),
    csr_matrix(numerical_matrix),
]
X_baseline = hstack(baseline_blocks)

# Target: the known rating of every row in the training pool
y = df_train_pool['Rating'].values

print(f"Baseline feature matrix: {X_baseline.shape}")
print(f"Target vector: {y.shape}")

## 5. Baseline Model (TF-IDF + Tags + Numerical)

In [None]:
# Hold out 20% of the training pool for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_baseline,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

print(f"Training set: {X_train.shape[0]} problems")
print(f"Validation set: {X_val.shape[0]} problems")

# Fit the baseline gradient-boosting regressor
print("\nTraining baseline model...")
baseline_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': RANDOM_SEED,
    'verbose': 0,
}
model_baseline = GradientBoostingRegressor(**baseline_params)
model_baseline.fit(X_train, y_train)

# Score the model on the held-out rows
y_pred_baseline = model_baseline.predict(X_val)
mae_baseline = mean_absolute_error(y_val, y_pred_baseline)
r2_baseline = r2_score(y_val, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_val, y_pred_baseline))

print(f"\nBaseline Model Performance:")
print(f"   RMSE: {rmse_baseline:.2f}")
print(f"   MAE:  {mae_baseline:.2f}")
print(f"   R²:   {r2_baseline:.3f}")

# Report each success criterion as met or missed
print("   RMSE < 100 (target met)" if rmse_baseline < 100 else f"   RMSE > 100 (target: <100)")
print("   R² > 0.75 (target met)" if r2_baseline > 0.75 else f"   R² < 0.75 (target: >0.75)")

## 6. Enhanced Model (+ Embeddings)

In [None]:
# Encode every training description with a pretrained SentenceTransformer
print("Loading SentenceTransformer model (this may take a moment)...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("   Model loaded. Generating embeddings...")

descriptions = df_train_pool['description'].fillna('').tolist()
embeddings = embedding_model.encode(
    descriptions,
    batch_size=32,
    show_progress_bar=True,
)
print(f"   Embeddings shape: {embeddings.shape}")

# Enhanced matrix = baseline columns followed by the embedding columns
X_enhanced = hstack([X_baseline, csr_matrix(embeddings)])

print(f"\nEnhanced feature matrix: {X_enhanced.shape}")

In [None]:
# Split the enhanced matrix with the same test size and seed as the baseline
X_train_enh, X_val_enh, y_train_enh, y_val_enh = train_test_split(
    X_enhanced,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Fit a second regressor with the same hyperparameters as the baseline
print("Training enhanced model...")
enhanced_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': RANDOM_SEED,
    'verbose': 0,
}
model_enhanced = GradientBoostingRegressor(**enhanced_params)
model_enhanced.fit(X_train_enh, y_train_enh)

# Score the model on the held-out rows
y_pred_enhanced = model_enhanced.predict(X_val_enh)
mae_enhanced = mean_absolute_error(y_val_enh, y_pred_enhanced)
r2_enhanced = r2_score(y_val_enh, y_pred_enhanced)
rmse_enhanced = np.sqrt(mean_squared_error(y_val_enh, y_pred_enhanced))

print(f"\nEnhanced Model Performance:")
print(f"   RMSE: {rmse_enhanced:.2f}")
print(f"   MAE:  {mae_enhanced:.2f}")
print(f"   R²:   {r2_enhanced:.3f}")

# Side-by-side comparison; the signed value is enhanced minus baseline
print(f"\nComparison (Baseline → Enhanced):")
print(f"   RMSE: {rmse_baseline:.2f} → {rmse_enhanced:.2f} ({rmse_enhanced - rmse_baseline:+.2f})")
print(f"   MAE:  {mae_baseline:.2f} → {mae_enhanced:.2f} ({mae_enhanced - mae_baseline:+.2f})")
print(f"   R²:   {r2_baseline:.3f} → {r2_enhanced:.3f} ({r2_enhanced - r2_baseline:+.3f})")

# Success criteria: a line is printed only for the targets that were met
target_checks = (
    (rmse_enhanced < 100, "   RMSE < 100 (target met)"),
    (mae_enhanced < 75, "   MAE < 75 (target met)"),
    (r2_enhanced > 0.75, "   R² > 0.75 (target met)"),
)
for target_met, message in target_checks:
    if target_met:
        print(message)

## 7. Prediction & Output

In [None]:
# Select the rows that have a description but no rating
missing_mask = ~has_rating & has_description
df_missing = df_merged[missing_mask].copy()
print(f"Problems missing ratings: {len(df_missing)}")

if len(df_missing) == 0:
    print("No problems missing ratings")
else:
    # Rebuild the feature blocks with the transformers fitted on the
    # training pool, in the same column order as X_enhanced
    print("\nGenerating predictions for missing ratings...")
    missing_text = df_missing['description'].fillna('')

    # TF-IDF
    tfidf_missing = tfidf.transform(missing_text)

    # Tags
    df_missing['tag_list'] = df_missing['topicTags'].apply(extract_tags)
    tags_missing = mlb.transform(df_missing['tag_list'])

    # Numerical (zero-column placeholder when no scaler was fitted)
    if scaler is not None and numerical_features:
        numerical_missing = scaler.transform(df_missing[numerical_features].fillna(0))
    else:
        numerical_missing = np.zeros((len(df_missing), 0))

    # Embeddings
    embeddings_missing = embedding_model.encode(
        missing_text.tolist(),
        batch_size=32,
        show_progress_bar=True,
    )

    # Combine features
    missing_blocks = [
        tfidf_missing,
        csr_matrix(tags_missing),
        csr_matrix(numerical_missing),
        csr_matrix(embeddings_missing),
    ]
    X_missing = hstack(missing_blocks)

    # Predict with the enhanced model and keep the result next to each problem
    predicted_ratings = model_enhanced.predict(X_missing)
    df_missing['predicted_rating'] = predicted_ratings

    print(f"\nPredictions generated")
    print(f"\nPredicted Rating Stats:")
    print(df_missing['predicted_rating'].describe())

    # Preview the first ten predictions, using whichever of the display
    # columns actually exist
    print(f"\nSample Predictions:")
    sample_cols = ['Title', 'TitleSlug', 'difficulty', 'predicted_rating']
    available_cols = [c for c in sample_cols if c in df_missing.columns]
    preview = df_missing[available_cols].head(10)
    print(preview.to_string(index=False))

In [None]:
# Save augmented dataset
import math
import os


def _json_safe(value):
    """Return *value* with every non-finite float replaced by ``None``.

    Dicts, lists and tuples are walked recursively (tuples come back as
    lists); every other value is returned unchanged. NaN and infinity have
    no representation in standard JSON, so they are written as ``null``.
    """
    if isinstance(value, float) and not math.isfinite(value):
        return None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


if len(df_missing) > 0:
    # Merge predictions back into main dataset (index-aligned with df_missing)
    df_merged['predicted_rating'] = np.nan
    df_merged.loc[df_missing.index, 'predicted_rating'] = df_missing['predicted_rating']

    # Create final rating column (use actual if available, else predicted)
    df_merged['final_rating'] = df_merged['Rating'].fillna(df_merged['predicted_rating'])

    # Save to JSON; create the output directory first so a fresh checkout
    # without 'generated/' does not fail with FileNotFoundError
    output_file = 'generated/merged_problems_with_ratings.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Convert to dict format similar to merged_problems.json.
    # Missing cells are NaN in the DataFrame; json.dump would emit them as the
    # bare token NaN, which strict JSON parsers reject, so map them to null.
    output_data = {
        'questions': _json_safe(df_merged.to_dict(orient='records'))
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        # allow_nan=False makes any remaining non-finite float an error
        # instead of silently producing invalid JSON
        json.dump(output_data, f, indent=2, allow_nan=False)

    print(f"Saved augmented dataset to: {output_file}")
    print(f"   Total problems: {len(df_merged)}")
    print(f"   With actual ratings: {df_merged['Rating'].notna().sum()}")
    print(f"   With predicted ratings: {df_merged['predicted_rating'].notna().sum()}")
    print(f"   Total with ratings (actual or predicted): {df_merged['final_rating'].notna().sum()}")

## 8. Visualization

In [None]:
# 8.1 Predicted vs Actual (one scatter panel per model)
def _finish_scatter_panel(ax, actual, title):
    """Add the identity line, axis labels, title, legend and grid to a panel."""
    low, high = actual.min(), actual.max()
    # Points on this dashed diagonal have predicted == actual
    ax.plot([low, high], [low, high], 'r--', lw=2, label='Perfect prediction')
    ax.set_xlabel('Actual Rating')
    ax.set_ylabel('Predicted Rating')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
baseline_ax, enhanced_ax = axes

# Baseline model (left)
baseline_ax.scatter(y_val, y_pred_baseline, alpha=0.5, s=20)
_finish_scatter_panel(
    baseline_ax,
    y_val,
    f'Baseline Model (RMSE: {rmse_baseline:.2f}, R²: {r2_baseline:.3f})',
)

# Enhanced model (right)
enhanced_ax.scatter(y_val_enh, y_pred_enhanced, alpha=0.5, s=20, color='green')
_finish_scatter_panel(
    enhanced_ax,
    y_val_enh,
    f'Enhanced Model (RMSE: {rmse_enhanced:.2f}, R²: {r2_enhanced:.3f})',
)

plt.tight_layout()
plt.show()

In [None]:
# 8.2 Residual plots (residual = actual - predicted)
def _draw_residual_panel(ax, predicted, residuals, title, **scatter_kwargs):
    """Scatter residuals against predictions and mark the zero line."""
    ax.scatter(predicted, residuals, alpha=0.5, s=20, **scatter_kwargs)
    ax.axhline(y=0, color='r', linestyle='--', lw=2)
    ax.set_xlabel('Predicted Rating')
    ax.set_ylabel('Residual (Actual - Predicted)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


residuals_baseline = y_val - y_pred_baseline
residuals_enhanced = y_val_enh - y_pred_enhanced

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Baseline (left, default colour) and enhanced (right, green)
_draw_residual_panel(axes[0], y_pred_baseline, residuals_baseline, 'Baseline Model Residuals')
_draw_residual_panel(
    axes[1],
    y_pred_enhanced,
    residuals_enhanced,
    'Enhanced Model Residuals',
    color='green',
)

plt.tight_layout()
plt.show()

# Mean residual per model
print(f"Residual mean (should be ~0):")
print(f"  Baseline: {residuals_baseline.mean():.2f}")
print(f"  Enhanced: {residuals_enhanced.mean():.2f}")

In [None]:
# 8.3 Known vs predicted rating distributions, overlaid in one histogram
if len(df_missing) == 0:
    print("No predicted ratings to visualize.")
else:
    fig, ax = plt.subplots(figsize=(12, 5))

    # Both histograms share the same bin count and styling
    hist_style = dict(bins=30, alpha=0.5, edgecolor='black')
    ax.hist(df_merged[has_rating]['Rating'], label='Known Ratings', **hist_style)
    ax.hist(df_missing['predicted_rating'], label='Predicted Ratings', **hist_style)

    ax.set_xlabel('Rating')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution: Known vs Predicted Ratings')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Summary

Implementation complete.

**Model Performance:**
- Baseline (TF-IDF + Tags + Numerical): See the metrics printed in Section 5
- Enhanced (+ Embeddings): See metrics above

**Next Steps:**
1. Review the scatter plots - predictions should cluster near the diagonal
2. Check residual plots - errors should be randomly distributed around 0
3. Verify predicted ratings align with difficulty labels
4. Use `generated/merged_problems_with_ratings.json` in your application