In [None]:


import pandas as pd
from sklearn.model_selection import train_test_split

# Load the processed dataset (path is relative to the notebook's working directory)
df = pd.read_csv("../data/processed/processed_dataset.csv")

# Separate the regression target from the feature columns;
# every column other than the target is used as a feature.
target = 'stars_period'
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# Report the split sizes as counts and as a share of the full dataset
print("✅ Train/Test Split Complete!")
print(f"Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Testing set:  {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")
print("\nShapes:")
print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"  X_test:  {X_test.shape}, y_test:  {y_test.shape}")



✅ Train/Test Split Complete!
Training set: 1269 samples (80.0%)
Testing set:  318 samples (20.0%)

Shapes:
  X_train: (1269, 16), y_train: (1269,)
  X_test:  (318, 16), y_test:  (318,)


In [3]:
# Simple baseline models: linear regression and a random forest regressor

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression baseline: fit on the training split, then predict the test split.
# scikit-learn's fit() returns the fitted estimator, so construction and fitting chain.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)


# Random forest baseline: 100 trees, fixed seed, all CPU cores.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Models

def evaluate_model(y_true, y_pred, model_name):
    """Print the mean squared error and R^2 score of one model's predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        model_name: Label shown in the report header.

    Returns:
        None. The report is written to stdout.
    """
    scores = (
        ("Mean Squared Error", mean_squared_error(y_true, y_pred)),
        ("R^2 Score", r2_score(y_true, y_pred)),
    )
    report = [f"--- {model_name} ---"]
    report.extend(f"{label}: {value:.2f}" for label, value in scores)
    # The trailing empty entry yields the blank line that separates consecutive reports.
    report.append("")
    print("\n".join(report))

# Report both baselines on the held-out test set, in the order they were trained.
evaluate_model(y_true=y_test, y_pred=y_pred_lr, model_name="Linear Regression")
evaluate_model(y_true=y_test, y_pred=y_pred_rf, model_name="Random Forest Regressor")

--- Linear Regression ---
Mean Squared Error: 1283742.76
R^2 Score: 0.01

--- Random Forest Regressor ---
Mean Squared Error: 1672773.27
R^2 Score: -0.29



In [4]:
# XGBoost Model - gradient-boosted trees as a third regression baseline
# Install if needed: %pip install xgboost

# Import xgboost, installing it on the fly if it is missing.
try:
    import xgboost as xgb
except ImportError:
    print("Installing xgboost...")
    import importlib
    import subprocess
    import sys
    # Run pip through the interpreter that backs this kernel. A bare "pip" is
    # resolved from PATH and may install into a different environment, in which
    # case the retry import below would still fail.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    # Make the import system notice packages installed after interpreter start-up.
    importlib.invalidate_caches()
    import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("=" * 60)
print("XGBOOST MODEL - Training & Evaluation")
print("=" * 60)

# Gradient-boosted tree regressor. The hyperparameters below are fixed by hand;
# no search over them is performed in this cell.
xgb_model = xgb.XGBRegressor(
    n_estimators=200,           # Number of boosting rounds (trees)
    max_depth=6,                # Maximum tree depth
    learning_rate=0.1,          # Shrinkage applied to each tree's contribution
    subsample=0.8,              # Fraction of rows sampled per tree
    colsample_bytree=0.8,       # Fraction of columns sampled per tree
    min_child_weight=3,         # Minimum sum of instance weights (hessian) required in a child
    random_state=42,
    n_jobs=-1                   # Use all CPU cores
)

# Train the model
print("\n🔄 Training XGBoost model...")
xgb_model.fit(X_train, y_train)

# Predict on both splits so training and test error can be compared
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)

# Calculate metrics on both splits; RMSE is the square root of MSE, so it is
# in the same units as the target.
xgb_train_mse = mean_squared_error(y_train, xgb_train_pred)
xgb_test_mse = mean_squared_error(y_test, xgb_test_pred)
xgb_train_rmse = np.sqrt(xgb_train_mse)
xgb_test_rmse = np.sqrt(xgb_test_mse)
xgb_train_mae = mean_absolute_error(y_train, xgb_train_pred)
xgb_test_mae = mean_absolute_error(y_test, xgb_test_pred)
xgb_train_r2 = r2_score(y_train, xgb_train_pred)
xgb_test_r2 = r2_score(y_test, xgb_test_pred)

# Training and test metrics are printed side by side so that a gap between
# them (over-fitting) is visible at a glance.
print("\n📊 XGBoost Results:")
print("  Training Set:")
print(f"    MSE:  {xgb_train_mse:.2f}")
print(f"    RMSE: {xgb_train_rmse:.2f}")
print(f"    MAE:  {xgb_train_mae:.2f}")
print(f"    R²:   {xgb_train_r2:.4f}")
print("\n  Test Set:")
print(f"    MSE:  {xgb_test_mse:.2f}")
print(f"    RMSE: {xgb_test_rmse:.2f}")
print(f"    MAE:  {xgb_test_mae:.2f}")
print(f"    R²:   {xgb_test_r2:.4f}")

print("\n" + "=" * 60)
print("✅ XGBoost model trained!")
print("=" * 60)


Installing xgboost...
XGBOOST MODEL - Training & Evaluation

🔄 Training XGBoost model...

📊 XGBoost Results:
  Training Set:
    MSE:  54455.96
    RMSE: 233.36
    MAE:  102.45
    R²:   0.9745

  Test Set:
    MSE:  1400337.62
    RMSE: 1183.36
    MAE:  358.98
    R²:   -0.0812

✅ XGBoost model trained!


In [5]:
# Compare All Models Side-by-Side
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

print("=" * 70)
print("MODEL COMPARISON - All Models")
print("=" * 70)


def _test_set_metrics(y_pred):
    """Score predictions against ``y_test``.

    Returns:
        A ``(mse, rmse, mae, r2)`` tuple, in the column order of the comparison table.
    """
    mse = mean_squared_error(y_test, y_pred)
    return mse, np.sqrt(mse), mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)


# One (name, mse, rmse, mae, r2) row per model whose predictions exist in this kernel.
# The globals() guards let the cell run even when an earlier model cell was skipped.
models_comparison = []

# Linear Regression
if 'y_pred_lr' in globals():
    lr_mse, lr_rmse, lr_mae, lr_r2 = _test_set_metrics(y_pred_lr)
    models_comparison.append(("Linear Regression", lr_mse, lr_rmse, lr_mae, lr_r2))

# Random Forest
if 'y_pred_rf' in globals():
    rf_mse, rf_rmse, rf_mae, rf_r2 = _test_set_metrics(y_pred_rf)
    models_comparison.append(("Random Forest", rf_mse, rf_rmse, rf_mae, rf_r2))

# XGBoost
if 'xgb_test_pred' in globals():
    # Score from the predictions the guard actually checks, instead of reading
    # separately computed xgb_test_* variables that the guard does not cover.
    xgb_test_mse, xgb_test_rmse, xgb_test_mae, xgb_test_r2 = _test_set_metrics(xgb_test_pred)
    models_comparison.append(("XGBoost", xgb_test_mse, xgb_test_rmse, xgb_test_mae, xgb_test_r2))

# Print comparison table
print(f"\n{'Model':<20} {'MSE':<15} {'RMSE':<15} {'MAE':<15} {'R²':<10}")
print("-" * 70)
for name, mse, rmse, mae, r2 in models_comparison:
    print(f"{name:<20} {mse:<15.2f} {rmse:<15.2f} {mae:<15.2f} {r2:<10.4f}")

# Find best model; the ranking uses test-set MSE only (index 1 of each row)
if models_comparison:
    best_model = min(models_comparison, key=lambda x: x[1])  # Lowest MSE
    print(f"\n🏆 Best Model: {best_model[0]} (Lowest MSE: {best_model[1]:.2f})")

print("=" * 70)


MODEL COMPARISON - All Models

Model                MSE             RMSE            MAE             R²        
----------------------------------------------------------------------
Linear Regression    1283742.76      1133.02         396.80          0.0088    
Random Forest        1672773.27      1293.36         308.79          -0.2916   
XGBoost              1400337.62      1183.36         358.98          -0.0812   

🏆 Best Model: Linear Regression (Lowest MSE: 1283742.76)


In [6]:
# XGBoost feature importance: rank the columns of X by the fitted model's scores
if 'xgb_model' in globals():
    import pandas as pd

    # Pair each feature column with its importance score, highest score first
    feature_importance = pd.DataFrame(
        {
            'feature': X.columns,
            'importance': xgb_model.feature_importances_,
        }
    ).sort_values(by='importance', ascending=False)

    print("=" * 60, "XGBOOST FEATURE IMPORTANCE", "=" * 60, sep="\n")
    print("\nTop 10 Most Important Features:", "-" * 60, sep="\n")
    # Feature name is left-aligned in a 30-character column, score shown to 4 decimals
    for idx, row in feature_importance.iloc[:10].iterrows():
        print(f"{row['feature']:<30} {row['importance']:.4f}")

    print("\n" + "=" * 60)


XGBOOST FEATURE IMPORTANCE

Top 10 Most Important Features:
------------------------------------------------------------
log_forks                      0.2443
forks                          0.2257
timeframe_encoded              0.1110
normalized_forks               0.0882
stars                          0.0733
log_stars                      0.0681
search_language_encoded        0.0528
popularity_score               0.0518
language_encoded               0.0480
stars_forks_ratio              0.0260

