## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb

# Model Persistence
import joblib
import os

## 2. Load Dataset

In [None]:
# Read the repository dataset from a path relative to the working directory
# and report its dimensions.
df = pd.read_csv('../data/repositories.csv')
print("Dataset Shape: {}".format(df.shape))
print("\nFirst few rows:")
# Trailing expression: the notebook renders it as the cell's output.
df.head()

## 3. Data Preprocessing

### 3.1 Handle Missing Values

In [None]:
# Count null entries per column once, then reuse the counts for the total.
missing_per_column = df.isnull().sum()

print("Missing Values Before Preprocessing:")
print(missing_per_column)
print(f"\nTotal missing: {missing_per_column.sum()}")

In [None]:
# Handle missing values.
#
# 1. 'Language' and 'License' get an explicit placeholder category, so a
#    missing value stays distinguishable from every real category.
# 2. Any other column that still has nulls is imputed: numeric columns with
#    the median, everything else with the most frequent value.
#
# Every fill is assigned back with `df[col] = ...`. Calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series; pandas
# 2.x warns about it and with Copy-on-Write enabled it leaves `df` unchanged.
placeholder_fills = {
    'Language': 'Unknown',
    'License': 'No License',
}
for col, placeholder in placeholder_fills.items():
    if col in df.columns:
        df[col] = df[col].fillna(placeholder)

for col in list(df.columns):
    column = df[col]
    if not column.isnull().any():
        continue

    # Accept every numeric dtype (int32, float32, nullable Int64, ...), not
    # only float64/int64. Booleans are excluded so they take the mode branch.
    is_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_numeric:
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # Entirely null column: there is no observed value to impute
            # from, so report it instead of failing on `mode()[0]`.
            print(f"Skipping '{col}': no non-null values to impute from")
            continue
        fill_value = modes.iloc[0]

    df[col] = column.fillna(fill_value)

print("\nMissing Values After Preprocessing:")
print(df.isnull().sum().sum())

### 3.2 Remove Multicollinearity

**Issue:** Watchers ≈ Stars (correlation ≈ 0.99), which causes near-perfect multicollinearity.  
**Solution:** Drop the Watchers column.

In [None]:
# 'Watchers' is treated as redundant with 'Stars'; remove it when present.
has_watchers = 'Watchers' in df.columns
if has_watchers:
    print("Dropping 'Watchers' column (multicollinearity with Stars)")
    df = df.drop(columns='Watchers')

print(f"\nDataset shape after dropping Watchers: {df.shape}")

### 3.3 Feature Selection for Modeling

In [None]:
# Choose the modeling columns.
# Target: 'Stars'. Predictors: a fixed set of numeric and categorical columns.
numerical_features = ['Forks', 'Open Issues', 'Size']
categorical_features = ['Language', 'Has Wiki', 'Has Issues', 'Has Projects']

# Candidate order is target first, then numeric, then categorical. Names that
# are not present in df are left out of the selection.
columns_to_keep = [
    name
    for name in ['Stars', *numerical_features, *categorical_features]
    if name in df.columns
]

# Work on an independent copy so columns added later do not touch df.
df_model = df.loc[:, columns_to_keep].copy()

print(f"Features for modeling: {list(df_model.columns)}")
print(f"Dataset shape: {df_model.shape}")

### 3.4 Handle Extreme Skewness

Apply log transformation to highly skewed numerical features.

In [None]:
# Add a log1p-transformed copy of each numeric feature; log1p(x) = log(1 + x),
# so zero values are well defined.
for feature in numerical_features:
    if feature not in df_model.columns:
        continue
    log_name = f'{feature}_log'
    df_model[log_name] = np.log1p(df_model[feature])
    print(f"Created log-transformed feature: {log_name}")

# The target gets the same transform and is stored as 'Stars_log'.
df_model['Stars_log'] = np.log1p(df_model['Stars'])

print(f"\nDataset shape after feature engineering: {df_model.shape}")

### 3.5 Encode Categorical Variables

In [None]:
# Integer-encode each categorical column into '<name>_encoded' and keep the
# fitted encoder, keyed by the original column name, for later reuse.
label_encoders = {}

for feature in categorical_features:
    if feature not in df_model.columns:
        continue
    encoder = LabelEncoder()
    # Values are cast to str before fitting, so the encoder's classes are
    # strings regardless of the column's original dtype.
    as_text = df_model[feature].astype(str)
    df_model[f'{feature}_encoded'] = encoder.fit_transform(as_text)
    label_encoders[feature] = encoder
    print(f"Encoded {feature}: {len(encoder.classes_)} unique values")

print(f"\nTotal label encoders saved: {len(label_encoders)}")

### 3.6 Prepare Features and Target

In [None]:
# Model inputs are the engineered columns: any column whose name contains
# '_log' or '_encoded', except the log-transformed target itself.
feature_columns = []
for column_name in df_model.columns:
    if column_name == 'Stars_log':
        continue
    if '_log' in column_name or '_encoded' in column_name:
        feature_columns.append(column_name)

X = df_model[feature_columns]
# The models are fit against the log-transformed target.
y = df_model['Stars_log']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeatures used: {list(X.columns)}")

### 3.7 Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_train = X_train.shape[0]
n_test = X_test.shape[0]
n_total = len(X)

print(f"Training set size: {n_train} samples")
print(f"Testing set size: {n_test} samples")
print(f"Train/Test split: {n_train/n_total*100:.1f}% / {n_test/n_total*100:.1f}%")

### 3.8 Feature Scaling

In [None]:
# Standardize the features. The scaler's statistics come from the training
# rows only and are then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed")
print(f"Scaled training set shape: {X_train_scaled.shape}")
print(f"Scaled testing set shape: {X_test_scaled.shape}")

## 4. Model Training and Evaluation

Train multiple models and compare performance.

In [None]:
# Candidate regressors, keyed by display name. Insertion order is the order
# in which they are trained and reported.
# The three ensemble models share the same size and seed.
ensemble_kwargs = {'n_estimators': 100, 'random_state': 42}

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_jobs=-1, **ensemble_kwargs),
    'Gradient Boosting': GradientBoostingRegressor(**ensemble_kwargs),
    'XGBoost': xgb.XGBRegressor(n_jobs=-1, **ensemble_kwargs),
}

print(f"Total models to train: {len(models)}")

In [None]:
# Fit every candidate model on the scaled training data and score it.
#
# For each model this records, under its display name in `results`:
#   'model'      - the fitted estimator
#   'train_r2'   - R² on the training set
#   'test_r2'    - R² on the test set
#   'test_rmse'  - root mean squared error on the test set
#   'test_mae'   - mean absolute error on the test set
# All metrics are computed on the log-transformed target (y is 'Stars_log').
results = {}
banner = '=' * 60

for name, model in models.items():
    print("\n" + banner)
    print(f"Training {name}...")
    print(banner)

    model.fit(X_train_scaled, y_train)

    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)

    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    # RMSE is the square root of the MSE, so it is in the target's own units.
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)

    results[name] = {
        'model': model,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
    }

    # The labels previously printed a mis-encoded superscript two ("R¬≤").
    print(f"Training R²: {train_r2:.4f}")
    print(f"Testing R²: {test_r2:.4f}")
    print(f"Testing RMSE: {test_rmse:.4f}")
    print(f"Testing MAE: {test_mae:.4f}")

## 5. Model Comparison

In [None]:
# Tabulate the stored metrics, one row per model, best test R-squared first.
# NOTE(review): the 'Train R¬≤' / 'Test R¬≤' labels look like a mis-encoded
# "R²". They are kept verbatim because the plotting cell indexes the
# DataFrame by these exact column names.
metric_columns = {
    'Train R¬≤': 'train_r2',
    'Test R¬≤': 'test_r2',
    'Test RMSE': 'test_rmse',
    'Test MAE': 'test_mae',
}

table = {'Model': list(results)}
for label, metric_key in metric_columns.items():
    table[label] = [scores[metric_key] for scores in results.values()]

comparison_df = pd.DataFrame(table).sort_values('Test R¬≤', ascending=False)

rule = "=" * 80
print("\n" + rule)
print("MODEL COMPARISON RESULTS")
print(rule)
print(comparison_df.to_string(index=False))
print("\n" + rule)

In [None]:
# Two side-by-side horizontal bar charts comparing the models on the test set.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_r2, ax_rmse = axes

# Left panel: test R-squared, sorted ascending so the best model is the top bar.
comparison_df_sorted = comparison_df.sort_values('Test R¬≤')
ax_r2.barh(
    comparison_df_sorted['Model'],
    comparison_df_sorted['Test R¬≤'],
    color='steelblue',
)
ax_r2.set(xlabel='R¬≤ Score', title='Model Performance Comparison (R¬≤ Score)')
ax_r2.grid(axis='x', alpha=0.3)

# Right panel: test RMSE, sorted descending so the lowest error is the top bar.
comparison_df_sorted_rmse = comparison_df.sort_values('Test RMSE', ascending=False)
ax_rmse.barh(
    comparison_df_sorted_rmse['Model'],
    comparison_df_sorted_rmse['Test RMSE'],
    color='coral',
)
ax_rmse.set(xlabel='RMSE', title='Model Performance Comparison (RMSE - Lower is Better)')
ax_rmse.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Select Best Model

In [None]:
# Select the model with the highest test R².
#
# The winner is taken directly from `results`, not from the first row of
# `comparison_df`. This keeps the selection correct even if the comparison
# table has been re-sorted or rebuilt in a different order.
best_model_name = max(results, key=lambda name: results[name]['test_r2'])
best_scores = results[best_model_name]
best_model = best_scores['model']

# The emoji, bullets and "R²" below were previously mis-encoded in the output.
print(f"\n🏆 BEST MODEL: {best_model_name}")
print("\nPerformance Metrics:")
print(f"  • Training R²: {best_scores['train_r2']:.4f}")
print(f"  • Testing R²: {best_scores['test_r2']:.4f}")
print(f"  • Testing RMSE: {best_scores['test_rmse']:.4f}")
print(f"  • Testing MAE: {best_scores['test_mae']:.4f}")

## 7. Feature Importance (for tree-based models)

In [None]:
# Show feature importance if model supports it
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance.to_string(index=False))
    
    # Visualize
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'], color='forestgreen')
    plt.xlabel('Importance')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.gca().invert_yaxis()
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print(f"\nNote: {best_model_name} does not support feature importance extraction.")

## 8. Prediction vs Actual Analysis

In [None]:
# Score the held-out rows with the selected model.
y_pred_best = best_model.predict(X_test_scaled)

# Both target and predictions are on the log1p scale; expm1 is its inverse,
# which puts them back into star counts.
y_pred_actual = np.expm1(y_pred_best)
y_test_actual = np.expm1(y_test)

# Endpoints of the y = x reference line, spanning the range of actual values.
diagonal = [y_test_actual.min(), y_test_actual.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_test_actual, y_pred_actual, alpha=0.3, s=10)
plt.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')
plt.title(f'Predicted vs Actual Stars - {best_model_name}')
plt.xlabel('Actual Stars')
plt.ylabel('Predicted Stars')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 9. Save Models and Preprocessors

Save all necessary objects for deployment in Streamlit.

In [None]:
# Persist the fitted objects for the Streamlit deployment.
#
# Written to `models_dir` (created if missing), in this order:
#   best_model.pkl, scaler.pkl, label_encoders.pkl, feature_columns.pkl,
#   model_metadata.pkl, then one '<model name>.pkl' per trained model.
# The best model is therefore written twice: once as best_model.pkl and once
# under its own name.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Summary of the selected model's scores and the shape of the split.
metadata = {
    'model_name': best_model_name,
    'test_r2': results[best_model_name]['test_r2'],
    'test_rmse': results[best_model_name]['test_rmse'],
    'test_mae': results[best_model_name]['test_mae'],
    'training_samples': X_train.shape[0],
    'testing_samples': X_test.shape[0],
    'features': feature_columns,
}

# (file name, object to dump, status message), in write order.
core_artifacts = [
    ('best_model.pkl', best_model, f"Saved best model: {best_model_name}"),
    ('scaler.pkl', scaler, "Saved scaler"),
    ('label_encoders.pkl', label_encoders, "Saved label encoders"),
    ('feature_columns.pkl', feature_columns, "Saved feature columns"),
    ('model_metadata.pkl', metadata, "Saved model metadata"),
]
for filename, artifact, message in core_artifacts:
    joblib.dump(artifact, os.path.join(models_dir, filename))
    # The check mark was previously mis-encoded in the printed output.
    print(f"✅ {message}")

# Save every trained model as well, so they can be compared later.
for name, result in results.items():
    # 'Random Forest' -> 'random_forest.pkl'
    model_filename = name.lower().replace(' ', '_')
    joblib.dump(result['model'], os.path.join(models_dir, f'{model_filename}.pkl'))
    print(f"✅ Saved {name}")

print(f"\n{'='*60}")
print(f"All models and preprocessors saved to '{models_dir}/'")
print(f"{'='*60}")

## 10. Summary

### Data Preprocessing Steps Completed:
1. ✅ Handled missing values (Language, License)
2. ✅ Removed multicollinearity (dropped Watchers)
3. ✅ Log-transformed skewed features
4. ✅ Encoded categorical variables
5. ✅ Scaled numerical features
6. ✅ Split into train/test sets (80/20)

### Models Trained:
1. Linear Regression
2. Ridge Regression
3. Lasso Regression
4. Random Forest
5. Gradient Boosting
6. XGBoost

### Next Steps:
- Deploy best model in Streamlit application
- Enable runtime predictions based on user input
- Visualize model performance interactively