# In [None]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
import json
import pickle

# ============================================================================
# 1. LINEAR REGRESSION FROM SCRATCH
# ============================================================================

class LinearRegressionScratch:
    """Linear regression trained with full-batch gradient descent.

    The model is ``y_hat = X @ weights + bias``.  The recorded loss is the
    mean squared error; the parameter update uses the gradient of *half* the
    MSE, so the usual factor of 2 is effectively folded into
    ``learning_rate``.

    Attributes:
        learning_rate: Step size applied to each gradient update.
        n_iterations: Number of gradient-descent iterations run by ``fit``.
        weights: 1-D array with one coefficient per feature (``None`` until
            ``fit`` has been called).
        bias: Scalar intercept (``None`` until ``fit`` has been called).
        losses: MSE measured before each update of the most recent ``fit``.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.losses = []

    def _require_fitted(self):
        """Raise ``RuntimeError`` when the model has no parameters yet."""
        if self.weights is None or self.bias is None:
            raise RuntimeError(
                "LinearRegressionScratch must be fitted before it is used"
            )

    def fit(self, X, y):
        """Train the linear regression model.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like with ``n_samples`` targets.  A column vector of
                shape ``(n_samples, 1)`` is accepted and flattened.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or ``y`` does not hold
                exactly one target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so an (n, 1) target cannot broadcast against the (n,)
        # prediction vector into an (n, n) residual matrix.
        y = np.asarray(y, dtype=float).reshape(-1)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} targets"
            )

        # Start every fit from scratch, including the loss history, so that
        # refitting does not append to a previous run's curve.
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.losses = []

        for i in range(self.n_iterations):
            # Forward pass
            y_predicted = np.dot(X, self.weights) + self.bias
            residuals = y_predicted - y

            # Loss (MSE) measured before this iteration's update
            loss = np.mean(residuals ** 2)
            self.losses.append(loss)

            # Gradients of 0.5 * MSE with respect to weights and bias
            dw = np.dot(X.T, residuals) / n_samples
            db = np.sum(residuals) / n_samples

            # Parameter update
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            if i % 100 == 0:
                print(f"Iteration {i}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return ``X @ weights + bias`` for each row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        self._require_fitted()
        return np.dot(X, self.weights) + self.bias

    def save_model(self, filename):
        """Write the weights and bias to ``filename`` as JSON.

        The file holds a ``'weights'`` list and a ``'bias'`` float.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        self._require_fitted()
        model_data = {
            'weights': self.weights.tolist(),
            'bias': float(self.bias),
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(model_data, f)
        print(f"Linear Regression model saved to {filename}")


# ============================================================================
# 2. SVM FROM SCRATCH (linear soft-margin SVM, sub-gradient descent on hinge loss)
# ============================================================================

class SVMScratch:
    """Linear soft-margin SVM trained by per-sample sub-gradient descent.

    The decision function is ``X @ weights - bias`` (note the minus sign on
    the bias).  Training minimises
    ``lambda_param * ||weights||^2 + mean(max(0, 1 - y * decision))``.

    Attributes:
        learning_rate: Step size for every per-sample update.
        lambda_param: L2 regularisation strength.
        n_iterations: Number of passes over the training set.
        weights: 1-D array with one coefficient per feature (``None`` until
            ``fit`` has been called).
        bias: Scalar offset (``None`` until ``fit`` has been called).
        losses: Regularised hinge loss after each pass of the most recent
            ``fit``.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.losses = []

    def _require_fitted(self):
        """Raise ``RuntimeError`` when the model has no parameters yet."""
        if self.weights is None or self.bias is None:
            raise RuntimeError("SVMScratch must be fitted before it is used")

    def fit(self, X, y):
        """Train the SVM model using gradient descent.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like with ``n_samples`` labels.  Values ``<= 0`` are
                treated as the negative class (-1), everything else as the
                positive class (+1).

        Raises:
            ValueError: If ``X`` is not 2-D or ``y`` does not hold exactly one
                label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).reshape(-1)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        # Convert labels to -1 and 1
        y_ = np.where(y <= 0, -1, 1)

        # Start every fit from scratch, including the loss history, so that
        # refitting does not append to a previous run's curve.
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.losses = []

        for i in range(self.n_iterations):
            for x_i, y_i in zip(X, y_):
                margin_satisfied = y_i * (np.dot(x_i, self.weights) - self.bias) >= 1

                # The regularisation term contributes to every update.
                grad_w = 2 * self.lambda_param * self.weights
                if not margin_satisfied:
                    # The sample lies inside the margin (it may still be on
                    # the correct side), so the hinge term is active too.
                    grad_w = grad_w - y_i * x_i
                    self.bias -= self.learning_rate * y_i
                self.weights -= self.learning_rate * grad_w

            # Regularised hinge loss over the whole set after this pass
            loss = self._calculate_loss(X, y_)
            self.losses.append(loss)

            if i % 100 == 0:
                print(f"Iteration {i}, Loss: {loss:.4f}")

    def _calculate_loss(self, X, y):
        """Return the L2-regularised mean hinge loss for labels in {-1, +1}."""
        hinge = np.maximum(0, 1 - y * (np.dot(X, self.weights) - self.bias))
        return self.lambda_param * (np.linalg.norm(self.weights) ** 2) + np.mean(hinge)

    def predict(self, X):
        """Return -1.0 or +1.0 for each row of ``X``.

        Rows lying exactly on the decision boundary are assigned +1.0 so the
        result never contains 0, which callers would otherwise have to
        special-case.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        self._require_fitted()
        linear_output = np.dot(X, self.weights) - self.bias
        return np.where(linear_output >= 0, 1.0, -1.0)

    def save_model(self, filename):
        """Write the weights and bias to ``filename`` as JSON.

        The file holds a ``'weights'`` list and a ``'bias'`` float.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        self._require_fitted()
        model_data = {
            'weights': self.weights.tolist(),
            'bias': float(self.bias),
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(model_data, f)
        print(f"SVM model saved to {filename}")


# ============================================================================
# 3. DATA GENERATION & PREPROCESSING
# ============================================================================

def generate_student_data(n_samples=1000, seed=42):
    """Generate synthetic student performance data.

    Ten feature columns are drawn independently, then ``final_score`` is
    computed from a fixed weighted formula plus Gaussian noise and clipped to
    ``[0, 100]``.  ``pass_fail`` is 1 when ``final_score >= 60`` and 0
    otherwise.

    Random numbers come from a private ``RandomState`` so calling this
    function does not reseed NumPy's global generator.  With the default
    seed the values match what seeding the global generator with 42 produces.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private generator.  ``None`` gives a different
            data set on every call.

    Returns:
        A ``pandas.DataFrame`` with the ten feature columns followed by
        ``final_score`` and ``pass_fail``.
    """
    rng = np.random.RandomState(seed)

    # The order of these draws determines the generated values for a given
    # seed, so keep it stable.
    data = {
        'attendance_percent': rng.uniform(50, 100, n_samples),
        'assignment_avg': rng.uniform(40, 100, n_samples),
        'quiz_avg': rng.uniform(40, 100, n_samples),
        'midterm_score': rng.uniform(40, 100, n_samples),
        'project_score': rng.uniform(40, 100, n_samples),
        'study_hours_weekly': rng.uniform(0, 20, n_samples),
        'participation_level': rng.randint(1, 6, n_samples),
        'previous_gpa': rng.uniform(1.5, 4.0, n_samples),
        'confidence_level': rng.randint(1, 6, n_samples),
        'course_difficulty': rng.randint(1, 6, n_samples),
    }

    df = pd.DataFrame(data)

    # Target: final score (weighted formula with noise), kept within 0-100
    df['final_score'] = (
        0.25 * df['midterm_score'] +
        0.20 * df['assignment_avg'] +
        0.15 * df['quiz_avg'] +
        0.10 * df['project_score'] +
        0.05 * df['attendance_percent'] +
        (df['previous_gpa'] / 4.0) * 10 +
        (df['study_hours_weekly'] / 10.0) * 10 +
        (df['participation_level'] / 5.0) * 5 +
        (df['confidence_level'] - 3) * 1.0 +
        (3 - df['course_difficulty']) * 2.0 +
        rng.normal(0, 5, n_samples)  # noise
    ).clip(0, 100)

    # Binary classification target: pass/fail (score >= 60)
    df['pass_fail'] = (df['final_score'] >= 60).astype(int)

    return df


def prepare_data(df, scaler_path='scaler.pkl'):
    """Split the data 80/20, standardise the features, and persist the scaler.

    The regression and classification targets are split in a single
    ``train_test_split`` call so both are guaranteed to refer to the same
    rows as the feature matrices.  The scaler is fitted on the training
    features only and then applied to the test features.

    Args:
        df: DataFrame containing the ten feature columns listed below plus
            ``final_score`` and ``pass_fail``.
        scaler_path: Where to pickle the fitted ``StandardScaler``.  Pass
            ``None`` to skip writing the file.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled, y_reg_train, y_reg_test,
        y_clf_train, y_clf_test, feature_cols)``.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    feature_cols = [
        'attendance_percent', 'assignment_avg', 'quiz_avg', 'midterm_score',
        'project_score', 'study_hours_weekly', 'participation_level',
        'previous_gpa', 'confidence_level', 'course_difficulty'
    ]

    required = feature_cols + ['final_score', 'pass_fail']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required columns: {missing}")

    X = df[feature_cols].values
    y_regression = df['final_score'].values
    y_classification = df['pass_fail'].values

    # One call splits all three arrays with the same row indices.
    (X_train, X_test,
     y_reg_train, y_reg_test,
     y_clf_train, y_clf_test) = train_test_split(
        X, y_regression, y_classification, test_size=0.2, random_state=42
    )

    # Fit on the training rows only so no test-set statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    if scaler_path is not None:
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)

    return X_train_scaled, X_test_scaled, y_reg_train, y_reg_test, y_clf_train, y_clf_test, feature_cols


# ============================================================================
# 4. MODEL TRAINING & EVALUATION
# ============================================================================

def train_and_evaluate_models(X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test):
    """Fit both scratch models, print their metrics, and save them as JSON.

    The linear regression is scored with MSE and R² on the train and test
    splits and written to ``linear_regression_model.json``.  The SVM is scored
    with accuracy on both splits plus a test-set confusion matrix and written
    to ``svm_model.json``.

    Returns:
        The tuple ``(lr_model, svm_model)`` of fitted models.
    """
    rule = "=" * 80

    def to_zero_one(signed_labels):
        # An exact -1 becomes class 0; any other value becomes class 1.
        return np.where(signed_labels == -1, 0, 1)

    # ------------------------- Linear regression -------------------------
    print(rule)
    print("TRAINING LINEAR REGRESSION MODEL")
    print(rule)

    lr_model = LinearRegressionScratch(learning_rate=0.01, n_iterations=1000)
    lr_model.fit(X_train, y_reg_train)

    reg_fit_train = lr_model.predict(X_train)
    reg_fit_test = lr_model.predict(X_test)

    mse_on_train = mean_squared_error(y_reg_train, reg_fit_train)
    mse_on_test = mean_squared_error(y_reg_test, reg_fit_test)
    r2_on_train = r2_score(y_reg_train, reg_fit_train)
    r2_on_test = r2_score(y_reg_test, reg_fit_test)

    print("\n--- Linear Regression Results ---")
    print(f"Training MSE: {mse_on_train:.4f}")
    print(f"Testing MSE: {mse_on_test:.4f}")
    print(f"Training R²: {r2_on_train:.4f}")
    print(f"Testing R²: {r2_on_test:.4f}")

    lr_model.save_model('linear_regression_model.json')

    # -------------------------------- SVM --------------------------------
    print("\n" + rule)
    print("TRAINING SVM MODEL")
    print(rule)

    svm_model = SVMScratch(learning_rate=0.001, lambda_param=0.01, n_iterations=1000)
    # The SVM is trained on labels in {-1, +1} rather than {0, 1}.
    signed_train_labels = np.where(y_clf_train == 0, -1, 1)
    svm_model.fit(X_train, signed_train_labels)

    # Score against the original 0/1 labels.
    clf_fit_train = to_zero_one(svm_model.predict(X_train))
    clf_fit_test = to_zero_one(svm_model.predict(X_test))

    acc_on_train = accuracy_score(y_clf_train, clf_fit_train)
    acc_on_test = accuracy_score(y_clf_test, clf_fit_test)

    print("\n--- SVM Results ---")
    print(f"Training Accuracy: {acc_on_train:.4f}")
    print(f"Testing Accuracy: {acc_on_test:.4f}")

    test_confusion = confusion_matrix(y_clf_test, clf_fit_test)
    print("\nConfusion Matrix:")
    print(test_confusion)

    svm_model.save_model('svm_model.json')

    return lr_model, svm_model


# ============================================================================
# 5. VISUALIZATION
# ============================================================================

def plot_results(lr_model, svm_model, X_test, y_reg_test, y_clf_test):
    """Draw a 2x2 summary figure, save it to ``model_results.png``, and show it.

    Panels:
        * top-left: linear-regression loss per iteration
        * top-right: predicted vs. actual scores with a y = x reference line
        * bottom-left: SVM loss per iteration
        * bottom-right: annotated SVM confusion matrix on the test split

    Args:
        lr_model: Fitted regression model exposing ``losses`` and ``predict``.
        svm_model: Fitted classifier exposing ``losses`` and ``predict``.
        X_test: Scaled test features passed to both models' ``predict``.
        y_reg_test: True final scores for the test rows.
        y_clf_test: True 0/1 pass-fail labels for the test rows.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    loss_lr_ax, scatter_ax = axes[0, 0], axes[0, 1]
    loss_svm_ax, cm_ax = axes[1, 0], axes[1, 1]

    # 1. Linear Regression: Training Loss
    loss_lr_ax.plot(lr_model.losses, color='blue')
    loss_lr_ax.set_title('Linear Regression Training Loss')
    loss_lr_ax.set_xlabel('Iteration')
    loss_lr_ax.set_ylabel('MSE Loss')
    loss_lr_ax.grid(True, alpha=0.3)

    # 2. Linear Regression: Predictions vs Actual
    y_pred_test = lr_model.predict(X_test)
    scatter_ax.scatter(y_reg_test, y_pred_test, alpha=0.5, color='green')
    scatter_ax.plot([0, 100], [0, 100], 'r--', lw=2)  # perfect-prediction line
    scatter_ax.set_title('Linear Regression: Predicted vs Actual')
    scatter_ax.set_xlabel('Actual Score')
    scatter_ax.set_ylabel('Predicted Score')
    scatter_ax.grid(True, alpha=0.3)

    # 3. SVM: Training Loss
    loss_svm_ax.plot(svm_model.losses, color='red')
    loss_svm_ax.set_title('SVM Training Loss')
    loss_svm_ax.set_xlabel('Iteration')
    loss_svm_ax.set_ylabel('Hinge Loss')
    loss_svm_ax.grid(True, alpha=0.3)

    # 4. SVM: Confusion Matrix Heatmap
    y_pred_test_svm = np.where(svm_model.predict(X_test) == -1, 0, 1)
    # Pin the label set so the matrix is always 2x2, even when the test split
    # or the predictions contain only one class; otherwise the annotation
    # loop below would index out of bounds.
    cm = confusion_matrix(y_clf_test, y_pred_test_svm, labels=[0, 1])

    im = cm_ax.imshow(cm, cmap='Blues')
    cm_ax.set_title('SVM Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_xticks([0, 1])
    cm_ax.set_yticks([0, 1])
    cm_ax.set_xticklabels(['Fail', 'Pass'])
    cm_ax.set_yticklabels(['Fail', 'Pass'])

    # Write each count into its cell (x is the column, y is the row).
    for (row, col), count in np.ndenumerate(cm):
        cm_ax.text(col, row, count,
                   ha="center", va="center", color="black", fontsize=20)

    fig.colorbar(im, ax=cm_ax)
    plt.tight_layout()
    plt.savefig('model_results.png', dpi=300, bbox_inches='tight')
    print("\nPlots saved as 'model_results.png'")
    plt.show()


# ============================================================================
# 6. FEATURE IMPORTANCE ANALYSIS
# ============================================================================

def analyze_feature_importance(lr_model, feature_cols):
    """Rank features by the magnitude of their regression weight.

    Prints a table of signed and absolute weights sorted from largest to
    smallest magnitude, then draws the absolute weights as a horizontal bar
    chart, saves it to ``feature_importance.png``, and shows it.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("FEATURE IMPORTANCE ANALYSIS (Linear Regression)")
    print(rule)

    signed_weights = lr_model.weights
    ranking = pd.DataFrame({
        'Feature': feature_cols,
        'Weight': signed_weights,
        'Absolute Weight': np.abs(signed_weights),
    })
    # Largest magnitude first.
    importance_df = ranking.sort_values('Absolute Weight', ascending=False)

    print("\n", importance_df.to_string(index=False))

    # Horizontal bars; the axis is flipped so the top-ranked feature is on top.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(importance_df['Feature'], importance_df['Absolute Weight'], color='skyblue')
    ax.set_xlabel('Absolute Weight (Importance)')
    ax.set_title('Feature Importance in Student Performance Prediction')
    ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    print("\nFeature importance plot saved as 'feature_importance.png'")
    plt.show()


# ============================================================================
# 7. MODEL PREDICTION FUNCTION
# ============================================================================

def predict_student_performance(lr_model, svm_model, scaler, student_data, feature_cols=None):
    """Predict the final score and pass/fail outcome for one student.

    Args:
        lr_model: Fitted regression model with a ``predict`` method.
        svm_model: Fitted classifier whose ``predict`` returns -1 for fail.
        scaler: Fitted scaler with a ``transform`` method.
        student_data: Mapping of feature name to value.
        feature_cols: Optional feature names in the order the scaler and
            models were trained on.  When given, values are looked up by name
            so the key order of ``student_data`` does not matter.  When
            omitted, the insertion order of ``student_data`` is used and must
            already match the training order.

    Returns:
        A tuple ``(predicted_score, pass_fail)`` where ``predicted_score`` is
        a float clipped to ``[0, 100]`` and ``pass_fail`` is ``"Pass"`` or
        ``"Fail"``.

    Raises:
        KeyError: If ``feature_cols`` names a feature absent from
            ``student_data``.
    """
    # Build the single-row feature matrix.
    if feature_cols is None:
        ordered_values = list(student_data.values())
    else:
        missing = [name for name in feature_cols if name not in student_data]
        if missing:
            raise KeyError(f"student_data is missing features: {missing}")
        ordered_values = [student_data[name] for name in feature_cols]

    X_new = np.array([ordered_values], dtype=float)
    X_new_scaled = scaler.transform(X_new)

    # Linear Regression prediction, kept within the valid score range
    predicted_score = float(np.clip(lr_model.predict(X_new_scaled)[0], 0, 100))

    # SVM prediction: only an explicit -1 counts as a fail.  This is the same
    # mapping the evaluation code uses (-1 -> 0, everything else -> 1), so a
    # sample on the decision boundary is labelled consistently.
    pass_fail_pred = svm_model.predict(X_new_scaled)[0]
    pass_fail = "Fail" if pass_fail_pred == -1 else "Pass"

    print("\n" + "=" * 80)
    print("STUDENT PERFORMANCE PREDICTION")
    print("=" * 80)
    print("\nInput Data:")
    for key, value in student_data.items():
        print(f"  {key}: {value}")

    print(f"\nPredicted Final Score: {predicted_score:.2f}%")
    print(f"Predicted Outcome: {pass_fail}")
    print("=" * 80)

    return predicted_score, pass_fail


# ============================================================================
# 8. MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # End-to-end demo: generate data, train both models, plot, and run one
    # sample prediction.  Each banner is emitted with a single print call.
    print(
        "=" * 80,
        "STUDENT PERFORMANCE PREDICTION USING ML FROM SCRATCH",
        "=" * 80,
        sep="\n",
    )

    # Step 1: synthetic data set
    print("\n1. Generating synthetic student data...")
    df = generate_student_data(n_samples=1000)
    print(f"Generated {len(df)} student records")
    print("\nData sample:", df.head(), sep="\n")

    # Step 2: split and scale (this also writes scaler.pkl)
    print("\n2. Preparing data for training...")
    (
        X_train,
        X_test,
        y_reg_train,
        y_reg_test,
        y_clf_train,
        y_clf_test,
        feature_cols,
    ) = prepare_data(df)
    print(f"Training samples: {len(X_train)}", f"Testing samples: {len(X_test)}", sep="\n")

    # Step 3: fit and score both models
    print("\n3. Training models...")
    lr_model, svm_model = train_and_evaluate_models(
        X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test
    )

    # Step 4: diagnostic plots
    print("\n4. Creating visualizations...")
    plot_results(lr_model, svm_model, X_test, y_reg_test, y_clf_test)

    # Step 5: weight-based feature ranking
    print("\n5. Analyzing feature importance...")
    analyze_feature_importance(lr_model, feature_cols)

    # Step 6: single-student prediction using the scaler saved in step 2
    print("\n6. Testing prediction on a sample student...")
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Keys are listed in the same order as the training feature columns.
    sample_student = dict(
        attendance_percent=85.0,
        assignment_avg=78.0,
        quiz_avg=75.0,
        midterm_score=72.0,
        project_score=80.0,
        study_hours_weekly=8.0,
        participation_level=4,
        previous_gpa=3.2,
        confidence_level=4,
        course_difficulty=3,
    )

    predict_student_performance(lr_model, svm_model, scaler, sample_student)

    print("\n" + "=" * 80, "TRAINING COMPLETE!", "=" * 80, sep="\n")
    print(
        "\nGenerated files:",
        "  - linear_regression_model.json",
        "  - svm_model.json",
        "  - scaler.pkl",
        "  - model_results.png",
        "  - feature_importance.png",
        "=" * 80,
        sep="\n",
    )