<a href="https://colab.research.google.com/github/ashiyaaa121/AI-worksheet/blob/main/Workshop5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Custom function to split data into training and testing sets
def train_test_split(X, y, test_size=0.3, random_state=42):

    # Set random seed so that results are reproducible
    np.random.seed(random_state)

    # Create an array of indices based on number of samples in X
    indices = np.arange(X.shape[0])

    # Shuffle the indices randomly
    np.random.shuffle(indices)

    # Calculate number of samples for the test set
    test_split_size = int(len(X) * test_size)

    # Split indices into training and testing
    train_indices = indices[:test_split_size]
    test_indices = indices[test_split_size:]

    # Create training and testing datasets using the indices
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    # Return the split datasets
    return X_train, X_test, y_train, y_test


In [4]:
def cost_function(X, Y, W):

    # Compute predicted values
    y_pred = np.matmul(X, W)

    # Calculate squared error for each data point
    squared_errors = [(y_pred[i] - Y[i]) ** 2 for i in range(len(Y))]

    # Number of samples
    m = len(Y)

    # Compute mean squared error cost
    cost = sum(squared_errors) / (2 * m)

    return cost



In [3]:
def gradient_descent(X, Y, W, alpha, iterations):

    # Store cost value for each iteration
    cost_history = [0] * iterations

    # Number of training examples
    m = len(Y)

    for iteration in range(iterations):

        # Step 1: Compute predicted values
        Y_pred = np.matmul(X, W)

        # Step 2: Calculate error (prediction - actual)
        loss = Y_pred - Y

        # Step 3: Compute gradient
        dw = (1 / m) * np.matmul(X.T, loss)

        # Step 4: Update weights
        W = W - alpha * dw

        # Step 5: Calculate cost with updated weights
        cost_history[iteration] = cost_function(X, Y, W)

    return W, cost_history


In [5]:
# Model Evaluation - RMSE
def rmse(Y, Y_pred):

    # Calculate squared error
    loss = (Y - Y_pred) ** 2

    # Compute root mean squared error
    rmse_value = np.sqrt(sum(loss) / len(Y))

    return rmse_value


In [6]:
# Model Evaluation - RÂ²
def r2(Y, Y_pred):

    # Compute mean of actual values
    mean_y = np.mean(Y)

    # Total sum of squares
    ss_tot = (Y - mean_y) ** 2

    # Residual sum of squares
    ss_res = (Y - Y_pred) ** 2

    # Compute R-squared score
    r2_value = 1 - (sum(ss_res) / sum(ss_tot))

    return r2_value


In [7]:
def main():

    # Step 1: Load dataset
    data = pd.read_csv('student.csv')

    # Step 2: Separate features (X) and target (Y)
    X = data[['Math', 'Reading']].values     # Input features
    Y = data['Writing'].values               # Output target

    # Step 3: Split data into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )

    # Step 4: Initialize weights, learning rate, and iterations
    W = np.zeros(X_train.shape[1])   # Initialize weights to zero
    alpha = 0.0001                   # Learning rate
    iterations = 1000                # Number of iterations

    # Step 5: Train model using gradient descent
    W_optimal, cost_history = gradient_descent(
        X_train, Y_train, W, alpha, iterations
    )

    # Step 6: Predict on test data
    Y_pred = np.dot(X_test, W_optimal)

    # Step 7: Evaluate model performance
    model_rmse = rmse(Y_test, Y_pred)
    model_r2 = r2(Y_test, Y_pred)

    # Step 8: Display results
    print("Final Weights:", W_optimal)
    print("Cost History (First 10 iterations):", cost_history[:10])
    print("RMSE on Test Set:", model_rmse)
    print("R-Squared on Test Set:", model_r2)


# Run the program
if __name__ == "__main__":
    main()


Final Weights: [0.10797386 0.88521888]
Cost History (First 10 iterations): [np.float64(19.24614259802574), np.float64(17.713592777548037), np.float64(17.657533670638085), np.float64(17.602841761264234), np.float64(17.548621706343425), np.float64(17.49486890572484), np.float64(17.44157933241292), np.float64(17.38874899443804), np.float64(17.336373934232473), np.float64(17.28445022833376)]
RMSE on Test Set: 4.568899888949264
R-Squared on Test Set: 0.9082364822828619
