#TO-DO 1: Data Understanding & Preparation

In [15]:
#To-Do-1: Import libraries and load the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the student scores CSV into a DataFrame. The path looks like a
# Colab-mounted Google Drive location - adjust it when running elsewhere.
data = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/student.csv")

In [16]:
# Preview both ends of the dataset: first five rows, then last five rows.
# sep="\n" puts each frame on the line after its caption.
print("Top 5 rows:", data.head(), sep="\n")
print("\nBottom 5 rows:", data.tail(), sep="\n")

Top 5 rows:
   Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62

Bottom 5 rows:
     Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72


In [17]:
#Dataset information
# info() writes its report itself (index range, per-column non-null counts and
# dtypes, memory usage), so it is called bare instead of being wrapped in print().
print("\nDataset Info:")
data.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
print("\nDataset Description:", data.describe(), sep="\n")


Dataset Description:
              Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000
mean     67.290000    69.872000    68.616000
std      15.085008    14.657027    15.241287
min      13.000000    19.000000    14.000000
25%      58.000000    60.750000    58.000000
50%      68.000000    70.000000    69.500000
75%      78.000000    81.000000    79.000000
max     100.000000   100.000000   100.000000


In [19]:
# Separate predictors (X) from the response (Y) as plain NumPy arrays.
X = data[["Math", "Reading"]].to_numpy()   # features: Math and Reading scores
Y = data["Writing"].to_numpy()             # target: Writing score

#TO-DO 2: Matrix Representation (No Bias Term)

In [20]:
#To-Do-2: Matrix shapes used by the model (no bias term)
#   X : (n_samples, n_features)  feature matrix
#   W : (n_features,)            weight vector
#   Y : (n_samples,)             target vector

print("Shape of X:", np.shape(X))
print("Shape of Y:", np.shape(Y))

Shape of X: (1000, 2)
Shape of Y: (1000,)


#TO-DO 3: Train-Test Split (From Scratch)

In [21]:
#To-Do-3: Train-Test Split (from scratch, NumPy only)
# scikit-learn's helper is still imported here because main() calls
# train_test_split by name.
from sklearn.model_selection import train_test_split


def train_test_split_scratch(X, Y, test_size=0.2, random_state=42):
    """
    Shuffle the samples and split them into train and test subsets.

    Parameters
    ----------
    X : array-like
        Feature matrix; the first axis indexes samples.
    Y : array-like
        Targets, one per sample (same length as X).
    test_size : float
        Fraction of samples assigned to the test subset, strictly between
        0 and 1. The test count is rounded up.
    random_state : int or None
        Seed for the shuffle; the same seed always yields the same split.

    Returns
    -------
    X_train, X_test, Y_train, Y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If X and Y differ in length, test_size is outside (0, 1), or the
        split would leave the training subset empty.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    n_samples = len(X)

    if len(Y) != n_samples:
        raise ValueError("X and Y must contain the same number of samples")
    if not 0 < test_size < 1:
        raise ValueError("test_size must be strictly between 0 and 1")

    n_test = int(np.ceil(n_samples * test_size))
    if n_test >= n_samples:
        raise ValueError("test_size leaves no samples for training")

    # One shared permutation keeps every row of X paired with its Y value.
    shuffled = np.random.default_rng(random_state).permutation(n_samples)
    test_idx, train_idx = shuffled[:n_test], shuffled[n_test:]

    return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]


# Same 80/20 proportions as before; the particular rows chosen differ from
# scikit-learn's because the shuffle is implemented differently.
X_train, X_test, Y_train, Y_test = train_test_split_scratch(
    X, Y, test_size=0.2, random_state=42
)

#TO-DO 4: Cost Function (MSE)

In [22]:
#To-Do-4: Cost Function Implementation
def cost_function(X, Y, W):
    """
    Half mean-squared-error cost of the linear model ``np.dot(X, W)``.

    Parameters
    ----------
    X : feature matrix, one row per sample
    Y : target values, one per sample
    W : weight vector, one entry per feature

    Returns
    -------
    The sum of squared residuals divided by ``2 * len(Y)``.
    """
    sample_count = len(Y)
    residuals = np.dot(X, W) - Y
    scale = 1 / (2 * sample_count)
    return scale * np.sum(np.square(residuals))

#TO-DO 5: Cost Function Test Case

In [29]:
#To-Do-5: Test Cost Function
# The toy inputs get their own names so the X_test / Y_test produced by the
# train-test split cell are not overwritten.
X_toy = np.array([[1, 2], [3, 4], [5, 6]])
Y_toy = np.array([3, 7, 11])
W_toy = np.array([1, 1])

# Every row satisfies x1 + x2 == y, so W = [1, 1] fits perfectly and the
# cost has to be zero.
cost = cost_function(X_toy, Y_toy, W_toy)

# Tolerance-based comparison: the cost is a float, so avoid exact equality.
if np.isclose(cost, 0):
    print("Proceed Further")
else:
    print("Something went wrong")

print("Cost:", cost)

Proceed Further
Cost: 0.0


#TO-DO 6: Gradient Descent from Scratch

In [24]:
#To-Do-6: Gradient Descent Implementation
def gradient_descent(X, Y, W, alpha, iterations):
    """
    Fit linear-model weights with batch gradient descent.

    Parameters
    ----------
    X : feature matrix (must support ``.T``), one row per sample
    Y : target values, one per sample
    W : starting weight vector
    alpha : learning rate (step size)
    iterations : number of update steps to perform

    Returns
    -------
    (weights, cost_history) : the weights after the last step and a list
    holding the cost measured after each step.
    """
    n_samples = len(Y)
    weights = W
    cost_history = []

    for _ in range(iterations):
        # Residuals of the current fit.
        residuals = np.dot(X, weights) - Y

        # Average gradient over all samples, then one step against it.
        gradient = (1 / n_samples) * np.dot(X.T, residuals)
        weights = weights - alpha * gradient

        # The cost is recorded after the update, not before it.
        cost_history.append(cost_function(X, Y, weights))

    return weights, cost_history

#TO-DO 7: Gradient Descent Test Case

In [25]:
#To-Do-7: Test Gradient Descent
# The synthetic inputs get their own names so the dataset's X / Y defined
# earlier are not replaced by random arrays.
np.random.seed(0)   # fixed seed so the printed numbers are reproducible

X_rand = np.random.rand(100, 3)
Y_rand = np.random.rand(100)
W_rand = np.random.rand(3)

alpha = 0.01
iterations = 1000

final_params, cost_history = gradient_descent(X_rand, Y_rand, W_rand, alpha, iterations)

# Sanity check: training must have lowered the cost.
assert cost_history[-1] < cost_history[0], "gradient descent did not reduce the cost"

print("Final Parameters:", final_params)
print("First 10 Cost values:", cost_history[:10])

Final Parameters: [0.20551667 0.54295081 0.10388027]
First 10 Cost values: [np.float64(0.10711197094660153), np.float64(0.10634880599939901), np.float64(0.10559826315680618), np.float64(0.10486012948320558), np.float64(0.1041341956428534), np.float64(0.10342025583900626), np.float64(0.1027181077540776), np.float64(0.1020275524908062), np.float64(0.10134839451441931), np.float64(0.1006804415957737)]


#TO-DO 8: RMSE Implementation

In [34]:
#To-Do-8: RMSE Implementation
def rmse(Y, Y_pred):
    """
    Root-mean-squared error between actual and predicted values.

    Y and Y_pred must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    squared_errors = np.square(Y - Y_pred)
    return np.sqrt(np.mean(squared_errors))

#TO-DO 9: R² Score Implementation

In [33]:
#To-Do-9: R-Squared Implementation
def r2(Y, Y_pred):
    """
    Coefficient of determination (R squared) of predictions against targets.

    Parameters
    ----------
    Y : array-like of actual values
    Y_pred : array-like of predicted values, same shape as Y

    Returns
    -------
    ``1 - ss_res / ss_tot``. When the targets are constant (``ss_tot == 0``)
    the ratio is undefined, so the result is 1.0 for a perfect prediction and
    0.0 otherwise instead of nan / -inf. Empty input yields nan.
    """
    Y = np.asarray(Y, dtype=float)
    Y_pred = np.asarray(Y_pred, dtype=float)

    # Without samples the score is undefined; the plain formula gives 0/0 here.
    if Y.size == 0:
        return float("nan")

    ss_res = np.sum((Y - Y_pred) ** 2)
    ss_tot = np.sum((Y - np.mean(Y)) ** 2)

    # Constant targets: avoid dividing by zero.
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - (ss_res / ss_tot)

#TO-DO 10: Main Function (Complete Workflow)

In [28]:
#To-Do-10: Integrate Everything
def main(
    data_path="/content/drive/MyDrive/student.csv",
    alpha=0.00001,
    iterations=1000,
    test_size=0.2,
    random_state=42,
):
    """
    Run the full workflow: load data, split, train, predict and evaluate.

    The model predicts the Writing score from the Math and Reading scores
    with a linear model that has no bias term, trained by gradient descent
    from all-zero weights. Results are printed; nothing is returned.

    Parameters
    ----------
    data_path : path of the CSV file; it must contain the columns
        'Math', 'Reading' and 'Writing'
    alpha : learning rate passed to gradient_descent
    iterations : number of gradient-descent steps
    test_size : fraction of rows held out for evaluation
    random_state : seed for the train-test shuffle

    All defaults reproduce the values that were previously hard-coded, so
    calling ``main()`` without arguments behaves as before.
    """
    # Load Data
    data = pd.read_csv(data_path)

    X = data[['Math', 'Reading']].values
    Y = data['Writing'].values

    # Train-Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Initialize: one zero weight per feature column
    W = np.zeros(X_train.shape[1])

    # Train Model
    W_optimal, cost_history = gradient_descent(
        X_train, Y_train, W, alpha, iterations
    )

    # Predictions on the held-out rows
    Y_pred = np.dot(X_test, W_optimal)

    # Evaluation
    print("Final Weights:", W_optimal)
    print("First 10 Cost Values:", cost_history[:10])
    print("RMSE:", rmse(Y_test, Y_pred))
    print("R2 Score:", r2(Y_test, Y_pred))


if __name__ == "__main__":
    main()

Final Weights: [0.34811659 0.64614558]
First 10 Cost Values: [np.float64(2013.165570783755), np.float64(1640.286832599692), np.float64(1337.0619994901588), np.float64(1090.4794892850578), np.float64(889.9583270083234), np.float64(726.8940993009545), np.float64(594.2897260808594), np.float64(486.4552052951635), np.float64(398.7634463599484), np.float64(327.4517147324688)]
RMSE: 5.2798239764188635
R2 Score: 0.8886354462786421
