In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score

# --- 1. Load the Dataset / 2. Data Preprocessing (A. One-Hot Encoding) ---
# The CSV is read from an absolute path under /content (the Colab working
# folder); upload 'Student_Performance.csv' there or adjust the path.
# Loading and encoding are written as one chain so `df` is only ever bound
# to the encoded frame.
df = (
    # 1. Read the raw student-performance table with pandas.
    pd.read_csv('/content/Student_Performance.csv')
    # 2A. Replace the categorical "Extracurricular Activities" column with
    #     indicator columns. drop_first=True drops the first category level,
    #     so a two-level (Yes/No, per the report) column becomes a single
    #     indicator column.
    .pipe(pd.get_dummies, columns=['Extracurricular Activities'], drop_first=True)
)

# B. Train/test split, then Standardization (Scaling)
# The split is performed BEFORE scaling. Fitting the scaler on the full
# dataset would let the mean/std of the held-out test rows influence the
# preprocessing (data leakage), and the scaler saved for deployment would
# carry test-set statistics.

# --- 3. Splitting the Data ---
# Separate the predictors (X) from the target variable (y = 'Performance Index').
X = df.drop(columns=['Performance Index'])
y = df['Performance Index']

# 80% training / 20% testing; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies so the column assignments below operate on independent
# frames (avoids chained-assignment warnings and never writes back into X/df).
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize "Hours Studied" and "Previous Scores", which live on different
# value ranges. The scaler learns its mean/std from the training rows only;
# the test rows are transformed with those same training statistics.
# NOTE: `df` and `X` are intentionally left unscaled; only X_train / X_test
# hold the standardized values.
scaler = StandardScaler()
cols_to_scale = ['Hours Studied', 'Previous Scores']

X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# --- 4. Algorithm & Training ---
# Create the linear regression estimator and fit it on the training split in
# a single expression; fit() returns the estimator itself, so `model` is the
# trained LinearRegression instance.
model = LinearRegression().fit(X_train, y_train)

# --- 5. Prediction ---
# Predict the target for the held-out test predictors.
y_pred = model.predict(X_test)

# --- 6. Evaluation ---
# Score the test-set predictions with Mean Absolute Error and R-squared,
# in that order, and unpack the two results into `mae` and `r2`.
mae, r2 = (metric(y_test, y_pred) for metric in (mean_absolute_error, r2_score))

# --- 7. Output Results ---
# One print call, one argument per output line (sep="\n"). MAE is shown with
# 2 decimal places and R2 with 4. The last three lines are fixed reference
# values quoted from the report, not computed here.
# NOTE(review): the output recorded with this cell shows MAE 1.61, which does
# not match the "around 0.85" quoted below - confirm the report's figure.
print(
    "Model Evaluation Results:",
    "-------------------------",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R-Squared Score (R2): {r2:.4f}",
    "\nExpected from Report:",
    "MAE should be around 0.85 [cite: 141]",
    "R2 should be around 0.99 [cite: 141]",
    sep="\n",
)

Model Evaluation Results:
-------------------------
Mean Absolute Error (MAE): 1.61
R-Squared Score (R2): 0.9890

Expected from Report:
MAE should be around 0.85 [cite: 141]
R2 should be around 0.99 [cite: 141]


In [2]:
import joblib

# Persist the fitted estimator and the fitted scaler as two separate files in
# the current working directory. Both are needed to score new data: the
# scaler to preprocess the inputs, the model to predict from them.
for artifact, filename in ((model, 'lr_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)

# Tell the user which files were written and where to fetch them.
print(
    "Files saved: 'lr_model.pkl' and 'scaler.pkl'",
    "Please download these two files from the Colab files tab.",
    sep="\n",
)

Files saved: 'lr_model.pkl' and 'scaler.pkl'
Please download these two files from the Colab files tab.
