In [10]:
# predict_grades.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Ten hand-written student records used purely as a demo dataset.
# Each key is a column name; the four lists are row-aligned.
# A real project would read this from a CSV file instead.
data = dict(
    study_hours=[2, 3, 5, 4, 6, 7, 8, 5, 9, 10],
    attendance_percentage=[80, 85, 95, 90, 92, 98, 100, 88, 97, 99],
    midterm_score=[60, 65, 80, 75, 85, 90, 95, 70, 92, 98],
    final_grade=[65, 70, 88, 80, 90, 94, 98, 75, 95, 99],
)
df = pd.DataFrame(data)

# Show the first rows so the reader can sanity-check the table,
# then print a 50-character rule to separate this section from the next.
print("--- Dataframe Head ---")
print(df.head())
print("\n", "=" * 50, "\n", sep="")

# Separate the model inputs from the value being predicted.
# X holds the three predictor columns (the features).
X = df.loc[:, ['study_hours', 'attendance_percentage', 'midterm_score']]
# y holds the single column the model should learn to predict (the target).
y = df.loc[:, 'final_grade']

# Hold out part of the data so the model can be scored on rows it never saw.
# test_size=0.2 reserves 20% of the rows for testing; the rest is used for
# training. random_state=42 is the seed passed to the splitter.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up in each partition.
print(f"Training Data Size: {len(X_train)}")
print(f"Testing Data Size: {len(X_test)}")
print("\n", "=" * 50, "\n", sep="")

# Build and fit a Linear Regression model in a single step.
# Linear Regression fits a linear equation that relates the target (y)
# to one or more input variables (X); fit() returns the estimator itself,
# so the fitted model can be bound directly to `model`.
model = LinearRegression().fit(X_train, y_train)

# Predict final grades for the held-out test rows.
y_pred = model.predict(X_test)

# Score the test-set predictions against the true final grades.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Print each metric (two decimals) followed by a one-line explanation,
# then a 50-character rule to close the section.
report_lines = (
    "--- Model Evaluation ---",
    f"Mean Squared Error (MSE): {mse:.2f}",
    "MSE is the average of the squared differences between predicted and actual values. Lower is better.",
    f"R-squared (R2) Score: {r2:.2f}",
    "R-squared measures how well the model's predictions match the actual values. A score of 1.0 is a perfect match.",
)
for report_line in report_lines:
    print(report_line)
print("\n", "=" * 50, "\n", sep="")

# Run the fitted model on one previously unseen student.
print("--- Making a New Prediction ---")

# A one-row frame whose columns match the features the model was trained on.
new_student_data = pd.DataFrame([
    {'study_hours': 6, 'attendance_percentage': 90, 'midterm_score': 82},
])

# predict() returns one value per input row; there is a single row here,
# so element 0 is this student's predicted final grade.
predicted_grade = model.predict(new_student_data)

student_features = new_student_data.to_dict('records')[0]
print(f"Features for new student: {student_features}")
print(f"The predicted final grade for the new student is: {predicted_grade[0]:.2f}")


--- Dataframe Head ---
   study_hours  attendance_percentage  midterm_score  final_grade
0            2                     80             60           65
1            3                     85             65           70
2            5                     95             80           88
3            4                     90             75           80
4            6                     92             85           90


Training Data Size: 8
Testing Data Size: 2


--- Model Evaluation ---
Mean Squared Error (MSE): 1.21
MSE is the average of the squared differences between predicted and actual values. Lower is better.
R-squared (R2) Score: 0.99
R-squared measures how well the model's predictions match the actual values. A score of 1.0 is a perfect match.


--- Making a New Prediction ---
Features for new student: {'study_hours': 6, 'attendance_percentage': 90, 'midterm_score': 82}
The predicted final grade for the new student is: 85.05
