In [None]:
import pandas as pd

# Load the dataset; the file uses ";" rather than "," as its field separator.
df = pd.read_csv("student-mat.csv", delimiter=";")

# Preview the first five rows to sanity-check the parsed columns.
print(df.head(5))


In [None]:
# Count the missing values in each column (isna is an alias of isnull).
print(df.isna().sum())


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)


In [None]:
# Predictor columns fed to the models; "G3" is the prediction target.
features = [
    "studytime",
    "absences",
    "G1",
    "G2",
    "failures",
    "goout",
    "health",
]
X = df.loc[:, features]
y = df.loc[:, "G3"]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear-regression baseline on the training split
# (fit() returns the fitted estimator itself, so the calls can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predict final grades for the held-out rows.
y_pred = model.predict(X_test)

print("Model Training Complete! 🎉")


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the linear model on the held-out split:
#   R²  - share of the variance in G3 explained (closer to 1 is better)
#   MAE - mean absolute prediction error in grade points (lower is better)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")


R² = 0.80? → The model explains about 80% of the variance in final grades (R² measures variance explained, not "percent accuracy").
MAE = 1.5? → On average, the model's predictions are off by 1.5 grade points.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split; the fixed seed makes
# the fitted forest reproducible between runs.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Predict the held-out rows and score them with the same two metrics used
# for the linear model.
y_pred_rf = rf_model.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"Random Forest MAE: {mae_rf}")
print(f"Random Forest R² Score: {r2_rf}")



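Random Forest averages the predictions of many decision trees 🌲🌲🌲.
This lets it capture non-linear patterns and often reduces error compared to Linear Regression, but the improvement is not guaranteed — compare the MAE and R² values printed for both models.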

In [None]:
# Build a one-row frame for a hypothetical student, using the same column
# names, in the same order, as the training features.
new_student = pd.DataFrame(
    [
        {
            "studytime": 3,
            "absences": 5,
            "G1": 14,
            "G2": 15,
            "failures": 0,
            "goout": 2,
            "health": 4,
        }
    ]
)

# predict() returns one value per input row; report the single prediction.
predicted_grade = rf_model.predict(new_student)
print(f"Predicted Final Grade (G3): {predicted_grade[0]:.2f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of predicted vs actual grades for the random forest.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_rf)

# Draw the y = x reference line: a point lying on it is a perfect prediction,
# so the vertical distance from the line is that student's prediction error.
# The line spans the combined range of actual and predicted values.
grade_min = min(y_test.min(), y_pred_rf.min())
grade_max = max(y_test.max(), y_pred_rf.max())
plt.plot(
    [grade_min, grade_max],
    [grade_min, grade_max],
    color="red",
    linestyle="--",
    label="Perfect prediction (y = x)",
)
plt.legend()

plt.xlabel("Actual Grades (G3)")
plt.ylabel("Predicted Grades (G3)")
plt.title("Actual vs Predicted Student Grades")
plt.show()


If points fall near the diagonal (where predicted grade = actual grade), the predictions are accurate.
The farther a point lies from that diagonal, the larger the model's error for that student.

In [None]:
# Pair each training column with the importance score the fitted forest
# assigned to it (both are in the column order of X).
feature_names = X.columns
importances = rf_model.feature_importances_

# Horizontal bar chart: one bar per feature, bar length = importance score.
plt.figure(figsize=(8, 6))
sns.barplot(x=importances, y=feature_names)
plt.title("Feature Importance for Predicting Final Grade")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()


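If G2 and G1 have high importance, it means previous grades strongly drive the model's prediction of the final grade.
If absences or goout have low importance, they contribute little to this model's predictions (this alone does not prove they have no effect on grades).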

✅ Tested the model on a new student.
✅ Plotted actual vs. predicted grades to see accuracy.
✅ Analyzed feature importance to understand what impacts grades the most.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each random-forest hyperparameter.
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of trees
    "max_depth": [5, 10, None],  # Tree depth
    "min_samples_split": [2, 5, 10],  # Minimum samples per split
}

# Evaluate every combination in the grid with 5-fold cross-validation on the
# training split, ranking by R² and using all available CPU cores.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring="r2",
)
grid_search.fit(X_train, y_train)

# Report the combination with the best cross-validated score.
print("Best Hyperparameters:", grid_search.best_params_)


The Random Forest model has hyperparameters like:

n_estimators → Number of trees 🌳
max_depth → How deep each tree goes 🌲
min_samples_split → Minimum data points needed to split a node 🔪
GridSearchCV automatically tries every combination in the grid, scoring each with 5-fold cross-validation.
It reports the combination with the best average cross-validated R² score.

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Get best parameters from Grid Search
best_params = grid_search.best_params_

# GridSearchCV refits by default (refit=True): after the search it retrains
# its RandomForestRegressor(random_state=42) with best_params on all of
# X_train and exposes the result as best_estimator_. Reusing that estimator
# avoids fitting an identical forest a second time.
best_rf_model = grid_search.best_estimator_

# Predict the held-out rows with the tuned model
y_pred_best = best_rf_model.predict(X_test)

# Score the tuned model with the same metrics as the untuned forest
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Optimized Model MAE: {mae_best}")
print(f"Optimized Model R² Score: {r2_best}")


The new model uses the settings that scored best in cross-validation on the training data.
Its test-set R² will often improve (closer to 1 means better predictions), but this is not guaranteed — check the comparison below.

In [None]:
# Side-by-side comparison of the untuned and tuned forests on the test split:
# first R² (higher is better), then MAE (lower is better).
print(
    f"Original Model R²: {r2_rf}",
    f"Optimized Model R²: {r2_best}",
    sep="\n",
)
print(
    f"Original Model MAE: {mae_rf}",
    f"Optimized Model MAE: {mae_best}",
    sep="\n",
)


Comparing the changes:
✅ Used GridSearchCV to find the best hyperparameters.
✅ Trained a better model with the best settings.
✅ Compared old vs. new model performance.

In [None]:
pip install flask pandas xgboost scikit-learn


In [None]:
from xgboost import XGBRegressor

# Fit a gradient-boosted regressor on the training split. The hyperparameters
# are fixed values written here; they are not read from the grid search.
# NOTE: this rebinds `model`, which previously held the LinearRegression fit.
model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
model.fit(X_train, y_train)

# Persist the fitted booster with XGBoost's own serializer, to a .json file.
model.save_model("student_grade_model.json")

print("Model saved successfully!")
