<a href="https://colab.research.google.com/github/anjali-1002/FFML_Projects_and_Labs/blob/main/Project_Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report

import joblib

# Read the student records from disk.
# NOTE(review): some distributions of this dataset are ';'-delimited — confirm
# the local copy is comma-separated, otherwise read_csv needs sep=';'.
data = pd.read_csv('student-mat.csv')

# Swap every text column for integer codes. The fitted encoder for each
# column is kept so the codes can be translated back to labels later.
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    le = LabelEncoder()
    le.fit(data[column])
    data[column] = le.transform(data[column])
    label_encoders[column] = le

# Regression target is G3 (the final grade); every other column is an input.
y = data['G3']
X = data.drop(columns=['G3'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. Linear Regression: untuned baseline.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# 2. Random Forest: hyper-parameters chosen by an exhaustive grid search
#    (3-fold cross-validation on the training split, scored by R²).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}
rf = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,  # use every available core for the search
)
grid_rf.fit(X_train, y_train)
rf_best = grid_rf.best_estimator_
rf_preds = rf_best.predict(X_test)

# 3. XGBoost: fixed hyper-parameters, no search.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

# Evaluate models
def evaluate_model(name, y_true, y_pred):
    """Print the R² score and RMSE of one model's predictions.

    Args:
        name: Label used as the prefix of both report lines.
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.

    Returns:
        None. The report is written to standard output, followed by a
        40-character dashed separator.
    """
    r2 = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"{name} R² Score: {r2:.4f}")
    print(f"{name} RMSE: {rmse:.4f}")
    print("-" * 40)

# Report hold-out metrics for each regressor, in a fixed order.
print("Regression Evaluation:")
regression_candidates = {
    'Linear Regression': (lr_model, lr_preds),
    'Random Forest (Best)': (rf_best, rf_preds),
    'XGBoost': (xgb_model, xgb_preds),
}
for candidate_name, (_, candidate_preds) in regression_candidates.items():
    evaluate_model(candidate_name, y_test, candidate_preds)

# Save the best model.
# The tuned Random Forest used to be pickled unconditionally, so the file
# called "best" could hold a model that was beaten by another regressor.
# Now the model with the highest hold-out R² is selected and persisted.
# NOTE(review): selection uses the same test split that is reported above;
# switch to a separate validation split if an unbiased estimate matters.
best_model_name = max(
    regression_candidates,
    key=lambda candidate: r2_score(y_test, regression_candidates[candidate][1]),
)
best_model = regression_candidates[best_model_name][0]
print(f"Best regressor by R²: {best_model_name}")
joblib.dump(best_model, 'best_student_performance_model.pkl')
print("Best model saved as 'best_student_performance_model.pkl'.")

# Feature Importance
# Pair each input column with the importance score exposed by the tuned
# forest, then order the table from most to least important.
features = X.columns
importances = rf_best.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Plot Feature Importance as a horizontal bar chart (one bar per feature).
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance from Random Forest')
plt.show()

# --------------------------------------------
# Classification Part (Pass/Fail prediction)
# --------------------------------------------

# Derive the binary target: 1 (pass) when G3 is at least 10, otherwise 0 (fail).
data['pass_fail'] = (data['G3'] >= 10).astype('int64')

# Both G3 and the label derived from it are removed from the inputs so the
# classifier cannot read the answer directly.
y_cls = data['pass_fail']
X_cls = data.drop(columns=['G3', 'pass_fail'])

# Same 80/20 split and seed as the regression experiment.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)

# Random Forest Classifier with 100 trees.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(Xc_train, yc_train)
yc_preds = rfc.predict(Xc_test)

# Classification Evaluation: overall accuracy, then the per-class report.
print("\nClassification (Pass/Fail) Evaluation:")
accuracy = accuracy_score(yc_test, yc_preds)
print(f"Accuracy: {accuracy:.4f}")
report = classification_report(yc_test, yc_preds)
print("\nClassification Report:\n", report)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
# Point this at your own file if the data lives elsewhere.
# NOTE(review): some distributions of this dataset are ';'-delimited — confirm
# the local copy is comma-separated, otherwise read_csv needs sep=';'.
data = pd.read_csv('student-mat.csv')

# Show the first rows as a quick sanity check.
print(data.head())

# Preprocessing
# Convert each text column to integer codes and keep the fitted encoder per
# column so the mapping stays available afterwards.
label_encoders = {}
text_columns = data.select_dtypes(include=['object']).columns
for column in text_columns:
    le = LabelEncoder()
    encoded_values = le.fit_transform(data[column])
    data[column] = encoded_values
    label_encoders[column] = le

# Target is G3 (the final grade); all remaining columns are features.
y = data['G3']
X = data.drop(columns=['G3'])

# 80/20 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model 1: Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Model 2: Random Forest (100 trees, fixed seed, no tuning)
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Model 3: XGBoost (100 boosting rounds, learning rate 0.1)
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

# Evaluate Models
def evaluate_model(name, y_true, y_pred):
    """Print the R² score and RMSE of one model's predictions.

    Args:
        name: Label used as the prefix of both report lines.
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.

    Returns:
        None. The report is written to standard output, followed by a
        30-character dashed separator.
    """
    score = r2_score(y_true, y_pred)
    # Taking the square root of the MSE gives the RMSE.
    error = np.sqrt(mean_squared_error(y_true, y_pred))

    print(f"{name} R² Score: {score:.4f}")
    print(f"{name} RMSE: {error:.4f}")
    print("-" * 30)

# Print hold-out metrics for the three regressors, in training order.
model_predictions = (
    ('Linear Regression', lr_preds),
    ('Random Forest', rf_preds),
    ('XGBoost', xgb_preds),
)
for model_label, predictions in model_predictions:
    evaluate_model(model_label, y_test, predictions)

# Feature Importance from Random Forest
# Pair each input column with the forest's importance score and sort the
# table so the most important features come first.
features = X.columns
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Plot feature importance as a horizontal bar chart.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance from Random Forest')
plt.show()