In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import joblib

In [37]:

# Set random seed for reproducibility
np.random.seed(42)

# Generate 500 samples
num_rows = 500

# Random scores (0-100) and times (2-15 min)
quiz_score = np.round(np.random.uniform(0, 100, num_rows))
quiz_time_taken = np.round(np.random.uniform(2, 15, num_rows))

# Calculate study hours: base formula + noise
# Lower score increases hours, higher time increases hours
study_duration = (6 - (quiz_score / 20) + (quiz_time_taken / 10)) / 2  # Adjusted to keep range 0.5-5
study_duration = np.clip(study_duration + np.round(np.random.normal(0, 0.3, num_rows)), 0.5, 5.0)  # Add noise, cap at 0.5-5

# Create DataFrame
df = pd.DataFrame({
    'quiz_score': quiz_score,
    'quiz_time_taken': quiz_time_taken,
    'study_duration': study_duration
})

print(df.head())




   quiz_score  quiz_time_taken  study_duration
0        37.0             11.0           2.625
1        95.0              9.0           1.075
2        73.0              6.0           1.475
3        60.0             13.0           2.150
4        16.0             11.0           3.150


In [42]:

data = pd.DataFrame(df)
#  Define features (X) and target (y)
X = data[['quiz_score', 'quiz_time_taken']]  # Add other features if you have them
y = data['study_duration']

#  Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 8, 10],
    'min_samples_leaf': [3, 5, 8],
    'min_samples_split': [2, 5, 10]
}



# Create the Random Forest Regressor
rf_reg = RandomForestRegressor(max_depth= 5, min_samples_leaf= 3, min_samples_split= 2, n_estimators= 200, random_state=42)

rf_reg.fit(X_train, y_train)

y_pred = rf_reg.predict(X_test)

#  Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.10769911407253754
Root Mean Squared Error: 0.3281754318539667
R-squared: 0.8231736723257029


In [43]:
# Example of making a prediction for a new student:
new_student_data = pd.DataFrame({'quiz_score': [65], 'quiz_time_taken': [10]})  # Example values
predicted_study_duration = rf_reg.predict(new_student_data)
print(f"Predicted study duration: {predicted_study_duration[0]}")



Predicted study duration: 2.0590810204427585


In [44]:


filename = 'model.joblib'
joblib.dump(rf_reg, filename)

['model.joblib']