In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
import pickle

In [16]:
data= pd.read_csv('../data/data.csv')
data.head(10)

Unnamed: 0,Age,Grade_Level,Usage_of_VR_in_Education,Hours_of_VR_Usage_Per_Week,Engagement_Level,Improvement_in_Learning_Outcomes,Instructor_VR_Proficiency,Access_to_VR_Equipment,Stress_Level_with_VR_Usage,Collaboration_with_Peers_via_VR
0,13,Postgraduate,Yes,8,5,Yes,Beginner,No,Low,Yes
1,16,Undergraduate,No,3,1,No,Intermediate,Yes,Low,No
2,15,High School,Yes,0,1,Yes,Advanced,Yes,Low,No
3,24,Postgraduate,Yes,10,4,Yes,Advanced,No,Low,Yes
4,22,Undergraduate,Yes,10,4,No,Intermediate,Yes,Low,Yes
5,28,High School,No,4,1,No,Advanced,Yes,Medium,Yes
6,19,Undergraduate,Yes,5,1,Yes,Intermediate,Yes,Low,No
7,19,High School,Yes,10,5,Yes,Beginner,No,Medium,Yes
8,29,Undergraduate,Yes,3,1,Yes,Advanced,Yes,Medium,No
9,16,Postgraduate,No,11,4,Yes,Intermediate,Yes,Low,Yes


In [17]:
data.shape

(4841, 10)

# Preprocessing Plan for Model Input

## Categorical Encoding
1. **Label Encoded Columns**:
   - **`Grade_Level`**
   - **`Usage_of_VR_in_Education`**
   - **`Improvement_in_Learning_Outcomes`**
   - **`Fuel Instructor_VR_Proficiency`**
   - **`Access_to_VR_Equipment`**
   - **`Stress_Level_with_VR_Usage`**
   - **`Collaboration_with_Peers_via_VR`**

In [18]:
categorical_columns = ['Grade_Level', 'Usage_of_VR_in_Education', 'Improvement_in_Learning_Outcomes', 'Instructor_VR_Proficiency','Access_to_VR_Equipment','Stress_Level_with_VR_Usage','Collaboration_with_Peers_via_VR']
label_encoders = {}

In [19]:
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
data.head(10)

Unnamed: 0,Age,Grade_Level,Usage_of_VR_in_Education,Hours_of_VR_Usage_Per_Week,Engagement_Level,Improvement_in_Learning_Outcomes,Instructor_VR_Proficiency,Access_to_VR_Equipment,Stress_Level_with_VR_Usage,Collaboration_with_Peers_via_VR
0,13,1,1,8,5,1,1,0,1,1
1,16,2,0,3,1,0,2,1,1,0
2,15,0,1,0,1,1,0,1,1,0
3,24,1,1,10,4,1,0,0,1,1
4,22,2,1,10,4,0,2,1,1,1
5,28,0,0,4,1,0,0,1,2,1
6,19,2,1,5,1,1,2,1,1,0
7,19,0,1,10,5,1,1,0,2,1
8,29,2,1,3,1,1,0,1,2,0
9,16,1,0,11,4,1,2,1,1,1


In [20]:
# Save the label encoders for future use
with open('../models/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

## Stratified Split
- **`Hours_of_VR_Usage_Per_Week_Cat`**


In [24]:
data['Hours_of_VR_Usage_Per_Week_Cat'] = pd.cut(
    data['Hours_of_VR_Usage_Per_Week'],
    bins=[-1,0, 10.0, 20,30.0,  np.inf], 
    labels=[1, 2, 3,4,5]          
)

In [25]:
data.head(10)

Unnamed: 0,Age,Grade_Level,Usage_of_VR_in_Education,Hours_of_VR_Usage_Per_Week,Engagement_Level,Improvement_in_Learning_Outcomes,Instructor_VR_Proficiency,Access_to_VR_Equipment,Stress_Level_with_VR_Usage,Collaboration_with_Peers_via_VR,Hours_of_VR_Usage_Per_Week_Cat
0,13,1,1,8,5,1,1,0,1,1,2
1,16,2,0,3,1,0,2,1,1,0,2
2,15,0,1,0,1,1,0,1,1,0,1
3,24,1,1,10,4,1,0,0,1,1,2
4,22,2,1,10,4,0,2,1,1,1,2
5,28,0,0,4,1,0,0,1,2,1,2
6,19,2,1,5,1,1,2,1,1,0,2
7,19,0,1,10,5,1,1,0,2,1,2
8,29,2,1,3,1,1,0,1,2,0,2
9,16,1,0,11,4,1,2,1,1,1,3


In [26]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(data, data['Hours_of_VR_Usage_Per_Week_Cat']):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

print("Stratified Split Successful!")

Stratified Split Successful!


In [27]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Hours_of_VR_Usage_Per_Week_Cat", axis=1, inplace=True)
strat_train_set

Unnamed: 0,Age,Grade_Level,Usage_of_VR_in_Education,Hours_of_VR_Usage_Per_Week,Engagement_Level,Improvement_in_Learning_Outcomes,Instructor_VR_Proficiency,Access_to_VR_Equipment,Stress_Level_with_VR_Usage,Collaboration_with_Peers_via_VR
3922,19,0,1,0,1,0,0,1,2,0
2934,19,1,0,4,1,0,0,1,0,1
4664,28,2,1,3,1,0,0,0,2,0
647,17,1,1,1,1,0,1,1,0,0
3428,14,2,1,2,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4501,22,1,1,20,5,0,0,1,0,1
2406,20,2,0,0,1,0,1,0,2,0
1591,12,0,1,4,3,1,1,1,2,0
1955,15,1,0,1,1,0,2,0,1,1


## Numerical Scaling
- Scale the following numerical columns:
  - **`Age`**
  - **`Hours_of_VR_Usage_Per_Week`**


In [28]:
X = strat_train_set.drop("Engagement_Level", axis=1)
Y = strat_train_set["Engagement_Level"].copy()


In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(3097, 9)
(3097,)
(775, 9)
(775,)


In [30]:
# Scale numerical features
numerical_columns = ['Age', 'Hours_of_VR_Usage_Per_Week']
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [31]:
# Save the scaler
with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

## Select and Train a Model

- RMSE :أصغر قيمة ممكنة (تقترب من الصفر) تعتبر قيمة مثالية
- (R²): أقرب قيمة إلى 1 تعتبر قيمة مثالية 

In [48]:
def model_perofrmance(y_true,y_pred):
    print("\nModel Performance:")
    #تُحسب هذه الدالة متوسط المربعات للفروقات بين القيم الحقيقية (y_true) والقيم المتوقعة (y_pred).
    mse = mean_squared_error(y_true, y_pred)
    print("MSE:",mse)

    #يقيس متوسط الخطأ ولكن بوحدات البيانات الأصلية
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

    #يقيس مقدار التباين في البيانات الذي يتم تفسيره بواسطة النموذج.
    r2 = r2_score(y_true, y_pred)
    print(f"R-squared (R²): {r2:.2f}")

### Random Forest Regressor

In [49]:
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)
rfr_model.fit(X_train, Y_train)

In [35]:
some_data = X_train.iloc[:20]
some_labels = Y_train.iloc[:20]

In [58]:
# Evaluate the model
y_pred=rfr_model.predict(some_data)
model_perofrmance(some_labels,y_pred)


Model Performance:
MSE: 0.0333755611111111
Root Mean Squared Error (RMSE): 0.18
R-squared (R²): 0.98


In [50]:
# Evalute the train data
y_pred=rfr_model.predict(X_train)
model_perofrmance(Y_train,y_pred)


Model Performance:
MSE: 0.06744917703018136
Root Mean Squared Error (RMSE): 0.26
R-squared (R²): 0.97


In [51]:
# Evalute the test data
y_pred=rfr_model.predict(X_test)
model_perofrmance(Y_test,y_pred)


Model Performance:
MSE: 0.40244070594470044
Root Mean Squared Error (RMSE): 0.63
R-squared (R²): 0.81


### LinearRegression

In [52]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, Y_train)

In [53]:
y_pred = lin_reg.predict(X_train)
model_perofrmance(Y_train,y_pred)


Model Performance:
MSE: 0.5979745157923904
Root Mean Squared Error (RMSE): 0.77
R-squared (R²): 0.73


In [54]:
y_pred = lin_reg.predict(X_test)
model_perofrmance(Y_test,y_pred)


Model Performance:
MSE: 0.5425373243079804
Root Mean Squared Error (RMSE): 0.74
R-squared (R²): 0.74


### DecisionTreeRegressor

In [55]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, Y_train)

In [56]:
y_pred = tree_reg.predict(X_train)
model_perofrmance(Y_train,y_pred)


Model Performance:
MSE: 0.011301259283177269
Root Mean Squared Error (RMSE): 0.11
R-squared (R²): 0.99


In [57]:
y_pred = tree_reg.predict(X_test)
model_perofrmance(Y_test,y_pred)


Model Performance:
MSE: 0.7083870967741935
Root Mean Squared Error (RMSE): 0.84
R-squared (R²): 0.66


In [60]:
y_pred=tree_reg.predict(some_data)
model_perofrmance(some_labels,y_pred)


Model Performance:
MSE: 0.0
Root Mean Squared Error (RMSE): 0.00
R-squared (R²): 1.00


### GradientBoostingRegressor

In [67]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_model = GradientBoostingRegressor(
    n_estimators=100,       # عدد الأشجار
    learning_rate=0.1,      # معدل التعلم
    max_depth=3,            # العمق الأقصى للأشجار
    random_state=42         # تثبيت العشوائية
)

gbr_model.fit(X_train, Y_train)

In [68]:
y_pred=gbr_model.predict(X_train)
model_perofrmance(Y_train,y_pred)


Model Performance:
MSE: 0.333169838112801
Root Mean Squared Error (RMSE): 0.58
R-squared (R²): 0.85


### Evaluation Using Cross-Validation

In [62]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X_train, Y_train,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)


In [63]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [0.85995124 0.91713087 0.84957295 0.88900089 0.83521293 0.8935251
 0.79969752 0.83897765 0.85379412 0.86415494]
Mean: 0.8601018202474606
Standard deviation: 0.031758534297253935


In [64]:
scores = cross_val_score(rfr_model, X_train, Y_train,
                         scoring="neg_mean_squared_error", cv=10)
rfr_rmse_scores = np.sqrt(-scores)
display_scores(rfr_rmse_scores)

Scores: [0.62623697 0.6724942  0.67293418 0.66617762 0.65185994 0.68683216
 0.65474529 0.6345049  0.64172578 0.64140297]
Mean: 0.6548914013094073
Standard deviation: 0.018396085432898748


In [65]:
scores = cross_val_score(lin_reg, X_train, Y_train,
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)

Scores: [0.82722372 0.71607682 0.79060221 0.76054755 0.8675173  0.78041235
 0.74572715 0.73201938 0.76476199 0.77239897]
Mean: 0.7757287430247438
Standard deviation: 0.042486245878936874


In [69]:
scores = cross_val_score(gbr_model, X_train, Y_train,
                         scoring="neg_mean_squared_error", cv=10)
gbr_rmse_scores = np.sqrt(-scores)
display_scores(gbr_rmse_scores)

Scores: [0.5724167  0.60571557 0.59786887 0.61829853 0.6069959  0.61745141
 0.59124039 0.58360111 0.59777588 0.59344824]
Mean: 0.5984812598561492
Standard deviation: 0.013626588992936747


### Improving Gradient Boosting 
- GridSearchCV 

In [71]:
from sklearn.model_selection import GridSearchCV
gbr_model = GradientBoostingRegressor(random_state=42)

# معلمات مختلفة لتجربتها
param_grid = {
    'n_estimators': [50, 100, 200],         # عدد الأشجار
    'learning_rate': [0.01, 0.1, 0.2],      # معدل التعلم
    'max_depth': [3, 5, 7],                  # عمق الشجرة
    'subsample': [0.8, 0.9, 1.0],            # نسبة العينة لكل شجرة
    'min_samples_split': [2, 5, 10],         # الحد الأدنى لعدد العينات لتقسيم العقدة
    'min_samples_leaf': [1, 2, 4]            # الحد الأدنى لعدد العينات في الأوراق
}

# GridSearchCV لتجربة مجموعة من المعلمات
grid_search = GridSearchCV(estimator=gbr_model, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=10, n_jobs=-1)

# تدريب GridSearchCV
grid_search.fit(X_train, Y_train)

In [72]:
best_model = grid_search.best_estimator_
y_pred =best_model.predict(X_train)
model_perofrmance(Y_train,y_pred)


Model Performance:
MSE: 0.343721030366068
Root Mean Squared Error (RMSE): 0.59
R-squared (R²): 0.84


In [73]:
scores = cross_val_score(best_model, X_train, Y_train,
                         scoring="neg_mean_squared_error", cv=10)
best_rmse_scores = np.sqrt(-scores)
display_scores(gbr_rmse_scores)

Scores: [0.5724167  0.60571557 0.59786887 0.61829853 0.6069959  0.61745141
 0.59124039 0.58360111 0.59777588 0.59344824]
Mean: 0.5984812598561492
Standard deviation: 0.013626588992936747


### Save Model

In [74]:
import joblib
joblib.dump(tree_reg, "../models/tree_model.pkl")
joblib.dump(lin_reg, "../models/lin_model.pkl")
joblib.dump(rfr_model, "../models/rfr_model.pkl")
joblib.dump(gbr_model, "../models/gpr_model.pkl")
joblib.dump(best_model, "../models/gpr_by_GridSearchCV_model.pkl")

['../models/gpr_by_GridSearchCV_model.pkl']