In [1]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [2]:
student_df = pd.read_csv("data.csv",delimiter=';')
print(student_df)

    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   16       U     LE3       T     4     3   teacher  services   
1       GP   M   18       U     LE3       T     1     1     other     other   
2       GP   M   17       R     LE3       A     4     4   teacher     other   
3       GP   F   15       U     LE3       T     3     2  services     other   
4       GP   M   16       U     GT3       T     2     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
311     GP   M   15       U     LE3       A     2     1  services     other   
312     GP   F   17       U     GT3       A     4     4     other   teacher   
313     GP   F   15       U     GT3       T     4     4   teacher   teacher   
314     MS   F   19       R     GT3       T     2     3  services     other   
315     GP   F   16       U     GT3       T     4     3     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health 

In [3]:
num_features = student_df.select_dtypes(exclude=['object'])
categorical_features = student_df.select_dtypes(include=['object'])


num_features_reset = num_features.reset_index(drop=True)

# One-hot encode categorical features
enc = OneHotEncoder(handle_unknown='ignore')
transformed_features = enc.fit_transform(categorical_features)
transformed_df_cat = pd.DataFrame(transformed_features.toarray(), columns=enc.get_feature_names_out(categorical_features.columns))
transformed_df_cat = transformed_df_cat.astype(int)

# Combine numerical and encoded categorical features
encoded= pd.concat([num_features_reset, transformed_df_cat], axis=1)

encoded

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,16,4,3,3,2,0,5,4,3,1,...,0,1,0,1,0,1,0,1,1,0
1,18,1,1,1,1,3,2,3,5,2,...,1,0,0,1,1,0,0,1,0,1
2,17,4,4,2,2,0,3,3,3,2,...,1,0,0,1,0,1,0,1,1,0
3,15,3,2,1,2,0,4,4,4,1,...,1,0,0,1,0,1,0,1,1,0
4,16,2,3,2,1,0,5,3,3,1,...,1,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,15,2,1,4,1,3,4,5,5,2,...,1,0,0,1,0,1,0,1,1,0
312,17,4,4,2,2,0,4,1,4,1,...,1,0,0,1,0,1,1,0,1,0
313,15,4,4,2,1,0,4,3,2,1,...,0,1,0,1,0,1,0,1,1,0
314,19,2,3,1,3,1,5,4,2,1,...,0,1,1,0,0,1,0,1,1,0


In [4]:
X = encoded.drop(['G1', 'G2', 'G3'], axis=1)
y = student_df["G3"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
models = [RandomForestRegressor(), Ridge(), LinearRegression(), DecisionTreeRegressor()]

for model in models:
    model_fit = model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    mean_absolute_error_s = mean_absolute_error(y_test, pred)
    df_pred = pd.DataFrame(y_test)
    df_pred['predictions'] = pred

    # Print the head of df_pred
    print(model)
    print(df_pred.head())
    print("\n")
    print(f"MSE: {mse}")
    print(f"R2 Score: {r2}")
    print(f"MAE:{mean_absolute_error_s}")
    print("\n")

RandomForestRegressor()
     G3  predictions
173   9        11.47
33   16        11.46
165  11         8.95
78   15        10.15
93   14        10.75


MSE: 15.183381249999998
R2 Score: 0.145902223442972
MAE:3.1178125000000003


Ridge()
     G3  predictions
173   9    11.840575
33   16     9.754635
165  11     9.008911
78   15     8.789416
93   14    11.218233


MSE: 20.384421605728026
R2 Score: -0.1466674572143376
MAE:3.698524672176521


LinearRegression()
     G3  predictions
173   9    11.903931
33   16     9.703979
165  11     8.955200
78   15     8.760132
93   14    11.292603


MSE: 20.501657395623624
R2 Score: -0.1532622219662756
MAE:3.7068214416503906


DecisionTreeRegressor()
     G3  predictions
173   9         15.0
33   16         20.0
165  11          8.0
78   15          9.0
93   14         11.0


MSE: 32.15625
R2 Score: -0.808858064959143
MAE:4.1875




In [6]:
data_encoded=encoded.drop(['G1','G2'], axis=1)
correlation_matrix = data_encoded.corr()

# Extract the correlation of 'G3' with other features
correlation_with_G3 = correlation_matrix['G3'].sort_values(ascending=False)

# Display the correlation coefficients
print(correlation_with_G3)

G3                   1.000000
Medu                 0.252612
Fedu                 0.209600
higher_yes           0.188371
romantic_no          0.137925
Mjob_health          0.124926
sex_M                0.117246
reason_other         0.109457
Fjob_teacher         0.107028
Mjob_services        0.101284
address_U            0.092704
reason_reputation    0.085038
paid_yes             0.080677
internet_yes         0.077523
schoolsup_no         0.076320
guardian_father      0.069432
Pstatus_A            0.066106
Fjob_health          0.057712
Mjob_teacher         0.057098
studytime            0.056518
freetime             0.054545
famrel               0.041200
school_GP            0.041056
absences             0.039913
famsize_LE3          0.036651
activities_yes       0.030677
Fjob_services        0.020330
Fjob_at_home         0.013623
nursery_yes          0.011829
famsup_no            0.008575
reason_home         -0.004260
famsup_yes          -0.008575
nursery_no          -0.011829
guardian_m

In [7]:
rf_model = RandomForestRegressor()

# Fit the model to your data
rf_model.fit(X, y)


# Create a selector object that will use the feature importances to select features
sfm = SelectFromModel(rf_model, threshold=0.01)  #adjust the threshold 
selected_feature_indices = sfm.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]
print("Selected Features:")
print(selected_features)
sfm.fit(X, y)

# Transform feature matrix to include only selected features
X_selected = sfm.transform(X)

Selected Features:
Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'sex_F',
       'sex_M', 'Mjob_other', 'Mjob_services', 'Fjob_teacher', 'reason_course',
       'reason_reputation', 'guardian_other', 'schoolsup_yes'],
      dtype='object')


In [8]:


X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [9]:
model = RandomForestRegressor()  
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'R2 Score: {r2}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')

R2 Score: 0.14248560873446403
Mean Squared Error: 15.24411875
Mean Absolute Error: 3.1028124999999998


In [10]:
randomForest_param_grid={
        'n_estimators':[100,200,500],  #default 100
        'criterion':['squared_error','absolute_error'], #default squared_error
        'min_samples_split':[2,3,4,5], #default 2
        'min_samples_leaf':[1,2,4,5], #default=1
        'max_leaf_nodes':[4,10,20,50,None] #default=None
    }
rf_model = RandomForestRegressor()

    # Grid search for RandomForestRegressor
grid_search = GridSearchCV(rf_model, randomForest_param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

    # Get best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

    # Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

param_results = {
'best_params': best_params,
'best_model': best_model,
'mse': mse,
'r2': r2,
    }
print("Best Parameters:", param_results['best_params'])
print("Best Model:", param_results['best_model'])
print("Mean Squared Error (MSE):", param_results['mse'])
print("R-squared (R2):", param_results['r2'])

Best Parameters: {'criterion': 'squared_error', 'max_leaf_nodes': 50, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
Best Model: RandomForestRegressor(max_leaf_nodes=50, min_samples_split=3)
Mean Squared Error (MSE): 15.756878389650577
R-squared (R2): 0.11364177869932346
