In [None]:


def _one_hot_encode_splits(X_train, X_test):
    """One-hot encode both splits so that they share exactly the same columns."""
    train_encoded = pd.get_dummies(X_train, dtype=int)
    # Encoding the test split on its own can produce missing, extra or
    # re-ordered dummy columns whenever a category occurs in only one split.
    # Re-indexing onto the training columns adds the missing dummies as 0,
    # drops dummies the models were never trained on, and fixes the order.
    test_encoded = pd.get_dummies(X_test, dtype=int).reindex(
        columns=train_encoded.columns, fill_value=0
    )
    return train_encoded, test_encoded


def _plot_metric_bars(results_df):
    """Show one bar chart per metric column of results_df, one bar per model."""
    # Rows become metrics and columns become models, so selecting a single
    # metric row gives seaborn a wide-form frame with one bar per model.
    metrics_by_model = results_df.set_index('Model').transpose()

    sns.set(style="whitegrid")

    panels = [
        ('CV Mean Squared Error', 'purple'),
        ('Mean Squared Error', 'green'),
        ('R2 Score', 'blue'),
        ('Mean Absolute Error', 'red'),
    ]

    # The y-axis is deliberately not shared: the panels show different
    # metrics, and a common scale would flatten the smaller-valued ones.
    fig, axes = plt.subplots(1, len(panels), figsize=(20, 5))

    for ax, (metric, color) in zip(axes, panels):
        sns.barplot(data=metrics_by_model.loc[[metric]], ax=ax, color=color)
        ax.set_title(metric)
        ax.set_ylabel(metric)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()


def data_process(data: pd.DataFrame) -> pd.DataFrame:
    """Train four untuned regressors to predict "G3" and compare their scores.

    The frame is split 80/20 (random_state=42) into train and test parts,
    the features are one-hot encoded, and each model is scored with 5-fold
    cross-validation on the training part and with MSE, R2 and MAE on the
    test part. A figure with one bar chart per metric is displayed as a
    side effect.

    Args:
        data: Frame containing the target column "G3"; every other column
            is used as a feature.

    Returns:
        A DataFrame with one row per model and the columns 'Model',
        'CV Mean Squared Error', 'Mean Squared Error', 'R2 Score' and
        'Mean Absolute Error'.

    Raises:
        KeyError: If data has no "G3" column.
    """
    X = data.drop("G3", axis=1)
    y = data["G3"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_preprocessed, X_test_preprocessed = _one_hot_encode_splits(X_train, X_test)

    models = {
        'RandomForestRegressor': RandomForestRegressor(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'Ridge Regressor': Ridge(),
        'LinearRegression': LinearRegression()
    }

    untuned_results = []
    for model_name, model in models.items():
        # The scorer returns negated MSE (higher is better), so flip the sign
        # to report an ordinary, non-negative mean squared error.
        cv_scores = cross_val_score(
            model, X_train_preprocessed, y_train, cv=5, scoring='neg_mean_squared_error'
        )
        cv_mse_mean = -cv_scores.mean()

        # Fit on the full training split, then score on the held-out split.
        model.fit(X_train_preprocessed, y_train)
        pred = model.predict(X_test_preprocessed)

        untuned_results.append({
            'Model': model_name,
            'CV Mean Squared Error': cv_mse_mean,
            'Mean Squared Error': mean_squared_error(y_test, pred),
            'R2 Score': r2_score(y_test, pred),
            'Mean Absolute Error': mean_absolute_error(y_test, pred)
        })

    results_df = pd.DataFrame(untuned_results)
    _plot_metric_bars(results_df)

    return results_df


In [None]:
# Partition the columns of student_df by dtype: object columns are handled as
# categorical features, every other column as a numeric feature.
categorical_features = student_df.select_dtypes(include=['object'])
num_features = student_df.select_dtypes(exclude=['object'])

# Drop the original index so the numeric frame lines up row-for-row with the
# freshly built one-hot frame (which gets a default 0..n-1 index) below.
num_features_reset = num_features.reset_index(drop=True)

# One-hot encode the categorical columns and wrap the dense result in a
# DataFrame of integer indicators labelled with the encoder's feature names.
enc = OneHotEncoder(handle_unknown='ignore')
transformed_features = enc.fit_transform(categorical_features)
transformed_df_cat = pd.DataFrame(
    transformed_features.toarray(),
    columns=enc.get_feature_names_out(categorical_features.columns),
).astype(int)

# Numeric columns first, then the one-hot columns, side by side.
encoded_df = pd.concat([num_features_reset, transformed_df_cat], axis=1)
# Bare expression so the notebook renders the combined frame.
encoded_df