# Group Project Report
### Group 19: Aidan Gallant, Alex Hachey, Cailey Murad, Caroline Ruus

## How Past Grades, Study Habits, and Absences Correlate to Final Math Grades

## Introduction:

Students today place great importance on their school grades, which is why we have decided to focus our project on finding the most effective predictors of success and answering the question: to what extent do a student’s past math grades, recorded absences, and study habits correlate with their final grade? To answer this predictive question, we will use the Student Performance Dataset, which measures the achievement of students in two Portuguese secondary schools based on each student’s responses to a survey as well as their grades in Mathematics. The survey gathered information on the students’ study time and absences, among many other variables. It has also been noted that this dataset was previously modeled using binary classification and regression.

## Methods & Results:
- describe in written English the methods you used to perform your analysis from beginning to end, narrating the code that does the analysis.
- note: all tables and figures should have a figure/table number and a legend

In [None]:
# Attach the packages used by the cells below.
library(tidyverse)
library(repr)
library(tidymodels)
# NOTE(review): no rvest function appears to be called in the visible cells —
# confirm whether this package is still needed.
library(rvest)
# repr display option: limit how many rows of a table are shown in cell output.
options(repr.matrix.max.rows = 6)

In [None]:
# load data from the original source on the web:
# download the zip archive, extract it, and read student-mat.csv into `student`

temp <- tempfile()   # path the zip archive is downloaded to
temp2 <- tempfile()  # path of the directory the archive is extracted into

url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip"
# mode = "wb": a zip archive is binary and must not be transferred as text
download.file(url, temp, mode = "wb")

unzip(zipfile = temp, exdir = temp2)

# read_csv2() parses semicolon-delimited files
student <- read_csv2(file.path(temp2, "student-mat.csv"))

# temp2 is a directory; unlink() only removes directories when
# recursive = TRUE, otherwise the extracted files are left behind
unlink(c(temp, temp2), recursive = TRUE)

In [None]:
# wrangle and clean the data from its original (downloaded) format to the
# format necessary for the planned analysis: keep only the response (G3) and
# the four candidate predictors

student <- select(as_tibble(student), studytime, absences, G1, G2, G3)

student

In [None]:
# split the data into training and test sets, then summarize the training set
# for the exploratory data analysis

# fix the seed so the random split (and every result computed from it)
# is reproducible across runs
set.seed(1)

# put 75% of the rows in the training set, stratifying on the final grade G3
student_split <- initial_split(student, prop = 0.75, strata = G3)
student_train <- training(student_split)
student_test <- testing(student_split)

student_train

# summarize the training data set
summary(student_train)

In [None]:
# exploratory visualization: histogram of weekly study time in the training
# set, with bars filled by final grade (G3 converted to a factor)

options(repr.plot.width = 10, repr.plot.height = 10)

studytime_plot <- ggplot(student_train, aes(x = studytime, fill = as_factor(G3))) +
    geom_histogram(binwidth = 1, position = "identity") +
    ggtitle("Figure 1: Student's Weekly Study Time vs Final Grade") +
    labs(x = "Weekly Study Time (1 to 4)", y = "Count", fill = "Final Grade (0 to 20)") +
    theme(text = element_text(size = 14))

studytime_plot

In [None]:
# exploratory visualization: histogram of school absences in the training
# set, with bars filled by final grade; drawn wider than the other figures
options(repr.plot.width = 20, repr.plot.height = 10)

absences_plot <- ggplot(student_train, aes(x = absences, fill = as_factor(G3))) +
    geom_histogram(binwidth = 1, position = "identity") +
    ggtitle("Figure 2: Student's Number of School Absences vs Final Grade") +
    labs(x = "Number of School Absences (0 to 93)", y = "Count", fill = "Final Grade (0 to 20)") +
    theme(text = element_text(size = 14))

absences_plot

In [None]:
# exploratory visualization: histogram of first period grades (G1) in the
# training set, with bars filled by final grade
options(repr.plot.width = 10, repr.plot.height = 10)

G1_plot <- ggplot(student_train, aes(x = G1, fill = as_factor(G3))) +
    geom_histogram(binwidth = 1, position = "identity") +
    ggtitle("Figure 3: Distribution of Student's First Period Grade") +
    labs(x = "First Period Grade (0 to 20)", y = "Count", fill = "Final Grade (0 to 20)") +
    theme(text = element_text(size = 14))

G1_plot

In [None]:
# exploratory visualization: histogram of second period grades (G2) in the
# training set, with bars filled by final grade
options(repr.plot.width = 10, repr.plot.height = 10)

G2_plot <- ggplot(student_train, aes(x = G2, fill = as_factor(G3))) +
    geom_histogram(binwidth = 1, position = "identity") +
    ggtitle("Figure 4: Distribution of Second Period Grade") +
    labs(x = "Second Period Grade (0 to 20)", y = "Count", fill = "Final Grade (0 to 20)") +
    theme(text = element_text(size = 14))

G2_plot

In [None]:
# perform the data analysis

# knn-regression with G1 as predictor

# set the seed so the cross-validation folds are reproducible
set.seed(1)

# create recipe to preprocess data: scale and center the single predictor
student_recipe <- recipe(G3 ~ G1, data = student_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

# model specification; the number of neighbors is left to be tuned
student_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("regression")

# 10-fold cross validation object
student_vfold <- vfold_cv(student_train, v = 10, strata = G3)

student_workflow <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_spec)

student_workflow

# run cross validation for a grid of numbers of neighbors ranging from 1 to 200
gridvals <- tibble(neighbors = seq_len(200))

# tune the model and return the RMSPE for each number of neighbors
student_results <- student_workflow %>%
    tune_grid(resamples = student_vfold, grid = gridvals) %>%
    collect_metrics()

# find the minimum RMSPE; keep exactly one row so k_min is a single value
# even when several K tie (ties resolve to the smallest K)
student_min <- student_results %>%
    filter(.metric == "rmse") %>%
    arrange(mean, neighbors) %>%
    slice(1)

student_min

# retrain model on training data using K value with smallest RMSE
k_min <- student_min %>%
    pull(neighbors)

student_best_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_min) %>%
    set_engine("kknn") %>%
    set_mode("regression")

student_best_fit <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_best_spec) %>%
    fit(data = student_train)

# assess RMSPE of test data predictions
student_summary <- student_best_fit %>%
    predict(student_test) %>%
    bind_cols(student_test) %>%
    metrics(truth = G3, estimate = .pred)

student_summary

# create a visualization of the analysis: training points with the model's
# predictions overlaid as a line
student_preds <- student_best_fit %>%
    predict(student_train) %>%
    bind_cols(student_train)

# the x-axis must be G1, the predictor this model was fit on
student_plot <- ggplot(student_preds, aes(x = G1, y = G3)) +
    geom_point() +
    geom_line(data = student_preds, mapping = aes(x = G1, y = .pred), colour = "blue") +
    labs(x = "First Period Grade (0 to 20)", y = "Final Grade (0 to 20)") +
    theme(text = element_text(size = 14)) +
    ggtitle(paste0("Figure 5: Predicted Values of Final Grade (K = ", k_min, ")"))

student_plot

In [None]:
# linear-regression with G1 as predictor

# set the seed
set.seed(1)

# recipe: final grade modeled from the first period grade, no preprocessing
student_recipe <- recipe(G3 ~ G1, data = student_train)

# model specification: linear regression using the "lm" engine
student_spec <- set_mode(set_engine(linear_reg(), "lm"), "regression")

# combine recipe and model in a workflow and fit it on the training set
student_fit <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_spec) %>%
    fit(data = student_train)

student_fit

# visualize the training data with a straight line fitted by geom_smooth
G1_linear_plot <- ggplot(student_train, aes(x = G1, y = G3)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    ggtitle("Relationship between Term 1 Grades and Final Grades") +
    labs(x= "Term 1 Grades (0 to 20)", y = "Final Grades (0 to 20)") +
    theme(text = element_text(size = 13))

G1_linear_plot

# predict on the test set and compute the regression metrics
lm_test_results <- bind_cols(predict(student_fit, student_test), student_test) %>%
    metrics(truth = G3, estimate = .pred)

lm_test_results

# extract the RMSPE (the "rmse" metric computed on the test data)
lm_rmspe <- lm_test_results %>%
    filter(.metric == "rmse") %>%
    pull(.estimate)
lm_rmspe


In [None]:
# knn-regression with G2 as predictor

# set the seed so the cross-validation folds are reproducible
set.seed(1)

# create recipe to preprocess data: scale and center the single predictor
student_recipe <- recipe(G3 ~ G2, data = student_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

# model specification; the number of neighbors is left to be tuned
student_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("regression")

# 10-fold cross validation object
student_vfold <- vfold_cv(student_train, v = 10, strata = G3)

student_workflow <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_spec)

student_workflow

# run cross validation for a grid of numbers of neighbors ranging from 1 to 200
gridvals <- tibble(neighbors = seq_len(200))

# tune the model and return the RMSPE for each number of neighbors
student_results <- student_workflow %>%
    tune_grid(resamples = student_vfold, grid = gridvals) %>%
    collect_metrics()

# find the minimum RMSPE; keep exactly one row so k_min is a single value
# even when several K tie (ties resolve to the smallest K)
student_min <- student_results %>%
    filter(.metric == "rmse") %>%
    arrange(mean, neighbors) %>%
    slice(1)

student_min

# retrain model on training data using K value with smallest RMSE
k_min <- student_min %>%
    pull(neighbors)

student_best_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_min) %>%
    set_engine("kknn") %>%
    set_mode("regression")

student_best_fit <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_best_spec) %>%
    fit(data = student_train)

# assess RMSPE of test data predictions
student_summary <- student_best_fit %>%
    predict(student_test) %>%
    bind_cols(student_test) %>%
    metrics(truth = G3, estimate = .pred)

student_summary

# create a visualization of the analysis: training points with the model's
# predictions overlaid as a line
student_preds <- student_best_fit %>%
    predict(student_train) %>%
    bind_cols(student_train)

G2_plot <- ggplot(student_preds, aes(x=G2, y=G3)) +
    geom_point() +
    geom_line(data = student_preds, mapping = aes(x= G2, y= .pred), colour = "blue") +
    labs(x= "Term 2 Grades", y = "Final Grades") +
    theme(text= element_text(size=20)) +
    ggtitle(paste0("K = ", k_min))
G2_plot

In [None]:
# linear regression with G2 as predictor

# set the seed
set.seed(1)

# recipe: final grade modeled from the second period grade, no preprocessing
student_recipe <- recipe(G3 ~ G2, data = student_train)

# model specification: linear regression using the "lm" engine
student_spec <- set_mode(set_engine(linear_reg(), "lm"), "regression")

# combine recipe and model in a workflow and fit it on the training set
student_fit <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_spec) %>%
    fit(data = student_train)

# visualize the training data with a straight line fitted by geom_smooth
G2_linear_plot <- ggplot(student_train, aes(x = G2, y = G3)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    ggtitle("Relationship between Term 2 Grades and Final Grades") +
    labs(x= "Term 2 Grades (0 to 20)", y = "Final Grades (0 to 20)") +
    theme(text = element_text(size = 13))

G2_linear_plot

# predict on the test set and compute the regression metrics
lm_test_results <- bind_cols(predict(student_fit, student_test), student_test) %>%
    metrics(truth = G3, estimate = .pred)

lm_test_results

# extract the RMSPE (the "rmse" metric computed on the test data)
lm_rmspe <- lm_test_results %>%
    filter(.metric == "rmse") %>%
    pull(.estimate)

lm_rmspe

In [None]:
# knn-regression with studytime as predictor

# set the seed so the cross-validation folds are reproducible
set.seed(1)

# perform the data analysis

# create recipe to preprocess data: scale and center the single predictor
student_recipe <- recipe(G3 ~ studytime, data = student_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

# model specification; the number of neighbors is left to be tuned
student_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("regression")

# 10-fold cross validation object
student_vfold <- vfold_cv(student_train, v = 10, strata = G3)

student_workflow <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_spec)

student_workflow

# run cross validation for a grid of numbers of neighbors ranging from 1 to 200
gridvals <- tibble(neighbors = seq_len(200))

# tune the model and return the RMSPE for each number of neighbors
student_results <- student_workflow %>%
    tune_grid(resamples = student_vfold, grid = gridvals) %>%
    collect_metrics()

# find the minimum RMSPE; keep exactly one row so k_min is a single value
# even when several K tie (ties resolve to the smallest K)
student_min <- student_results %>%
    filter(.metric == "rmse") %>%
    arrange(mean, neighbors) %>%
    slice(1)

student_min

# retrain model on training data using K value with smallest RMSE
k_min <- student_min %>%
    pull(neighbors)

student_best_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_min) %>%
    set_engine("kknn") %>%
    set_mode("regression")

student_best_fit <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_best_spec) %>%
    fit(data = student_train)

# assess RMSPE of test data predictions
student_summary <- student_best_fit %>%
    predict(student_test) %>%
    bind_cols(student_test) %>%
    metrics(truth = G3, estimate = .pred)

student_summary

# create a visualization of the analysis: training points with the model's
# predictions overlaid as a line
student_preds <- student_best_fit %>%
    predict(student_train) %>%
    bind_cols(student_train)

# axis range written as "1 to 4" to agree with the exploratory study time
# figure, which labels the same variable "Weekly Study Time (1 to 4)"
studytime_plot <- ggplot(student_preds, aes(x=studytime, y=G3)) +
    geom_point() +
    geom_line(data = student_preds, mapping = aes(x= studytime, y= .pred), colour = "blue") +
    labs(x= "Studytime (1 to 4)", y = "Final Grades (0 to 20)") +
    theme(text= element_text(size=20)) +
    ggtitle(paste0("K = ", k_min))

studytime_plot


In [None]:
# knn-regression with absences as a predictor

# set the seed so the cross-validation folds are reproducible
set.seed(1)

# perform the data analysis

# create recipe to preprocess data: scale and center the single predictor
student_recipe <- recipe(G3 ~ absences, data = student_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

# model specification; the number of neighbors is left to be tuned
student_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("regression")

# 10-fold cross validation object
student_vfold <- vfold_cv(student_train, v = 10, strata = G3)

student_workflow <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_spec)

student_workflow

# run cross validation for a grid of numbers of neighbors ranging from 1 to 200
gridvals <- tibble(neighbors = seq_len(200))

# tune the model and return the RMSPE for each number of neighbors
student_results <- student_workflow %>%
    tune_grid(resamples = student_vfold, grid = gridvals) %>%
    collect_metrics()

# find the minimum RMSPE; keep exactly one row so k_min is a single value
# even when several K tie (ties resolve to the smallest K)
student_min <- student_results %>%
    filter(.metric == "rmse") %>%
    arrange(mean, neighbors) %>%
    slice(1)

student_min

# retrain model on training data using K value with smallest RMSE
k_min <- student_min %>%
    pull(neighbors)

student_best_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_min) %>%
    set_engine("kknn") %>%
    set_mode("regression")

student_best_fit <- workflow() %>%
    add_recipe(student_recipe) %>%
    add_model(student_best_spec) %>%
    fit(data = student_train)

# assess RMSPE of test data predictions
student_summary <- student_best_fit %>%
    predict(student_test) %>%
    bind_cols(student_test) %>%
    metrics(truth = G3, estimate = .pred)

student_summary

# create a visualization of the analysis: training points with the model's
# predictions overlaid as a line
student_preds <- student_best_fit %>%
    predict(student_train) %>%
    bind_cols(student_train)

# the prediction line must use absences on the x-axis, the predictor this
# model was fit on, so that it lines up with the plotted points
absences_plot <- ggplot(student_preds, aes(x=absences, y=G3)) +
    geom_point() +
    geom_line(data = student_preds, mapping = aes(x= absences, y= .pred), colour = "blue") +
    labs(x= "Absences", y = "Final Grades (0 to 20)") +
    theme(text= element_text(size=20)) +
    ggtitle(paste0("K = ", k_min))

absences_plot

## Discussion:
- summarize what you found
- discuss whether this is what you expected to find?
- discuss what impact could such findings have?
- discuss what future questions could this lead to?

## References
- At least 2 citations of literature relevant to the project (format is your choice, just be consistent across the references).
- Make sure to cite the source of your data as well.