In [1]:
library(tidyverse)
library(repr)
library(tidymodels)
library(plyr)
library(dplyr)
library(gridExtra)
library(GGally)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

In [2]:
student_p <- tempfile () #Creating temporary file name inorder to download the zip folder
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip", student_p)
unzip(student_p, "student-por.csv") #unzip the specific file in zip folder
unlink(student_p) #delete the temporary file 
por_data <- read.csv("student-por.csv", sep = ";") #read file as .csv with seperation of ";"

In [3]:
student_m <- tempfile () 
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip", student_m)
unzip(student_m, "student-mat.csv")
unlink(student_m)
mat_data <- read.csv("student-mat.csv", sep = ";")

In [4]:
mat_data_1 <- mat_data %>% select(G3, studytime, schoolsup, internet, absences) %>% #select variables will be used in analysis
    mutate(studytime = as.factor(studytime)) %>%
    mutate(schoolsup = as.factor(schoolsup)) %>%
    mutate(internet = as.factor(internet)) %>%
    mutate(absences = as.numeric(absences)) # correct the label for variables
glimpse(mat_data_1) #show number of observations 

Rows: 395
Columns: 5
$ G3        [3m[90m<int>[39m[23m 6, 6, 10, 15, 10, 15, 11, 6, 19, 15, 9, 12, 14, 11, 16, 14,…
$ studytime [3m[90m<fct>[39m[23m 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
$ schoolsup [3m[90m<fct>[39m[23m yes, no, yes, no, no, no, no, yes, no, no, no, no, no, no, …
$ internet  [3m[90m<fct>[39m[23m no, yes, yes, yes, no, yes, yes, no, yes, yes, yes, yes, ye…
$ absences  [3m[90m<dbl>[39m[23m 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, 6, 4, 16,…


The number of observations for student performance in math dataset is 395 (minus the row of variable name). 

In [5]:
mat_split <- initial_split(mat_data_1, prop = 0.75, strata = G3) #split training dataset, 75%
mat_train <- training(mat_split) #training data
stat <- c("Min.","1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
sum_mat_G3 <- summary(mat_train$G3)
sum_mat_absences <- summary(mat_train$absences)
tibble(stat, sum_mat_G3, sum_mat_absences) #summary of two numeric variables in math training dataset

stat,sum_mat_G3,sum_mat_absences
<chr>,<table>,<table>
Min.,0.0,0.0
1st Qu.,8.0,0.0
Median,11.0,4.0
Mean,10.49,5.738
3rd Qu.,14.0,8.0
Max.,20.0,75.0


In [6]:
por_data_1 <- por_data %>% select(G3, studytime, schoolsup, internet, absences) %>%
    mutate(studytime = as.factor(studytime)) %>%
    mutate(schoolsup = as.factor(schoolsup)) %>%
    mutate(internet = as.factor(internet)) %>%
    mutate(absences = as.numeric(absences))
glimpse(por_data_1)

Rows: 649
Columns: 5
$ G3        [3m[90m<int>[39m[23m 11, 11, 12, 14, 13, 13, 13, 13, 17, 13, 14, 13, 12, 13, 15,…
$ studytime [3m[90m<fct>[39m[23m 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2, 1, 1,…
$ schoolsup [3m[90m<fct>[39m[23m yes, no, yes, no, no, no, no, yes, no, no, no, no, no, no, …
$ internet  [3m[90m<fct>[39m[23m no, yes, yes, yes, no, yes, yes, no, yes, yes, yes, yes, ye…
$ absences  [3m[90m<dbl>[39m[23m 4, 2, 6, 0, 0, 6, 0, 2, 0, 0, 2, 0, 0, 0, 0, 6, 10, 2, 2, 6…


The number of observations for student performance in Portuguese dataset is 648 (minus the row of variable name).

In [7]:
set.seed(100)
por_split <- initial_split(por_data_1, prop = 0.6, strata = G3)
por_train <- training(por_split) #training data
sum_por_G3 <- summary(mat_train$G3)
sum_por_absences <- summary(mat_train$absences)
tibble(stat, sum_mat_G3, sum_mat_absences) #summary of two numeric variables in por training dataset

stat,sum_mat_G3,sum_mat_absences
<chr>,<table>,<table>
Min.,0.0,0.0
1st Qu.,8.0,0.0
Median,11.0,4.0
Mean,10.49,5.738
3rd Qu.,14.0,8.0
Max.,20.0,75.0
