# Task 5: Combine Task 3 and Task 4 to predict VQ in two steps
## Step 1: Predict VF, VU and VD for the data set again, based on the best fits from Task 4
## Step 2: Use the predicted VF, VU and VD from step 1 to predict VQ, based on the model from Task 3
## Report the results in terms of RMSE and R-squared between predicted VQ and ground-truth VQ

In [1]:
# install.packages('dplyr')      # data processing
# install.packages('gdata')      # file reading
# install.packages('tidyverse')  # data manipulation & plotting (lm()/nls() themselves come from base R's stats package)
# install.packages('Hmisc')      # correlation (rcorr)

In [2]:
library(readxl)     # reading in data
library(tidyverse)  # regression algorithms
library(ggplot2)    # residual plot
library(Hmisc)      # correlation 

-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.2.1     v purrr   0.3.3
v tibble  2.1.3     v dplyr   0.8.3
v tidyr   1.0.0     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
Loading required package: lattice
Loading required package: survival
Loading required package: Formula

Attaching package: 'Hmisc'

The following objects are masked from 'package:dplyr':

    src, summarize

The following objects are masked from 'package:base':

    format.pval, units



# Global constants for MOS prediction & outlier removal (optional)

In [3]:
# TRUE:  get_gaming_data() returns one row per (Game, Condition) and VQ, VF, VU, VD hold the
#        mean opinion score (MOS) of the respective attribute
# FALSE: get_gaming_data() keeps the individual ratings
predict_MOS <- TRUE

# optional univariate outlier removal: when TRUE, get_gaming_data() drops rows whose VD lies
# outside the 1.5 * IQR fences
remove_univariate_outliers <- TRUE

# Getter for input data set based on global constants (features from Task 1, Task 2 included)

In [4]:
# Reads the gaming video quality data set and prepares it for modelling.
#
# Arguments (all optional, so existing calls `get_gaming_data()` keep working):
#   path             location of the Excel file to read
#   use_mos          TRUE: return one row per (Game, Condition) holding the mean ratings (MOS)
#                    in VQ, VF, VU, VD; FALSE: keep the individual ratings (the per-condition
#                    means are then available as meanVQ, meanVF, meanVU, meanVD).
#                    Defaults to the global flag `predict_MOS`.
#   drop_vd_outliers TRUE: remove rows whose VD lies outside the 1.5 * IQR fences.
#                    Defaults to the global flag `remove_univariate_outliers`.
#
# Returns a data frame sorted by Game and Condition that contains the rating columns plus
# Framerate, Number_Of_Pixels and Bits_Per_Pixel.
get_gaming_data <- function(path = "datasets/DB01_gaming_video_quality_dataset.xlsx",
                            use_mos = predict_MOS,
                            drop_vd_outliers = remove_univariate_outliers) {
    # read in data set
    ratings <- read_excel(path)

    # Compute MOS per Game, Condition [Task 1]
    # (summarise() keeps the outer grouping level, so ungroup explicitly)
    mos_per_condition_and_game <- ratings %>%
        dplyr::group_by(Game, Condition) %>%
        dplyr::summarise(meanVQ = mean(VQ), meanVF = mean(VF), meanVU = mean(VU), meanVD = mean(VD)) %>%
        dplyr::ungroup()

    # Feature Transformation [Task 2]
    # An unrecognised resolution yields NA instead of a fake pixel count, so that it cannot
    # silently turn into a negative Bits_Per_Pixel value.
    gaming_data_feature_transformation <- ratings %>%
        dplyr::mutate(
            Number_Of_Pixels = dplyr::case_when(
                Resolution == 480 ~ 480 * 720,
                Resolution == 720 ~ 720 * 1280,
                Resolution == 1080 ~ 1080 * 1920,
                TRUE ~ NA_real_
            ),
            Bits_Per_Pixel = Bitrate / Number_Of_Pixels
        ) %>%
        dplyr::select(Game, Condition, VF, VU, VD, VQ, Framerate, Number_Of_Pixels, Bits_Per_Pixel)

    # Merge data for single data source (the only shared columns are the two keys)
    gaming_data <- merge(mos_per_condition_and_game, gaming_data_feature_transformation,
                         by = c("Game", "Condition"))

    # either predict MOS values OR single ratings
    if (use_mos) {
        # one row per (Game, Condition); the mean columns take over the plain attribute names
        gaming_data <- unique(dplyr::select(gaming_data, Game, Condition, meanVQ, meanVF, meanVU, meanVD,
                                            Framerate, Number_Of_Pixels, Bits_Per_Pixel)) %>%
            dplyr::rename(VF = meanVF, VU = meanVU, VD = meanVD, VQ = meanVQ) %>%
            dplyr::arrange(Game, Condition)
    } else {
        gaming_data <- gaming_data %>%
            dplyr::arrange(Game, Condition)
    }

    # formatting
    rownames(gaming_data) <- NULL

    # optional outlier removal (only univariate outliers in VD)
    if (drop_vd_outliers) {
        vd_quartiles <- quantile(gaming_data$VD, c(0.25, 0.75), names = FALSE)
        vd_iqr <- vd_quartiles[2] - vd_quartiles[1]
        lower_bound_VD <- vd_quartiles[1] - 1.5 * vd_iqr
        upper_bound_VD <- vd_quartiles[2] + 1.5 * vd_iqr

        gaming_data <- gaming_data %>%
            dplyr::filter(VD >= lower_bound_VD) %>%
            dplyr::filter(VD <= upper_bound_VD)
    }

    gaming_data
}

get_gaming_data()

Game,Condition,VQ,VF,VU,VD,Framerate,Number_Of_Pixels,Bits_Per_Pixel
Game1,4,2.948,2.796,3.204,4.840,30,921600,0.0010850694
Game1,5,4.112,3.760,4.392,4.924,30,2073600,0.0019290123
Game1,15,1.988,2.232,2.260,5.092,30,345600,0.0008680556
Game1,19,3.068,3.724,2.876,5.092,60,345600,0.0057870370
Game1,25,5.080,5.340,4.544,5.536,60,921600,0.0542534722
Game1,33,2.568,2.344,3.208,4.744,60,921600,0.0010850694
Game1,34,2.740,3.400,2.440,4.956,60,345600,0.0028935185
Game1,35,3.876,4.936,3.368,5.316,60,345600,0.0173611111
Game1,36,2.964,2.492,4.216,4.916,60,2073600,0.0009645062
Game1,39,4.424,4.584,4.252,4.972,30,921600,0.0043402778


# Set input data set globally for reusability

In [5]:
# load the prepared data set once and keep it in a global variable,
# so that the following cells can reuse it without reading the file again
gaming_data <- get_gaming_data()

# Define helper functions

In [6]:
# Prints RMSE and adjusted R-Squared of a fitted model & optionally the Spearman correlation
# (incl. p value) between the observed and the predicted response.
#
# Arguments:
#   model            fitted model for which model.frame(), fitted() and
#                    summary()$adj.r.squared are available (e.g. the result of lm())
#   include_spearman TRUE to additionally print the Spearman correlation (uses Hmisc::rcorr)
#
# The observed response is taken from the model itself rather than from a global data set,
# so the statistics are correct for any model, also when rows were dropped during fitting.
print_statistics <- function(model, include_spearman=FALSE) {
    model_summary <- summary(model)
    observed <- as.vector(model.response(model.frame(model)))
    predicted <- as.vector(fitted(model))
    rmse <- sqrt(mean((observed - predicted)^2))

    print(paste0('RMSE: ', round(rmse, digits=5)))
    print(paste0('Adjusted R-Squared (necessary as multiple explanatory variables): ',
                 round(model_summary$adj.r.squared, digits=5)))

    # isTRUE() also copes with NA / NULL instead of failing inside the condition
    if (isTRUE(include_spearman)) {
        corr_prediction_VQ <- rcorr(cbind(observed, predicted), type="spearman")
        print(paste0('Spearman correlation between VQ & Predicted VQ: ',
                     round(as.double(corr_prediction_VQ$r[1,2]), digits=5),
                     ' (p value: ', round(as.double(corr_prediction_VQ$P[1,2]), digits=5), ')'))
    }
}

In [7]:
# Fits the multiple linear regression that explains VQ by the three attribute scores.
#   predicted: data frame with columns VQ, VF_predicted, VU_predicted, VD_predicted;
#              when omitted (or NULL) the model is fitted on the ground truth columns
#              VF, VU, VD of the data set returned by get_gaming_data()
# Returns the fitted lm object.
get_trained_VQ_model <- function(predicted=NULL) {
    # predicted attribute values supplied: regress VQ on them
    if (!missing(predicted) && !is.null(predicted)) {
        return(lm(VQ ~ VF_predicted + VU_predicted + VD_predicted, data = predicted))
    }

    # otherwise fall back to the ground truth attribute values
    gaming_data <- get_gaming_data()
    lm(VQ ~ VF + VU + VD, data = gaming_data)
}

# Fit model and display results

In [8]:
# Step 1: predict the three attributes with the best fits found in task 4

# VF: logarithmic / exponential model in Bits_Per_Pixel
VF_predicted <- predict(
    nls(
        VF ~ a * log ( Bits_Per_Pixel ) - exp( b * Bits_Per_Pixel) - c,
        data = gaming_data,
        start = c(a = -10000, b = -10, c = -10001)
    )
)

# VU: polynomial of degree 5 in Bits_Per_Pixel
VU_predicted <- predict(
    nls(
        VU ~ a*Bits_Per_Pixel^5 + b*Bits_Per_Pixel^4 + c*Bits_Per_Pixel^3 + d*Bits_Per_Pixel^2 + e*Bits_Per_Pixel + f,
        data = gaming_data,
        start = c(a = -10000, b = -10000, c = -10000, d = -10000, e = -10000, f = -10000)
    )
)

# VD: straight line in Framerate
VD_predicted <- predict(
    nls(
        VD ~ a*Framerate + c,
        data = gaming_data,
        start = c(a = -10000, c = -10000)
    )
)

In [9]:
# put each true attribute column next to its prediction
# (the Game / Condition keys are carried along once, in the VF block)
VF_data <- cbind(dplyr::select(gaming_data, Game, Condition, VF), VF_predicted)
VU_data <- cbind(dplyr::select(gaming_data, VU), VU_predicted)
VD_data <- cbind(dplyr::select(gaming_data, VD), VD_predicted)
# ground truth VQ, the response of the final model
VQ_data <- dplyr::select(gaming_data, VQ)

In [10]:
# Step 2: fit the VQ model on the predicted VF, VU, VD values
VQ_final_model <- get_trained_VQ_model(cbind(VF_data, VU_data, VD_data, VQ_data))

# results table: true & predicted attributes plus the predicted VQ in the last column
model_results <- cbind(VF_data, VU_data, VD_data, VQ_data, VQ_predicted = predict(VQ_final_model))

model_results

Game,Condition,VF,VF_predicted,VU,VU_predicted,VD,VD_predicted,VQ,VQ_predicted
Game1,4,2.796,3.023578,3.204,3.278306,4.840,4.874356,2.948,3.121572
Game1,5,3.760,3.598656,4.392,3.945806,4.924,4.874356,4.112,3.785073
Game1,15,2.232,2.798486,2.260,3.023101,5.092,4.874356,1.988,2.865944
Game1,19,3.724,4.652483,2.876,3.027299,5.092,5.016783,3.068,3.354979
Game1,25,5.340,5.424799,4.544,4.916152,5.536,5.016783,5.080,4.912864
Game1,33,2.344,3.023578,3.208,3.278306,4.744,5.016783,2.568,2.914657
Game1,34,3.400,3.996851,2.440,4.189996,4.956,5.016783,2.740,3.891101
Game1,35,4.936,5.525654,3.368,3.293702,5.316,5.016783,3.876,3.860420
Game1,36,2.492,2.904878,4.216,3.141133,4.916,5.016783,2.964,2.778112
Game1,39,4.584,4.385473,4.252,3.843911,4.972,4.874356,4.424,4.010765


In [11]:
# print per-column summary of the true and predicted values
# (note: the predicted ranges stay inside the observed ranges, i.e. NO extrapolation)
summary(model_results)

     Game             Condition           VF         VF_predicted  
 Length:143         Min.   : 4.00   Min.   :1.856   Min.   :2.798  
 Class :character   1st Qu.:23.50   1st Qu.:3.298   1st Qu.:3.311  
 Mode  :character   Median :42.00   Median :4.300   Median :3.997  
                    Mean   :42.02   Mean   :4.141   Mean   :4.141  
                    3rd Qu.:61.00   3rd Qu.:5.099   3rd Qu.:5.039  
                    Max.   :80.00   Max.   :5.928   Max.   :5.688  
       VU         VU_predicted         VD         VD_predicted  
 Min.   :1.948   Min.   :3.023   Min.   :4.372   Min.   :4.874  
 1st Qu.:3.034   1st Qu.:3.210   1st Qu.:4.785   1st Qu.:4.874  
 Median :3.852   Median :3.946   Median :4.936   Median :5.017  
 Mean   :3.876   Mean   :3.876   Mean   :4.946   Mean   :4.946  
 3rd Qu.:4.621   3rd Qu.:4.190   3rd Qu.:5.090   3rd Qu.:5.017  
 Max.   :5.891   Max.   :5.605   Max.   :5.536   Max.   :5.017  
       VQ         VQ_predicted  
 Min.   :1.718   Min.   :2.659  
 1s

In [12]:
# display coefficients, residual quantiles and (adjusted) R-Squared values of the final VQ model
summary(VQ_final_model)


Call:
lm(formula = VQ ~ VF_predicted + VU_predicted + VD_predicted, 
    data = predicted)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.15402 -0.28292  0.01441  0.38252  1.36693 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   6.86980    3.21716   2.135   0.0345 *  
VF_predicted  0.37386    0.05868   6.372 2.56e-09 ***
VU_predicted  0.67192    0.07483   8.979 1.72e-15 ***
VD_predicted -1.45278    0.64852  -2.240   0.0267 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5523 on 139 degrees of freedom
Multiple R-squared:  0.6805,	Adjusted R-squared:  0.6736 
F-statistic:  98.7 on 3 and 139 DF,  p-value: < 2.2e-16


In [13]:
# Lastly: print RMSE and adjusted R-Squared value as requested;
# the second argument (include_spearman = TRUE) additionally prints the Spearman correlation
print_statistics(VQ_final_model, TRUE)

[1] "RMSE: 0.54448"
[1] "Adjusted R-Squared (necessary as multiple explanatory variables): 0.67363"
[1] "Spearman correlation between VQ & Predicted VQ: 0.77605 (p value: 0)"
