# Task 2: Correlation of speaker/listener age difference and characteristic 'attraktiv'

In [1]:
# install.packages('dplyr')   # processing 
# install.packages('gdata')   # file reading
# install.packages('DMwR')    # knn implementation for predicting missing values
# install.packages('ppcor')   # partial correlation

In [2]:
# include libraries
library(dplyr)
library(gdata)
library(ppcor)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

gdata: Unable to locate valid perl interpreter
gdata: 
gdata: read.xls() will be unable to read Excel XLS and XLSX files
gdata: unless the 'perl=' argument is used to specify the location of a
gdata: valid perl intrpreter.
gdata: 
gdata: (To avoid display of this message in the future, please ensure
gdata: perl is installed and available on the executable search path.)
gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLX' (Excel 97-2004) files.

gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLSX' (Excel 2007+) files.

gdata: Run the function 'installXLSXsupport()'
gdata: to automatically download and install the perl
gdata: libaries needed to support Excel XLS and XLSX formats.

Attaching package: 'gdata'

The following objects are mas

# Read in data

In [3]:
# Load the speaker likeability ratings from CSV, then narrow the table to the
# three columns this task works with: both ages and the 'attraktiv' rating.
data <- read.csv("datasets/DB02_speaker_likeability_dimension_ratings.csv")
data <- data[c("speaker_age", "listener_age", "attraktiv")]

# Display the loaded table in the notebook.
data

speaker_age,listener_age,attraktiv
28,20,83
28,21,100
28,24,43
28,20,57
28,21,0
28,22,87
28,22,36
28,22,80
28,23,86
28,25,53


# Compute age differences

In [4]:
# Derive the signed age difference (speaker minus listener) and keep only that
# new column together with the 'attraktiv' rating.
# select() is called with an explicit dplyr:: prefix, presumably because
# another attached package masks it -- confirm against the library() output.
data <- data %>%
  dplyr::mutate(age_difference = speaker_age - listener_age) %>%
  dplyr::select(age_difference, attraktiv)

# Display the transformed table in the notebook.
data

age_difference,attraktiv
8,83
7,100
4,43
8,57
7,0
6,87
6,36
6,80
5,86
3,53


# Preferred choice: Kendall's tau
# Reasons: the age difference has a naturally limited range, so the sample contains very many ties; the data are discrete rather than continuous; and the data set is small.
# Kendall's tau is also generally a better estimate of the correlation in the population than Spearman's rho (ref: Howell, 1997, Field p.181)

In [5]:
# Kendall rank correlation between the columns of `data`.
# cor() on the matrix form returns the full symmetric correlation matrix
# (one row and one column per input column).
corr_kendall <- data %>%
  as.matrix() %>%
  cor(method = "kendall")

# The correlation matrix is symmetric, so every column pair appears twice.
# Keep only the entries strictly above the main diagonal so that each pair is
# reported exactly once.
include_kendall <- upper.tri(corr_kendall)

# Report the coefficient rounded to 5 decimals. The statistic is Kendall's
# "tau" (the label previously misspelled it as "tao").
paste0(
  "Kendall correlation coefficient (tau) between age difference and attraktiv: ",
  round(corr_kendall[include_kendall], 5)
)

# Also possible: Spearman's rho (corrected for ties). It is applicable because the data are at least ordinal, and it is widely used, but it is less precise than Kendall's tau when there are many ties.

In [6]:
# Spearman rank correlation between the columns of `data`, returned by cor()
# as a full symmetric correlation matrix.
corr_spearman <- data %>%
  as.matrix() %>%
  cor(method = "spearman")

# Because the matrix is symmetric, each column pair occurs twice; the strict
# upper triangle selects every pair exactly once.
include_spearman <- upper.tri(corr_spearman)

# Report the coefficient rounded to 5 decimals.
paste0(
  "Spearman correlation coefficient (rho) between age difference and attraktiv: ",
  round(corr_spearman[include_spearman], 5)
)