## Task 1: Correlation of speaker characteristics

In [1]:
# install.packages('dplyr')   # processing 
# install.packages('gdata')   # file reading
# install.packages('DMwR')    # knn implementation for predicting missing values
# install.packages('Hmisc')   # correlation matrices
# install.packages('psych')   # vectorized computation for Kendall correlation

In [2]:
# include libraries
library(dplyr)
library(gdata)
library(DMwR)
library(Hmisc)
library(psych)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

gdata: Unable to locate valid perl interpreter
gdata: 
gdata: read.xls() will be unable to read Excel XLS and XLSX files
gdata: unless the 'perl=' argument is used to specify the location of a
gdata: valid perl intrpreter.
gdata: 
gdata: (To avoid display of this message in the future, please ensure
gdata: perl is installed and available on the executable search path.)
gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLX' (Excel 97-2004) files.

gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLSX' (Excel 2007+) files.

gdata: Run the function 'installXLSXsupport()'
gdata: to automatically download and install the perl
gdata: libaries needed to support Excel XLS and XLSX formats.

Attaching package: 'gdata'

The following objects are mas

# Read in data

In [3]:
# Load the likeability rating sheet, then discard its first nine columns so
# that only the remaining columns are kept for the correlation analysis.
data <- read.csv("datasets/DB02_speaker_likeability_dimension_ratings.csv")
data <- data[-seq_len(9)]

In [4]:
# Impute missing ratings instead of dropping whole records: each NA is
# estimated from the k = 2 most similar rows (nearest neighbours).
# NOTE(review): with DMwR's default method the fill-in value is a
# distance-weighted average of the neighbours, so imputed entries need not be
# integers - confirm this is acceptable for the rank correlations below.
data <- data %>%
  knnImputation(k = 2) %>%
  as.data.frame()
data

unsympatisch,sicher,attraktiv,verstaendnislos,unentschieden,unaufdringlich,distanziert,gelangweilt,emotional,nicht.genervt,...,freundlich,maennlich,gehorsam,gleichgueltig,interessant,zynisch,aufgesetzt,intelligent,kindlich,bescheiden
18,79,83,86,18,72,12,18,64,82,...,78,10,67,75,76,26,12,87,60,71
0,100,100,0,0,100,0,0,29,100,...,100,100,100,0,100,0,0,83,70,100
19,62,43,19,39,84,19,33,56,85,...,81,0,86,42,53,16,22,67,38,81
33,56,57,44,34,30,41,40,56,86,...,72,5,40,24,63,62,29,57,30,47
100,32,0,28,81,87,76,67,16,31,...,79,28,87,25,28,77,79,70,74,83
3,88,87,15,9,86,19,25,75,68,...,89,13,47,29,88,45,17,87,17,54
65,59,36,73,34,45,43,66,43,70,...,43,36,42,71,43,68,37,21,40,42
32,72,80,27,40,59,24,28,83,76,...,94,20,16,21,74,40,26,59,66,26
34,79,86,33,4,77,38,6,61,85,...,87,0,62,17,79,22,41,88,14,59
46,57,53,42,45,56,42,47,54,59,...,55,32,63,45,45,44,45,57,41,56


# Note: A continuous variable can take any of infinitely many values between its lowest and highest measured value. Our ratings, however, are integers between 0 and 100, so the data are discrete. Because Pearson correlation assumes continuous data, it should not be applied in this use case.

# Option 1: Spearman, because the ratings are discrete (integers between 0 and 100) and the sample size is, subjectively, no longer small (hence not Kendall); Spearman is also used more often than Kendall.

In [13]:
# Spearman rank correlations between all pairs of columns. rcorr() expects a
# matrix and returns the coefficients together with their p-values.
rating_matrix <- as.matrix(data)
corr_data_spearman <- rcorr(rating_matrix, type = "spearman")

In [14]:
# Show the Spearman correlation coefficients as a data frame.
as.data.frame(corr_data_spearman[["r"]])

Unnamed: 0,unsympatisch,sicher,attraktiv,verstaendnislos,unentschieden,unaufdringlich,distanziert,gelangweilt,emotional,nicht.genervt,...,freundlich,maennlich,gehorsam,gleichgueltig,interessant,zynisch,aufgesetzt,intelligent,kindlich,bescheiden
unsympatisch,1.0,-0.15956763,-0.54634105,0.51581503,0.1181246,-0.223885347,0.53073427,0.38713448,-0.33272202,-0.41452301,...,-0.549891862,0.0445877243,-0.137017814,0.33067333,-0.457265328,0.166934025,0.2641769166,-0.378262682,0.016647314,-0.212087856
sicher,-0.15956763,1.0,0.23612873,-0.15733098,-0.65441329,-0.022269079,-0.11877605,-0.24112153,0.08269765,0.11085358,...,0.151565633,0.0363720279,-0.23975341,-0.173964972,0.273648284,0.18189333,-0.1282116643,0.3713008684,-0.318125631,-0.137047168
attraktiv,-0.546341054,0.23612873,1.0,-0.32594716,-0.13220255,0.161918152,-0.38962221,-0.36852889,0.37065739,0.30384082,...,0.431279142,-0.0930581523,0.04018687,-0.315828873,0.56176755,-0.076944178,-0.2006194071,0.4659024407,-0.037537023,0.093916606
verstaendnislos,0.515815035,-0.15733098,-0.32594716,1.0,0.16096128,-0.244667638,0.45010439,0.41434219,-0.2690049,-0.44768057,...,-0.548097988,0.0829645601,-0.166637686,0.341897717,-0.31574462,0.228201176,0.2212800503,-0.3155493554,0.039856361,-0.253507173
unentschieden,0.118124599,-0.65441329,-0.13220255,0.16096128,1.0,0.019007901,0.07116511,0.24435372,-0.0287619,-0.11037387,...,-0.138094357,0.0200533766,0.173136825,0.185349628,-0.199309639,-0.122521644,0.1040328061,-0.3196181736,0.25759422,0.07948749
unaufdringlich,-0.223885347,-0.02226908,0.16191815,-0.24466764,0.0190079,1.0,-0.10334396,-0.05785204,-0.06339855,0.30850106,...,0.334676889,0.062614455,0.405501718,-0.000595918,0.069273818,-0.36215836,-0.2877224849,0.1832331958,-0.056530095,0.386607932
distanziert,0.530734271,-0.11877605,-0.38962221,0.45010439,0.07116511,-0.103343964,1.0,0.41144629,-0.41914851,-0.38622243,...,-0.473873065,0.0493121741,-0.095693801,0.393765226,-0.381413293,0.13730058,0.2142861232,-0.218613169,-0.071693823,-0.118046463
gelangweilt,0.38713448,-0.24112153,-0.36852889,0.41434219,0.24435372,-0.057852038,0.41144629,1.0,-0.44297781,-0.37804262,...,-0.484251502,0.0936858796,-0.042330583,0.565593062,-0.467033291,0.136426861,0.176192964,-0.3445265879,0.033593948,-0.07624114
emotional,-0.332722025,0.08269765,0.37065739,-0.2690049,-0.0287619,-0.06339855,-0.41914851,-0.44297781,1.0,0.25646949,...,0.358975425,-0.0989429568,0.038980393,-0.415398502,0.421936303,-0.072799081,-0.0811235173,0.1622977254,0.121205737,0.026415194
nicht.genervt,-0.414523005,0.11085358,0.30384082,-0.44768057,-0.11037387,0.308501059,-0.38622243,-0.37804262,0.25646949,1.0,...,0.572384161,-0.0361261757,0.26873223,-0.303503262,0.294066991,-0.299420179,-0.2422518113,0.2528364653,0.018012112,0.32467644


In [15]:
# Show the p-values that belong to the Spearman coefficients
# (the diagonal is NA because a column is not tested against itself).
as.data.frame(corr_data_spearman[["P"]])

Unnamed: 0,unsympatisch,sicher,attraktiv,verstaendnislos,unentschieden,unaufdringlich,distanziert,gelangweilt,emotional,nicht.genervt,...,freundlich,maennlich,gehorsam,gleichgueltig,interessant,zynisch,aufgesetzt,intelligent,kindlich,bescheiden
unsympatisch,,0.0,0.0,0.0,1.776357e-15,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002717532,0.0,0.0,0.0,0.0,0.0,0.0,0.2632004,0.0
sicher,0.0,,0.0,0.0,0.0,0.1344513,1.110223e-15,0.0,2.588265e-08,7.838175e-14,...,0.0,0.01447767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
attraktiv,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.667293e-10,0.006895561,0.0,0.0,2.234314e-07,0.0,0.0,0.01161727,2.521818e-10
verstaendnislos,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.333557e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.007370845,0.0
unentschieden,1.776357e-15,0.0,0.0,0.0,,0.2014118,1.678507e-06,0.0,0.05319419,1.003642e-13,...,0.0,0.177716,0.0,0.0,0.0,0.0,2.373879e-12,0.0,0.0,8.775189e-08
unaufdringlich,0.0,0.1344513,0.0,0.0,0.2014118,,3.310463e-12,9.974963e-05,1.99916e-05,0.0,...,0.0,2.529715e-05,0.0,0.9680544,3.144616e-06,0.0,0.0,0.0,0.0001434514,0.0
distanziert,0.0,1.110223e-15,0.0,0.0,1.678507e-06,3.310463e-12,,0.0,0.0,0.0,...,0.0,0.0009130354,1.149241e-10,0.0,0.0,0.0,0.0,0.0,1.404354e-06,1.776357e-15
gelangweilt,0.0,0.0,0.0,0.0,0.0,9.974963e-05,0.0,,0.0,0.0,...,0.0,2.789762e-10,0.004425847,0.0,0.0,0.0,0.0,0.0,0.02392599,2.878267e-07
emotional,0.0,2.588265e-08,0.0,0.0,0.05319419,1.99916e-05,0.0,0.0,,0.0,...,0.0,2.632028e-11,0.008775722,0.0,0.0,9.634687e-07,4.737247e-08,0.0,4.440892e-16,0.07580868
nicht.genervt,0.0,7.838175e-14,0.0,0.0,1.003642e-13,0.0,0.0,0.0,0.0,,...,0.0,0.01515519,0.0,0.0,0.0,0.0,0.0,0.0,0.2260485,0.0


In [16]:
# The correlation matrix is symmetric, so keep only the cells strictly above
# the main diagonal; otherwise every pair would be listed twice.
include <- upper.tri(corr_data_spearman$r)

# Long-format table: one row per pair of characteristics with its coefficient
# and p-value. row()/col() give the matrix position of each kept cell.
# NOTE(review): these appear to be rcorr's unadjusted p-values; given the
# number of pairs tested, consider a multiple-comparison correction - confirm.
correlation_pairs_spearman <- data.frame(
  Speaker_Characteristic_1 =
    rownames(corr_data_spearman$r)[row(include)[include]],
  Speaker_Characteristic_2 =
    rownames(corr_data_spearman$r)[col(include)[include]],
  r_spearman = corr_data_spearman$r[include],
  p_value = corr_data_spearman$P[include]
)

# Treat p < 0.05 as a significant correlation and list the strongest positive
# correlations first (negative ones end up at the bottom).
significant_pairs_spearman <- correlation_pairs_spearman %>%
  filter(p_value < 0.05) %>%
  arrange(desc(r_spearman))
significant_pairs_spearman

Speaker_Characteristic_1,Speaker_Characteristic_2,r_spearman,p_value
mitfuehlend,herzlich,0.7241924,0
herzlich,freundlich,0.6539259,0
attraktiv,angenehm,0.6175530,0
angenehm,freundlich,0.5766395,0
nicht.genervt,freundlich,0.5723842,0
gelangweilt,gleichgueltig,0.5655931,0
attraktiv,interessant,0.5617676,0
mitfuehlend,freundlich,0.5565851,0
entspannt,ruhig,0.5532012,0
unsympatisch,distanziert,0.5307343,0


# Option 2 (preferred choice): Kendall's tau, because the ratings are discrete (integers between 0 and 100) and contain a lot of ties, the data set is not too large, and Kendall's tau is a better estimate of the correlation than Spearman's rho (ref: Howell, 1997; Field, p. 181).

In [17]:
# Kendall rank correlations and p-values for all pairs of columns.
# adjust = "holm" is psych's default; it is spelled out here because the
# multiplicity-adjusted p-values are the ones used further below.
corr_data_kendall <- corr.test(data, method = "kendall", adjust = "holm")

In [18]:
# Show the Kendall correlation coefficients as a data frame.
as.data.frame(corr_data_kendall[["r"]])

Unnamed: 0,unsympatisch,sicher,attraktiv,verstaendnislos,unentschieden,unaufdringlich,distanziert,gelangweilt,emotional,nicht.genervt,...,freundlich,maennlich,gehorsam,gleichgueltig,interessant,zynisch,aufgesetzt,intelligent,kindlich,bescheiden
unsympatisch,1.0,-0.1166463,-0.41625641,0.40385419,0.08767903,-0.166469977,0.40848956,0.2908277,-0.2466689,-0.31243944,...,-0.41973441,0.031944083,-0.099567808,0.250895055,-0.342613161,0.122237685,0.194702751,-0.278577405,0.010907146,-0.156354308
sicher,-0.116646301,1.0,0.17613298,-0.11127885,-0.52965933,-0.019035386,-0.08655974,-0.17961159,0.05933779,0.07908251,...,0.107025195,0.025057924,-0.180960008,-0.129173249,0.206356738,0.137538715,-0.09163581,0.272603533,-0.229356283,-0.104860575
attraktiv,-0.416256412,0.17613298,1.0,-0.2427587,-0.10020603,0.121802058,-0.29056356,-0.2787823,0.27818927,0.22421286,...,0.31928679,-0.06689276,0.028757205,-0.23766747,0.432918231,-0.057150578,-0.149769316,0.355746931,-0.029641221,0.068914672
verstaendnislos,0.403854185,-0.11127885,-0.2427587,1.0,0.11606762,-0.180377359,0.35406261,0.30829419,-0.19949929,-0.33903014,...,-0.417545206,0.05919293,-0.122834232,0.259662572,-0.233753827,0.167544599,0.164972533,-0.230453603,0.027617397,-0.185613271
unentschieden,0.087679033,-0.52965933,-0.10020603,0.11606762,1.0,0.016590829,0.05344833,0.18571031,-0.02242096,-0.07770458,...,-0.097220427,0.01600623,0.131372191,0.13893349,-0.151294599,-0.094199855,0.074322882,-0.233724896,0.183172509,0.063559463
unaufdringlich,-0.166469977,-0.01903539,0.12180206,-0.18037736,0.01659083,1.0,-0.07697225,-0.04376478,-0.04492691,0.2304276,...,0.248572167,0.04546671,0.305441594,-0.001665409,0.053478328,-0.2693011,-0.214831153,0.133183077,-0.038958573,0.288411794
distanziert,0.408489556,-0.08655974,-0.29056356,0.35406261,0.05344833,-0.076972246,1.0,0.31731577,-0.32049448,-0.29321416,...,-0.356172563,0.034853823,-0.071843257,0.30702743,-0.285367214,0.102239249,0.159865988,-0.160395525,-0.052114927,-0.087011537
gelangweilt,0.290827703,-0.17961159,-0.2787823,0.30829419,0.18571031,-0.043764781,0.31731577,1.0,-0.34009931,-0.28271314,...,-0.362389781,0.067609021,-0.031869633,0.447042773,-0.363529044,0.100128547,0.13130811,-0.255573896,0.02159508,-0.05614068
emotional,-0.246668896,0.05933779,0.27818927,-0.19949929,-0.02242096,-0.044926908,-0.32049448,-0.34009931,1.0,0.18980692,...,0.265776598,-0.072031656,0.031715392,-0.318036928,0.319307767,-0.056733091,-0.060952902,0.119826395,0.08874574,0.020503584
nicht.genervt,-0.31243944,0.07908251,0.22421286,-0.33903014,-0.07770458,0.230427602,-0.29321416,-0.28271314,0.18980692,1.0,...,0.43751393,-0.026088581,0.196831651,-0.228730264,0.218039786,-0.218006682,-0.178022613,0.182596109,0.014445255,0.240412483


In [19]:
# Show the p-values of the Kendall correlations; p < 0.05 is treated as a
# significant correlation. The matrix is not symmetric: per the psych
# documentation the entries above the diagonal are adjusted for multiple
# tests, while those below the diagonal are the raw p-values.
as.data.frame(corr_data_kendall[["p"]])

Unnamed: 0,unsympatisch,sicher,attraktiv,verstaendnislos,unentschieden,unaufdringlich,distanziert,gelangweilt,emotional,nicht.genervt,...,freundlich,maennlich,gehorsam,gleichgueltig,interessant,zynisch,aufgesetzt,intelligent,kindlich,bescheiden
unsympatisch,0.0,8.929402e-13,3.591648e-186,3.621729e-174,6.676812e-07,5.9289960000000004e-27,1.35399e-178,3.810017e-86,5.187136e-61,3.1740679999999997e-100,...,1.2519550000000001e-189,1.0,4.08096e-09,3.2523279999999996e-63,5.827994999999999e-122,4.064874e-14,2.59446e-37,1.067601e-78,1.0,1.154635e-23
sicher,3.659591e-15,0.0,2.572695e-30,1.460453e-11,0.0,1.0,1.036644e-06,1.425804e-31,0.008476577,1.775319e-05,...,1.222906e-10,1.0,4.606643e-32,7.131121e-16,4.2741229999999997e-42,4.1721550000000004e-18,1.306675e-07,3.3716860000000005e-75,1.962368e-52,3.47633e-10
attraktiv,6.626657e-189,8.219471e-33,0.0,5.139758e-59,3.061646e-09,5.18383e-14,5.55755e-86,8.103147999999999e-79,1.8103899999999998e-78,5.105354999999999e-50,...,6.115344e-105,0.001023382,1.0,1.813148e-56,4.078379e-203,0.01464995,1.2441299999999999e-21,3.2470679999999997e-132,1.0,0.0005446107
verstaendnislos,6.731839000000001e-177,6.295054e-14,1.291397e-61,0.0,1.21128e-12,7.507967e-32,7.150603e-131,1.975343e-97,3.030435e-39,2.969239e-119,...,1.9010539999999998e-187,0.008695465,2.905058e-14,6.410935e-68,1.511744e-54,2.5755600000000002e-27,1.881815e-26,5.910007e-53,1.0,8.626971e-34
unentschieden,3.551496e-09,0.0,1.464902e-11,5.005288e-15,0.0,1.0,0.03675685,7.949435e-34,1.0,2.898019e-05,...,1.179992e-08,1.0,1.918296e-16,1.71941e-18,4.3079210000000005e-22,4.409605e-08,9.486105e-05,1.557564e-54,7.061644000000001e-33,0.002723079
unaufdringlich,1.924999e-29,0.2007607,2.098717e-16,2.353595e-34,0.2648239,0.0,3.737834e-05,0.3091772,0.2520794,6.06633e-53,...,5.342695e-62,0.2267415,1.554776e-95,1.0,0.03675685,2.657494e-73,9.085995e-46,6.317266e-17,0.7491544,1.20197e-84
distanziert,2.512041e-181,5.60348e-09,1.240524e-88,1.399335e-133,0.0003249407,2.211736e-07,0.0,1.428732e-103,8.756583000000001e-106,1.21823e-87,...,1.48417e-132,1.0,0.0002136041,1.38049e-96,8.865776e-83,1.196444e-09,8.834589e-25,5.963661e-25,0.05027198,8.722769e-07
gelangweilt,8.485561e-89,4.4978039999999995e-34,1.867085e-81,4.229856e-100,2.387218e-36,0.003254496,3.001538e-106,0.0,4.665449e-120,3.5930200000000003e-81,...,1.367438e-137,0.0008252272,1.0,3.020865e-218,1.589026e-138,3.159183e-09,1.987115e-16,1.056345e-65,1.0,0.01865287
emotional,1.280774e-63,6.57099e-05,4.2004410000000002e-81,8.633718999999999e-42,0.1318137,0.002520794,1.816718e-108,9.406147e-123,0.0,2.161185e-35,...,2.6208e-71,0.0002028725,1.0,4.537671e-104,5.924687e-105,0.01601659,0.005491412,1.566617e-13,4.323614e-07,1.0
nicht.genervt,6.724721000000001e-103,1.020298e-07,1.350623e-52,6.022797999999999e-122,1.694748e-07,1.571588e-55,2.6951999999999997e-90,8.222015e-84,6.375177e-38,0.0,...,5.836499e-208,1.0,3.6557409999999996e-38,3.867275e-52,3.386465e-47,3.495515e-47,5.3817240000000005e-31,1.152235e-32,1.0,7.790802e-58


In [20]:
# The correlation matrix is symmetric, so keep only the cells strictly above
# the main diagonal; otherwise every pair would be listed twice.
include_kendall <- upper.tri(corr_data_kendall$r)

# Long-format table: one row per pair of characteristics with Kendall's tau
# (stored in the column `tao_kendall`) and its p-value. Because corr.test()
# places the multiplicity-adjusted p-values above the diagonal, selecting the
# upper triangle picks up the adjusted values, not the raw ones.
correlation_pairs_kendall <- data.frame(
  Speaker_Characteristic_1 =
    rownames(corr_data_kendall$r)[row(include_kendall)[include_kendall]],
  Speaker_Characteristic_2 =
    rownames(corr_data_kendall$r)[col(include_kendall)[include_kendall]],
  tao_kendall = corr_data_kendall$r[include_kendall],
  p_value = corr_data_kendall$p[include_kendall]
)

# Treat p < 0.05 as a significant correlation and list the strongest positive
# correlations first (negative ones end up at the bottom).
significant_pairs_kendall <- correlation_pairs_kendall %>%
  filter(p_value < 0.05) %>%
  arrange(desc(tao_kendall))
significant_pairs_kendall

Speaker_Characteristic_1,Speaker_Characteristic_2,tao_kendall,p_value
mitfuehlend,herzlich,0.5894338,0.000000e+00
herzlich,freundlich,0.5086782,2.656291e-293
attraktiv,angenehm,0.4770733,6.782612e-253
gelangweilt,gleichgueltig,0.4470428,3.020865e-218
angenehm,freundlich,0.4407402,2.091763e-211
nicht.genervt,freundlich,0.4375139,5.836499e-208
attraktiv,interessant,0.4329182,4.078379e-203
entspannt,ruhig,0.4264313,2.099992e-196
mitfuehlend,freundlich,0.4257185,1.122252e-195
unsympatisch,distanziert,0.4084896,1.353990e-178
