# Throughput predictability analysis

In [68]:
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Location of the experiment summary log, relative to this notebook.
IN_FNAME = "../data/feup-exp-201901/summary/final-exp-log.csv"

# Read the log and keep only the rows recorded while iperf was running.
# Line format, as documented by the original author:
#   senderId,receiverId,systime,receiverX,receiverY,receiverAlt,
#   receiverSpeed,channelFreq,channelBw,chanUtil,isInLap,isIperfOn,
#   isDataReceived,rssiMean,dataRateMean,nBytesReceived
# NOTE(review): the correlation tables further down list `receiverDist` and
# `channelUtil` columns, so this list may be stale -- confirm against the CSV header.
dataset = pandas.read_csv(IN_FNAME).loc[lambda frame: frame['isIperfOn'] == 1]

## Let's start by analyzing the correlation between the different features in our data set

In [69]:
# Build the frame used for the correlation matrices in the following cells.
# (A notebook only displays the value of a cell's last expression, which is why
# a helper that builds the styled matrix would have to return the Styler -- or
# call display() -- for it to show up.)

# Keep only rows sent by ap3/ap4 (filter out non-colocated APs), then drop the
# columns that are not needed for the correlation analysis.
# One non-mutating drop() replaces the repeated `inplace=True` drops: dropping in
# place on a frame obtained by filtering `dataset` can raise SettingWithCopyWarning,
# and building a new frame makes it explicit that `dataset` is left untouched.
corDs = (
    dataset
    .loc[dataset['senderId'].isin(('ap3', 'ap4'))]
    .drop(["systime", "isInLap", "senderId", "receiverId", "isIperfOn"], axis=1)
)

#corDs # uncomment if you want to print it out

In [70]:
# Pearson correlation: measures the strength of the *linear* relationship between
# each pair of columns (best suited to roughly normally-distributed variables).
corMat = corDs.corr(method='pearson')
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0. An explicit
# format string works on both old and new pandas and renders two significant digits,
# matching the stored output (e.g. -0.068).
corMat.style.background_gradient(cmap='coolwarm', axis=None).format("{:.2g}")

Unnamed: 0,receiverDist,receiverX,receiverY,receiverAlt,receiverSpeed,channelFreq,channelBw,channelUtil,rssiMean,dataRateMean,nBytesReceived
receiverDist,1.0,0.98,-0.48,0.18,-0.068,-0.031,-0.031,-0.26,-0.46,-0.47,-0.42
receiverX,0.98,1.0,-0.6,0.22,-0.094,-0.039,-0.039,-0.25,-0.44,-0.47,-0.42
receiverY,-0.48,-0.6,1.0,-0.52,0.33,0.059,0.059,0.062,0.16,0.28,0.26
receiverAlt,0.18,0.22,-0.52,1.0,-0.75,-0.065,-0.065,0.026,-0.22,-0.29,-0.23
receiverSpeed,-0.068,-0.094,0.33,-0.75,1.0,0.066,0.066,-0.063,0.094,0.19,0.2
channelFreq,-0.031,-0.039,0.059,-0.065,0.066,1.0,1.0,0.33,0.39,0.22,0.46
channelBw,-0.031,-0.039,0.059,-0.065,0.066,1.0,1.0,0.33,0.39,0.22,0.46
channelUtil,-0.26,-0.25,0.062,0.026,-0.063,0.33,0.33,1.0,0.52,0.24,0.3
rssiMean,-0.46,-0.44,0.16,-0.22,0.094,0.39,0.39,0.52,1.0,0.7,0.58
dataRateMean,-0.47,-0.47,0.28,-0.29,0.19,0.22,0.22,0.24,0.7,1.0,0.83


In [71]:
# Kendall correlation: non-parametric and rank based (looks at how increasing the
# rank of one variable changes the rank of another). Spearman is the alternative;
# it gives higher values but is more error prone.
corMat = corDs.corr(method='kendall')
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0. An explicit
# format string works on both old and new pandas and renders two significant digits,
# matching the stored output (e.g. -0.057).
corMat.style.background_gradient(cmap='coolwarm', axis=None).format("{:.2g}")

Unnamed: 0,receiverDist,receiverX,receiverY,receiverAlt,receiverSpeed,channelFreq,channelBw,channelUtil,rssiMean,dataRateMean,nBytesReceived
receiverDist,1.0,0.91,-0.32,0.16,-0.057,-0.025,-0.025,-0.11,-0.42,-0.29,-0.26
receiverX,0.91,1.0,-0.33,0.14,-0.036,-0.03,-0.03,-0.11,-0.4,-0.28,-0.26
receiverY,-0.32,-0.33,1.0,-0.52,0.28,0.05,0.05,0.063,0.24,0.22,0.18
receiverAlt,0.16,0.14,-0.52,1.0,-0.37,-0.049,-0.049,-0.059,-0.21,-0.24,-0.2
receiverSpeed,-0.057,-0.036,0.28,-0.37,1.0,0.043,0.043,0.036,0.1,0.096,0.092
channelFreq,-0.025,-0.03,0.05,-0.049,0.043,1.0,1.0,0.47,0.31,0.2,0.38
channelBw,-0.025,-0.03,0.05,-0.049,0.043,1.0,1.0,0.47,0.31,0.2,0.38
channelUtil,-0.11,-0.11,0.063,-0.059,0.036,0.47,0.47,1.0,0.34,0.27,0.39
rssiMean,-0.42,-0.4,0.24,-0.21,0.1,0.31,0.31,0.34,1.0,0.64,0.64
dataRateMean,-0.29,-0.28,0.22,-0.24,0.096,0.2,0.2,0.27,0.64,1.0,0.79


In [72]:
# Spearman correlation: non-parametric and rank based (looks at how increasing the
# rank of one variable changes the rank of another). Compared to Kendall, it gives
# higher values but is more error prone.
corMat = corDs.corr(method='spearman')
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0. An explicit
# format string works on both old and new pandas and renders two significant digits,
# matching the stored output (e.g. -0.084).
corMat.style.background_gradient(cmap='coolwarm', axis=None).format("{:.2g}")

Unnamed: 0,receiverDist,receiverX,receiverY,receiverAlt,receiverSpeed,channelFreq,channelBw,channelUtil,rssiMean,dataRateMean,nBytesReceived
receiverDist,1.0,0.98,-0.5,0.3,-0.084,-0.03,-0.03,-0.17,-0.54,-0.41,-0.37
receiverX,0.98,1.0,-0.5,0.29,-0.066,-0.036,-0.036,-0.17,-0.53,-0.41,-0.37
receiverY,-0.5,-0.5,1.0,-0.68,0.43,0.061,0.061,0.088,0.33,0.31,0.25
receiverAlt,0.3,0.29,-0.68,1.0,-0.54,-0.06,-0.06,-0.086,-0.31,-0.35,-0.29
receiverSpeed,-0.084,-0.066,0.43,-0.54,1.0,0.05,0.05,0.052,0.15,0.14,0.13
channelFreq,-0.03,-0.036,0.061,-0.06,0.05,1.0,1.0,0.58,0.37,0.24,0.45
channelBw,-0.03,-0.036,0.061,-0.06,0.05,1.0,1.0,0.58,0.37,0.24,0.45
channelUtil,-0.17,-0.17,0.088,-0.086,0.052,0.58,0.58,1.0,0.49,0.38,0.54
rssiMean,-0.54,-0.53,0.33,-0.31,0.15,0.37,0.37,0.49,1.0,0.81,0.82
dataRateMean,-0.41,-0.41,0.31,-0.35,0.14,0.24,0.24,0.38,0.81,1.0,0.92


### Results discussion
* RSSI is reasonably correlated with throughput.
* Distance is reasonably negatively correlated with throughput, as expected (-0.42 Pearson, -0.26 Kendall, -0.37 Spearman).
* RSSI and distance correlate pretty well, and negatively: RSSI drops as distance grows (-0.46 Pearson, -0.42 Kendall, -0.54 Spearman).
* Channel utilization is kind of useless in this data set because there is only one client per channel.
* Data rate is pretty much a dead ringer for throughput (0.83 Pearson, 0.79 Kendall, 0.92 Spearman). I'm thinking we should actually focus on predicting data rate rather than throughput. Data rate should be easier to predict, as it's independent of the amount of data the client has to send.