# Throughput predictability analysis

In [261]:
import pandas

IN_FNAME = "../data/feup-exp-201901/summary/final-exp-log.csv"

# load dataset
# line format: senderId,receiverId,systime,receiverX,receiverY,receiverAlt,
#              receiverSpeed,channelFreq,channelBw,chanUtil,isInLap,isIperfOn,
#              isDataReceived,rssiMean,dataRateMean,nBytesReceived
dataset = pandas.read_csv(IN_FNAME)
dataset = dataset.loc[dataset['isIperfOn'] == 1] # filter out periods when iperf was off
dataset = dataset.loc[dataset['senderId'].isin(('ap1','ap2'))] # filter out non-colocated APs
dataset = dataset.reset_index(drop=True)

## Feature correlation analysis

In [262]:
# let us print out a correlation matrix (weirdly, can't farm this out to a method
# because then it won't show in the notebook)

# prepare a data set for correlation analysis
corDs = dataset.copy()

# drop unneeded columns
corDs.drop(['systime', 'isInLap', 'senderId', 'receiverId', 'isIperfOn'], axis=1, inplace=True)

#corDs # uncomment if you want to print it out

In [263]:
# show correlation matrix for pearson (measures linear relationship between normally-distributed variables)
corMat = corDs.corr(method='pearson') 
corMat.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

Unnamed: 0,receiverDist,receiverX,receiverY,receiverAlt,receiverSpeed,channelFreq,channelBw,channelUtil,rssiMean,dataRateMean,nBytesReceived
receiverDist,1.0,0.2,0.35,-0.35,0.43,-0.0028,-0.0028,-0.031,-0.63,-0.53,-0.52
receiverX,0.2,1.0,-0.63,0.23,-0.11,-0.014,-0.014,-0.1,-0.06,0.038,0.071
receiverY,0.35,-0.63,1.0,-0.51,0.34,0.018,0.018,0.11,-0.35,-0.33,-0.38
receiverAlt,-0.35,0.23,-0.51,1.0,-0.74,-0.055,-0.055,-0.12,0.34,0.35,0.49
receiverSpeed,0.43,-0.11,0.34,-0.74,1.0,0.063,0.063,0.12,-0.37,-0.45,-0.53
channelFreq,-0.0028,-0.014,0.018,-0.055,0.063,1.0,1.0,-0.14,0.014,0.42,0.37
channelBw,-0.0028,-0.014,0.018,-0.055,0.063,1.0,1.0,-0.14,0.014,0.42,0.37
channelUtil,-0.031,-0.1,0.11,-0.12,0.12,-0.14,-0.14,1.0,0.37,-0.28,-0.15
rssiMean,-0.63,-0.06,-0.35,0.34,-0.37,0.014,0.014,0.37,1.0,0.55,0.58
dataRateMean,-0.53,0.038,-0.33,0.35,-0.45,0.42,0.42,-0.28,0.55,1.0,0.86


In [264]:
corMat = corDs.corr(method='kendall') # non-parametric, rank based (sees how increasing rank of one variable changes rank of another)
                                      # spearman is alternative, gives higher values but is more error prone
corMat.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

Unnamed: 0,receiverDist,receiverX,receiverY,receiverAlt,receiverSpeed,channelFreq,channelBw,channelUtil,rssiMean,dataRateMean,nBytesReceived
receiverDist,1.0,0.31,0.16,-0.17,0.19,0.001,0.001,-0.026,-0.52,-0.36,-0.33
receiverX,0.31,1.0,-0.35,0.16,-0.044,-0.011,-0.011,-0.059,-0.073,-0.0039,0.014
receiverY,0.16,-0.35,1.0,-0.51,0.3,0.036,0.036,0.05,-0.24,-0.22,-0.25
receiverAlt,-0.17,0.16,-0.51,1.0,-0.38,-0.039,-0.039,-0.069,0.24,0.21,0.25
receiverSpeed,0.19,-0.044,0.3,-0.38,1.0,0.052,0.052,0.067,-0.24,-0.23,-0.24
channelFreq,0.001,-0.011,0.036,-0.039,0.052,1.0,1.0,-0.047,-0.13,0.3,0.3
channelBw,0.001,-0.011,0.036,-0.039,0.052,1.0,1.0,-0.047,-0.13,0.3,0.3
channelUtil,-0.026,-0.059,0.05,-0.069,0.067,-0.047,-0.047,1.0,0.16,-0.13,-0.029
rssiMean,-0.52,-0.073,-0.24,0.24,-0.24,-0.13,-0.13,0.16,1.0,0.47,0.49
dataRateMean,-0.36,-0.0039,-0.22,0.21,-0.23,0.3,0.3,-0.13,0.47,1.0,0.78


In [265]:
corMat = corDs.corr(method='spearman') # non-parametric, rank based (sees how increasing rank of one variable changes rank of another)
                                      # compared to kendall, gives higher values but is more error prone
corMat.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

Unnamed: 0,receiverDist,receiverX,receiverY,receiverAlt,receiverSpeed,channelFreq,channelBw,channelUtil,rssiMean,dataRateMean,nBytesReceived
receiverDist,1.0,0.3,0.22,-0.22,0.28,0.0013,0.0013,-0.05,-0.71,-0.52,-0.49
receiverX,0.3,1.0,-0.52,0.31,-0.08,-0.013,-0.013,-0.096,-0.11,-0.01,0.0049
receiverY,0.22,-0.52,1.0,-0.68,0.44,0.044,0.044,0.078,-0.34,-0.31,-0.36
receiverAlt,-0.22,0.31,-0.68,1.0,-0.55,-0.047,-0.047,-0.099,0.35,0.32,0.37
receiverSpeed,0.28,-0.08,0.44,-0.55,1.0,0.061,0.061,0.089,-0.39,-0.35,-0.37
channelFreq,0.0013,-0.013,0.044,-0.047,0.061,1.0,1.0,-0.057,-0.15,0.37,0.36
channelBw,0.0013,-0.013,0.044,-0.047,0.061,1.0,1.0,-0.057,-0.15,0.37,0.36
channelUtil,-0.05,-0.096,0.078,-0.099,0.089,-0.057,-0.057,1.0,0.24,-0.17,-0.056
rssiMean,-0.71,-0.11,-0.34,0.35,-0.39,-0.15,-0.15,0.24,1.0,0.64,0.67
dataRateMean,-0.52,-0.01,-0.31,0.32,-0.35,0.37,0.37,-0.17,0.64,1.0,0.93


### Results discussion
* RSSI is reasonably correlated with throughput.
* Distance is reasonably negatively-correlated with throughput, as expected.
* RSSI and distance correlate pretty well.
* Channel utilization is kind of useless in this data set because there is only one client per channel.
* Data rate is pretty much a dead ringer for throughput. I'm thinking we should actually focus on predicting data rate rather than throughput. Data rate should be easier to predict as it's independent from the amount of data the client has to send.

## Let's try it as a classification problem

In [266]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def printSeparator():
  print('----------------------------------------------------------------\n')

def runClassifiers(df, targetName, scaler=None, nquantiles=4):

  # discretize target
  df[targetName] = pandas.qcut(df[targetName], q=nquantiles, labels=False)

  #  split dataset into training and test
  x = df.drop(targetName, axis=1) # x contains all the features
  y = df[targetName] # y contains only the label

  # scaling, see: https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
  if scaler:
    x = scaler.fit_transform(x)

  # xTrain contains features for training, xTest contains features for testing
  # test_size = 0.3 means 30% data for testing
  # random_state = 1, is the seed value used by the random number generator
  xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.3, random_state = 1)

  # train models

  # Notes regarding classification report:
  #  - precision = TP / (TP + FP), or what percentage of predicted positives are true positives.
  #  - recall = TP / (TP+FN), or what percentage of actual positives were predicted to be positive.
  #  - f1-score = can be interpreted as a weighted harmonic mean of the precision and recall, 
  #               where an F-beta score reaches its best value at 1 and worst score at 0.
  #  - support = numer of occurrences of each class in yTrue.

  # logistic regression
  clfLogr = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=100000, random_state=1)
  clfLogr.fit(xTrain, yTrain)
  yPredLogr = clfLogr.predict(xTest) # predict on the unseen data
  print("Accuracy of Logistic Regression:", accuracy_score(yPredLogr, yTest))
  print(classification_report(yPredLogr, yTest))
  #print (clfLogr.coef_)
  printSeparator()
    
  # knn
  clfKnn = KNeighborsClassifier()
  clfKnn.fit(xTrain, yTrain)
  yPredKnn = clfKnn.predict(xTest) # method chainning
  print("Accuracy of KNN:", accuracy_score(yPredKnn, yTest))
  print(classification_report(yPredKnn, yTest))
  printSeparator()
    
  # random forest
  clfRf = RandomForestClassifier(n_estimators=100, random_state=1)
  clfRf.fit(xTrain, yTrain)
  yPredRf = clfRf.predict(xTest)
  print("Accuracy of Random Forest:", accuracy_score(yPredRf, yTest))
  print(classification_report(yPredRf, yTest))
  printSeparator()
    
  # decision tree    
  clfDt = DecisionTreeClassifier(random_state=1)
  yPredDt = clfDt.fit(xTrain, yTrain).predict(xTest)
  print("Accuracy of Decision Tree:", accuracy_score(yPredDt, yTest))
  print(classification_report(yPredDt, yTest))
  printSeparator()
    
  # svm
  clfSvm = svm.SVC(gamma='scale', random_state=1)
  clfSvm.fit(xTrain, yTrain)
  yPredSvm = clfSvm.predict(xTest)
  print("Accuracy of SVM:", accuracy_score(yPredSvm, yTest))
  print(classification_report(yPredSvm, yTest))    

### Let's try using the classifiers on the original features
We discretize the data rate into 4 classes corresponding to the quartiles.

In [267]:
clDf = dataset.copy() # work on copy to preserve original

# we don't care for some columns no more
clDf.drop(['senderId', 'receiverId', 'channelUtil', 'nBytesReceived'], axis=1, inplace=True)

# test drops, drop different columns to see how they affect the results
#clDf.drop(['receiverAlt', 'receiverDist'], axis=1, inplace=True)
#clDf.drop(['receiverAlt', 'receiverDist', 'receiverSpeed', 'receiverX', 'receiverY', \
#    'channelFreq', 'channelBw'], axis=1, inplace=True) # drop everything but rssiMean

#  split dataset into training and test
x = clDf.drop("dataRateMean", axis=1) # x contains all the features
y = clDf["dataRateMean"] # y contains only the label

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler() # subtract minimum and divide by range
runClassifiers(df=clDf, targetName="dataRateMean", scaler=mms, nquantiles=4)

Accuracy of Logistic Regression: 0.7987987987987988
              precision    recall  f1-score   support

           0       0.88      0.79      0.84       447
           1       0.67      0.72      0.69       384
           2       0.76      0.87      0.81       379
           3       0.88      0.82      0.85       455

    accuracy                           0.80      1665
   macro avg       0.80      0.80      0.80      1665
weighted avg       0.81      0.80      0.80      1665

----------------------------------------------------------------

Accuracy of KNN: 0.8786786786786787
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       422
           1       0.80      0.85      0.82       382
           2       0.87      0.87      0.87       435
           3       0.94      0.93      0.93       426

    accuracy                           0.88      1665
   macro avg       0.88      0.88      0.88      1665
weighted avg       0.88      0.

### Let's try using PCA to reduce dimensionality before using the classifiers
We have a lot of features, let's see if we can loose some without prejudice.

In [268]:
# separate features from target
features = ['receiverX', 'receiverY', 'receiverDist', 'receiverSpeed', 'receiverAlt', \
            'channelFreq', 'channelBw', 'rssiMean']
x = dataset.loc[:, features].values # x represents our features

# standardization, see the following articles: 
# 1. https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
# 2. https://towardsdatascience.com/a-step-by-step-explanation-of-principal-component-analysis-b836fb9c97e2
from sklearn.preprocessing import StandardScaler
x = StandardScaler().fit_transform(x) # subtract minimum and divide by range

# run PCA
from sklearn import decomposition
pca = decomposition.PCA(.95) # maintain 95% of the variance
pca.fit(x) 

print("Found", pca.n_components_, "PCA components:", pca.explained_variance_)
print("Explained variance ratios:", pca.explained_variance_ratio_, "total:", \
      sum(pca.explained_variance_ratio_))

prinComps = pca.transform(x) # apply dimensionality reduction to x

# create a data frame out of x
clPcaDf = pandas.DataFrame(data = prinComps)
clPcaDf['dataRateMean'] =  dataset['dataRateMean']

runClassifiers(df=clPcaDf, targetName="dataRateMean", scaler=None, nquantiles=4)

Found 6 PCA components: [2.83777646 2.00023362 1.50756115 0.84120669 0.38076478 0.27464296]
Explained variance ratios: [0.35465812 0.24998414 0.18841118 0.10513188 0.04758702 0.03432418] total: 0.9800965172976589
Accuracy of Logistic Regression: 0.8126126126126126
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       446
           1       0.67      0.74      0.70       373
           2       0.82      0.88      0.85       405
           3       0.90      0.85      0.87       441

    accuracy                           0.81      1665
   macro avg       0.81      0.81      0.81      1665
weighted avg       0.82      0.81      0.81      1665

----------------------------------------------------------------

Accuracy of KNN: 0.8744744744744745
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       410
           1       0.80      0.84      0.82       389
           2       0.88      0.85  

## Let's do regression now

In [269]:
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score
import numpy

def rmse(yTrue, yPred):
  """
    Compute root mean square error.
  """
  return numpy.sqrt(numpy.mean(numpy.square(yTrue - yPred)))

def runRegressions(df, targetName, scaler=None):
    
  #  split dataset into training and test
  x = df.drop(targetName, axis=1) # x contains all the features
  y = df[targetName] # y contains only the label

  # scaling, see: https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
  if scaler:
    x = scaler.fit_transform(x)

  # xTrain contains features for training, xTest contains features for testing
  # test_size = 0.3 means 30% data for testing
  # random_state = 1, is the seed value used by the random number generator
  xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.3, random_state = 1)

  # train models

  # Notes regarding classification report:
  #  - precision = TP / (TP + FP), or what percentage of predicted positives are true positives.
  #  - recall = TP / (TP+FN), or what percentage of actual positives were predicted to be positive.
  #  - f1-score = can be interpreted as a weighted harmonic mean of the precision and recall, 
  #               where an F-beta score reaches its best value at 1 and worst score at 0.
  #  - support = numer of occurrences of each class in yTrue.

  # linear regression
  regLinr = LinearRegression()
  regLinr.fit(xTrain, yTrain)
  yPredLinr = regLinr.predict(xTest) # predict on the unseen data
  print('Linear regression coefs: \n', regLinr.coef_)
  print("Root Mean Square Error: %.2f" % rmse(yTest, yPredLinr))
  print('Variance score: %.2f' % r2_score(yTest, yPredLinr)) # 1 is perfect prediction
  printSeparator()

  # svm
  regSvm = svm.SVR(gamma='scale')
  regSvm.fit(xTrain, yTrain)
  yPredSvm = regSvm.predict(xTest)
  print("Support Vector Regression")
  print("Root Mean Square Error: %.2f" % rmse(yTest, yPredSvm))
  print('Variance score: %.2f' % r2_score(yTest, yPredSvm)) # 1 is perfect prediction

### Try them regressions

In [271]:
regDf = dataset.copy() # work on copy to preserve original

# we don't care for some columns no more
regDf.drop(['senderId', 'receiverId', 'channelUtil', 'nBytesReceived'], axis=1, inplace=True)

# test drops, drop different columns to see how they affect the results
#regDf.drop(['receiverAlt', 'receiverDist'], axis=1, inplace=True)
#regDf.drop(['receiverAlt', 'receiverDist', 'receiverSpeed', 'receiverX', 'receiverY', \
#    'channelFreq', 'channelBw'], axis=1, inplace=True) # drop everything but rssiMean

#  split dataset into training and test
x = regDf.drop("dataRateMean", axis=1) # x contains all the features
y = regDf["dataRateMean"] # y contains only the label

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler() # subtract minimum and divide by range
runRegressions(df=regDf, targetName="dataRateMean", scaler=None)

Linear regression coefs: 
 [ 4.09353922e-04 -4.23809744e-01  6.13142406e-02 -4.33479668e-01
 -5.04575844e-01 -6.21209044e+00  1.73469129e-02  1.26021888e-04
 -7.64983216e+00  0.00000000e+00  1.04329142e+00]
Root Mean Square Error: 36.32
Variance score: 0.59
----------------------------------------------------------------

Support Vector Regression
Root Mean Square Error: 56.95
Variance score: -0.01
