In [None]:
%matplotlib inline  
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample

from xgboost import XGBClassifier


In [None]:
# 5-fold cross-validated accuracy of four classifiers on class-balanced data,
# repeated for several packet counts per sample.
# all_results[i, j] holds the mean accuracy (in %) for packet count i and
# model j, with columns ordered RFC, KNN, SVM, XGB.
pkt_counts = [10, 20, 50]
sel_apps = ['dropbox', 'facebook', 'gmail', 'instagram', 'spotify', 'twitter', 'youtube']
features = ['dl_iat_mean', 'dl_iat_min','dl_iat_max','dl_iat_std', 'dl_bps', 'dl_npkt_ps']
model_names = ['RFC', 'KNN', 'SVM', 'XGB']  # column order of all_results

all_results = np.zeros((len(pkt_counts), len(model_names)))
for idx, npkt in enumerate(pkt_counts):
    # Read every per-app CSV exactly once.
    app_frames = {app: pd.read_csv('{}_{}pkt_dl.csv'.format(app, npkt)) for app in sel_apps}
    nsamples_per_class = {app: frame.shape[0] for app, frame in app_frames.items()}
    # Down-sample every class to the size of the smallest one.
    nsample = min(nsamples_per_class.values())
    # Build the balanced frame in a single concat (DataFrame.append was
    # removed in pandas 2.0 and is quadratic when used in a loop).
    df = pd.concat([
        resample(frame, replace=False, n_samples=nsample, random_state=7)
        for frame in app_frames.values()
    ])
    X = df[features]
    le = LabelEncoder()
    Y = le.fit_transform(df['app'])

    print('-------------')
    print('n = {}'.format(npkt))
    kfold = KFold(n_splits=5, shuffle = True, random_state=7)
    models = {
        'RFC': RandomForestClassifier(random_state=7),  # seeded for reproducible runs
        'KNN': KNeighborsClassifier(),
        'SVM': svm.SVC(decision_function_shape='ovo'),
        'XGB': XGBClassifier(),
    }
    for col, name in enumerate(model_names):
        # The scaler lives inside the pipeline so it is re-fitted on the
        # training folds only; fitting it on the whole dataset beforehand
        # leaks test-fold statistics into training.
        clf = make_pipeline(StandardScaler(), models[name])
        scores = cross_val_score(clf, X, Y, cv=kfold, n_jobs = kfold.n_splits)
        print("%s Accuracy: %.2f%% (%.2f%%)" % (name, scores.mean()*100, scores.std()*100))
        all_results[idx, col] = scores.mean()*100

In [None]:
def autolabel(rects, xpos='center', ax=None):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar artists exposing ``get_x()``, ``get_width()`` and ``get_height()``
        (e.g. the container returned by ``ax.bar``).
    xpos : str
        Which side to place the text w.r.t. the center of the bar. One of
        {'center', 'right', 'left'} (case-insensitive). Any other value
        raises ``KeyError`` when the first bar is labelled.
    ax : object, optional
        Axes-like object whose ``text`` method draws the labels. Defaults to
        the current matplotlib axes, so the function no longer depends on a
        global variable named ``ax`` being defined.
    """
    if ax is None:
        ax = plt.gca()

    xpos = xpos.lower()  # normalize the case of the parameter
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for rect in rects:
        height = rect.get_height()
        # Place the label 1% above the top of the bar, one decimal place.
        ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                '%.1f'% height, ha=ha[xpos], va='bottom')
   
# Grouped bar chart of mean cross-validated accuracy per model and packet count.
# Tick labels must follow the column order in which all_results was filled:
# 0 = RFC, 1 = KNN, 2 = SVM, 3 = XGBoost. The previous label list had KNN and
# SVM swapped, so those two groups were captioned with each other's name.
labels = ['RFC','KNN','SVM','XGBoost']
x = np.arange(len(labels))  # the label locations
width = 0.22  # the width of the bars
fig, ax = plt.subplots()
# One bar series per packet count (rows of all_results), side by side.
rects1 = ax.bar(x - width, all_results[0,:], width, label='n = 10')
rects2 = ax.bar(x, all_results[1,:], width, label='n = 20')
rects3 = ax.bar(x + width, all_results[2,:], width, label='n = 50')


# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
fig.tight_layout()
plt.show()

In [None]:
# plotting function inspired by https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/barchart.html
# NOTE(review): this redefines the autolabel helper already defined in the
# previous plotting cell; the two definitions are kept equivalent.
def autolabel(rects, xpos='center', ax=None):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar artists exposing ``get_x()``, ``get_width()`` and ``get_height()``
        (e.g. the container returned by ``ax.bar``).
    xpos : str
        Which side to place the text w.r.t. the center of the bar. One of
        {'center', 'right', 'left'} (case-insensitive). Any other value
        raises ``KeyError`` when the first bar is labelled.
    ax : object, optional
        Axes-like object whose ``text`` method draws the labels. Defaults to
        the current matplotlib axes, so the function no longer depends on a
        global variable named ``ax`` being defined.
    """
    if ax is None:
        ax = plt.gca()

    xpos = xpos.lower()  # normalize the case of the parameter
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for rect in rects:
        height = rect.get_height()
        # Place the label 1% above the top of the bar, one decimal place.
        ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                '%.1f'% height, ha=ha[xpos], va='bottom')
# Publication version of the accuracy bar chart, with models shown in the
# order RFC, XGBoost, SVM, KNN.
# all_results columns are filled as 0 = RFC, 1 = KNN, 2 = SVM, 3 = XGBoost, so
# the matching column permutation is [0, 3, 2, 1]. The previous [0, 3, 1, 2]
# put the KNN scores under the 'SVM' tick and vice versa.
reordered_results = all_results[:,np.array([0,3,2,1])]
labels = ['RFC','XGBoost','SVM','KNN']
x = np.arange(len(labels))  # the label locations
width = 0.27  # the width of the bars
fig, ax = plt.subplots()
# One bar series per packet count (rows of all_results), side by side.
rects1 = ax.bar(x - width, reordered_results[0,:], width, label='n = 10')
rects2 = ax.bar(x, reordered_results[1,:], width, label='n = 20')
rects3 = ax.bar(x + width, reordered_results[2,:], width, label='n = 50')


# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy (%)')
ax.set_ylim([0,95])
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
fig.tight_layout()
plt.savefig('Classification_7apps_10_20_50pkt_cross_eval_4models.eps', format='eps', dpi=300)


In [None]:
# Train a random forest on the class-balanced 50-packet dataset and save the
# row-normalised confusion matrix of a held-out 30% test split as EPS.
npkt = 50
# Capitalised names are used for display only.
sel_apps = ['Dropbox', 'Facebook', 'Gmail', 'Instagram', 'Spotify', 'Twitter', 'YouTube']
# The CSV files are named with lower-case app names (that is how the
# cross-validation cell reads them), so lower-case the name when building the
# path; using the display name fails on case-sensitive filesystems.
# Each file is read exactly once.
app_frames = {app: pd.read_csv('{}_{}pkt_dl.csv'.format(app.lower(), npkt)) for app in sel_apps}
nsamples_per_class = {app: frame.shape[0] for app, frame in app_frames.items()}
# Down-sample every class to the size of the smallest one.
nsample = min(nsamples_per_class.values())
# Single concat instead of DataFrame.append in a loop (removed in pandas 2.0).
df = pd.concat([
    resample(frame, replace=False, n_samples=nsample, random_state=7)
    for frame in app_frames.values()
])
print(df.shape[0])
features = ['dl_iat_mean', 'dl_iat_min','dl_iat_max','dl_iat_std', 'dl_bps', 'dl_npkt_ps']
X = df[features]
scaler = StandardScaler()
X = scaler.fit_transform(X)
le = LabelEncoder()
Y = le.fit_transform(df['app'])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)
rfc = RandomForestClassifier(random_state=7)  # seeded so the saved figure is reproducible
rfc.fit(X_train,y_train)

plt.rcParams.update({'font.size': 16})
fig, ax = plt.subplots(figsize=(10,10))
# display_labels assumes sel_apps is in the same order as the encoded classes
# (le.classes_) — TODO confirm against the values of the 'app' column.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
plot_confusion_matrix(rfc,X_test,y_test,display_labels = sel_apps, cmap=plt.cm.Blues, 
                      normalize='true',xticks_rotation='vertical',values_format='.2g', ax=ax)
# plt.show()
plt.savefig('Classification_7apps_50pkt_temporal_features_confusion_matrix.eps', format='eps', dpi=300)

In [None]:
# Same experiment as the previous confusion-matrix cell, drawn on a smaller
# 6x6 canvas: train a random forest on the class-balanced 50-packet dataset
# and save the row-normalised confusion matrix of a 30% test split as EPS.
npkt = 50
# Capitalised names are used for display only.
sel_apps = ['Dropbox', 'Facebook', 'Gmail', 'Instagram', 'Spotify', 'Twitter', 'YouTube']
# The CSV files are named with lower-case app names (that is how the
# cross-validation cell reads them), so lower-case the name when building the
# path; using the display name fails on case-sensitive filesystems.
# Each file is read exactly once.
app_frames = {app: pd.read_csv('{}_{}pkt_dl.csv'.format(app.lower(), npkt)) for app in sel_apps}
nsamples_per_class = {app: frame.shape[0] for app, frame in app_frames.items()}
# Down-sample every class to the size of the smallest one.
nsample = min(nsamples_per_class.values())
# Single concat instead of DataFrame.append in a loop (removed in pandas 2.0).
df = pd.concat([
    resample(frame, replace=False, n_samples=nsample, random_state=7)
    for frame in app_frames.values()
])
print(df.shape[0])
features = ['dl_iat_mean', 'dl_iat_min','dl_iat_max','dl_iat_std', 'dl_bps', 'dl_npkt_ps']
X = df[features]
scaler = StandardScaler()
X = scaler.fit_transform(X)
le = LabelEncoder()
Y = le.fit_transform(df['app'])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)
rfc = RandomForestClassifier(random_state=7)  # seeded so the saved figure is reproducible
rfc.fit(X_train,y_train)
fig, ax = plt.subplots(figsize=(6,6))
# display_labels assumes sel_apps is in the same order as the encoded classes
# (le.classes_) — TODO confirm against the values of the 'app' column.
plot_confusion_matrix(rfc,X_test,y_test,display_labels = sel_apps, cmap=plt.cm.Blues, 
                      normalize='true',xticks_rotation='vertical',values_format='.2g', ax=ax)
# plt.show()
plt.savefig('Classification_7apps_50pkt_temporal_features_confusion_matrix.eps', format='eps', dpi=300, transparent=False)

In [None]:
# Redraw the confusion matrix of the already-fitted random forest on an 8x6
# canvas and export it as PDF. Relies on rfc, X_test, y_test and sel_apps from
# the preceding training cell.
fig, ax = plt.subplots(figsize=(8, 6))
_cm_options = dict(
    display_labels=sel_apps,
    cmap=plt.cm.Blues,
    normalize='true',            # each row sums to 1
    xticks_rotation='vertical',
    values_format='.2g',
    ax=ax,
)
plot_confusion_matrix(rfc, X_test, y_test, **_cm_options)
plt.savefig(
    'Classification_7apps_50pkt_temporal_features_confusion_matrix.pdf',
    format='pdf',
    dpi=300,
)

In [None]:
# Final EPS export: 16 pt font, 10x10 canvas, and a tight bounding box when
# saving. Relies on rfc, X_test, y_test and sel_apps from the preceding
# training cell.
plt.rcParams['font.size'] = 16
fig, ax = plt.subplots(figsize=(10, 10))
_cm_options = dict(
    display_labels=sel_apps,
    cmap=plt.cm.Blues,
    normalize='true',            # each row sums to 1
    xticks_rotation='vertical',
    values_format='.2g',
    ax=ax,
)
plot_confusion_matrix(rfc, X_test, y_test, **_cm_options)
plt.savefig(
    'Classification_7apps_50pkt_temporal_features_confusion_matrix.eps',
    format='eps',
    dpi=300,
    bbox_inches='tight',
)

In [None]:
# Display the active matplotlib rcParams as the cell output (e.g. to check the
# 'font.size' value updated in the plotting cells above).
plt.rcParams