In [3]:
"""
@author: Yuqiang (Ethan) Heng
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample

from sklearn.model_selection import cross_val_score, KFold, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import plot_confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from xgboost import XGBClassifier

df_all = pd.read_csv('./Processed Data/random_scenario_bi_flow_features.csv',low_memory=False).dropna().drop('Unnamed: 0',axis=1)

dl_features = [f for f in df_all.columns.values if f.split('_')[0] == 'dl']
ul_features = [f for f in df_all.columns.values if f.split('_')[0] == 'ul']
features = np.concatenate((dl_features,ul_features))

print('------ action classification -------')
for app_name in df_all['app'].unique():
    df = df_all[df_all['app']==app_name]
    if len(df['action'].unique())==1:
        continue
    print('------ {} -------'.format(app_name))
    X = df[dl_features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    le = LabelEncoder()
    Y = le.fit_transform(df['action'])
    
    kfold = KFold(n_splits=10, shuffle = True, random_state=7)
    rfc = RandomForestClassifier()
    rfc_results = cross_val_score(rfc, X, Y, cv=kfold, n_jobs = kfold.n_splits)
    print("RFC Accuracy: %.2f%% (%.2f%%)" % (rfc_results.mean()*100, rfc_results.std()*100))
    
    knn = KNeighborsClassifier()
    knn_results = cross_val_score(knn, X, Y, cv=kfold, n_jobs = kfold.n_splits)    
    print("KNN Accuracy: %.2f%% (%.2f%%)" % (knn_results.mean()*100, knn_results.std()*100))
    
    xgb = XGBClassifier()
    xgb_results = cross_val_score(xgb, X, Y, cv=kfold, n_jobs = kfold.n_splits)
    print("XGB Accuracy: %.2f%% (%.2f%%)" % (xgb_results.mean()*100, xgb_results.std()*100))

------ action classification -------
------ dropbox -------
RFC Accuracy: 81.55% (5.34%)
KNN Accuracy: 74.42% (8.10%)
XGB Accuracy: 81.27% (8.12%)
------ facebook -------
RFC Accuracy: 65.63% (9.63%)
KNN Accuracy: 70.80% (8.37%)
XGB Accuracy: 62.81% (11.84%)
------ gmail -------
RFC Accuracy: 81.05% (8.22%)
KNN Accuracy: 78.42% (7.61%)
XGB Accuracy: 82.63% (10.27%)
------ google-drive -------
RFC Accuracy: 73.55% (10.96%)
KNN Accuracy: 66.09% (5.58%)
XGB Accuracy: 67.91% (10.07%)
------ hulu -------
RFC Accuracy: 93.89% (3.52%)
KNN Accuracy: 91.11% (4.57%)
XGB Accuracy: 92.64% (3.46%)
------ instagram -------
RFC Accuracy: 98.27% (2.65%)
KNN Accuracy: 97.22% (3.73%)
XGB Accuracy: 95.95% (5.89%)
------ netflix -------
RFC Accuracy: 84.96% (6.94%)
KNN Accuracy: 80.22% (8.78%)
XGB Accuracy: 87.27% (7.67%)
------ pandora -------
RFC Accuracy: 73.34% (4.35%)
KNN Accuracy: 68.28% (4.01%)
XGB Accuracy: 73.26% (4.36%)
------ reddit -------
RFC Accuracy: 69.83% (6.41%)
KNN Accuracy: 70.20% (7.1