In [1]:
import csv
import os
from os.path import join as PJOIN

import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import svm
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier



In [51]:
# Directory that holds the generated training CSVs. Every data set is stored
# as a pair of files named "X_<name>" (features) and "Y_<name>" (labels).
DATA_DIR = "DATA/GENERATED/TRAIN/"

# Base names (without the "X_"/"Y_" prefix) of the data sets to load.
# Use ["all"] to pick up every "X_*" file found in DATA_DIR instead.
DATA_FILES = [
    "train_libpng_calint.csv",
    "train_dealii_calint.csv",
    "train_server_calint.csv",
    "handcrafted.csv",
]

In [52]:
def get_all_training_data():
    """Load and stack the feature/label CSV pairs selected by DATA_FILES.

    For every selected base name ``<name>`` the files ``X_<name>`` and
    ``Y_<name>`` are read from DATA_DIR (no header row) and the rows of all
    data sets are concatenated in order.  When the first entry of DATA_FILES
    is the string ``'all'``, every file in DATA_DIR whose name starts with
    ``X_`` is used instead.

    Returns:
        (all_x, all_y): ``all_x`` is the stacked feature array and ``all_y``
        the labels flattened to one dimension.  The flattening reshapes to
        ``(n_rows,)``, so it only succeeds when each label file contributes a
        single value per row.

    Raises:
        ValueError: if no data set is selected, or if a feature file and its
            label file have a different number of rows.
    """
    if DATA_FILES and DATA_FILES[0] == 'all':
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order (and therefore any seeded split) identical across runs.
        all_files = sorted(
            name[2:] for name in os.listdir(DATA_DIR) if name.startswith('X_')
        )
    else:
        all_files = DATA_FILES

    if not all_files:
        raise ValueError("no training files selected (DATA_FILES is empty or "
                         "no 'X_*' file exists in DATA_DIR)")

    all_x = []
    all_y = []
    for name in all_files:
        features = pd.read_csv(PJOIN(DATA_DIR, "X_" + name), header=None)
        labels = pd.read_csv(PJOIN(DATA_DIR, "Y_" + name), header=None)
        if len(features) != len(labels):
            raise ValueError(
                "row count mismatch for %r: %d feature rows vs %d label rows"
                % (name, len(features), len(labels)))
        # Convert both halves to plain arrays so the lists stay homogeneous.
        all_x.append(np.array(features))
        all_y.append(np.array(labels))

    all_x = np.concatenate(all_x)
    all_y = np.concatenate(all_y)
    print(all_x.shape,all_y.shape)
    # Labels arrive as an (n, 1) column; flatten to (n,) for scikit-learn.
    all_y = all_y.reshape(all_y.shape[0])
    return all_x, all_y

In [53]:
def normalize_data(x):
    """Standardise every column of ``x`` to zero mean and unit variance.

    Args:
        x: 2-D array-like with samples along axis 0.

    Returns:
        ``(x - mean) / std`` computed column-wise.  A column whose standard
        deviation is zero (all values identical) is mapped to zeros instead
        of the NaN/inf that dividing by zero would produce.
    """
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    # Constant columns have std == 0; dividing by 1 leaves their centred
    # value (0) untouched and avoids poisoning the data with NaNs.
    safe_std = np.where(std == 0, 1.0, std)
    return (x - mean) / safe_std

In [54]:
# Load the stacked feature/label arrays, then standardise the features
# column-wise before any model sees them.
train_x, train_y = get_all_training_data()
train_x = normalize_data(train_x)
print(train_x.shape, train_y.shape)

# Class balance: how many samples carry label 1, 2 and 3 respectively.
print(*[np.sum(train_y == label) for label in (1, 2, 3)])

(3809, 12) (3809, 1)
(3809, 12) (3809,)
1483 401 1925


In [113]:
def runSVM(x, y):
    """Fit a linear SVM on a 90/10 hold-out split and predict the held-out part.

    Args:
        x: feature matrix, one sample per row.
        y: labels aligned with the rows of ``x``.

    Returns:
        (predictions, y_test): predicted and true labels of the 10% of the
        samples that were held out (split seeded with ``random_state=0``).
    """
    print("Running SVM....")
    # train_test_split comes from sklearn.model_selection; the old
    # sklearn.cross_validation module no longer exists in current releases.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, random_state=0)
    clf = svm.SVC(kernel='linear', C=1, max_iter=1000000)
    clf.fit(x_train, y_train)
    return clf.predict(x_test), y_test

In [112]:
# Hold-out evaluation of the linear SVM.
preds, test = runSVM(train_x, train_y)

# Per-class precision / recall / F-score / support, followed by accuracy.
print(precision_recall_fscore_support(test, preds))
print(accuracy_score(test, preds))

# Number of samples with label 1, 2, 3: ground truth first, predictions second.
print(*[np.sum(test == label) for label in (1, 2, 3)])
print(*[np.sum(preds == label) for label in (1, 2, 3)])

Running SVM....




ValueError: too many values to unpack (expected 2)

In [12]:
def runDecisionTree(x, y, depths=range(3, 20), cv=9, n_jobs=4):
    """Cross-validate decision trees of increasing maximum depth.

    For every depth a fresh ``DecisionTreeClassifier`` is scored with
    ``cross_val_score`` on the full data set and the mean fold score is
    printed.

    Args:
        x: feature matrix, one sample per row.
        y: labels aligned with the rows of ``x``.
        depths: iterable of ``max_depth`` values to try (default 3..19).
        cv: number of cross-validation folds.
        n_jobs: number of parallel jobs handed to ``cross_val_score``.

    Returns:
        List of ``(depth, mean_score)`` tuples in the order the depths were
        evaluated.
    """
    print ("Running Decision Tree....")
    # No hold-out split is needed here: cross_val_score does its own
    # partitioning of the full (x, y).
    results = []
    for max_depth in depths:
        clf = tree.DecisionTreeClassifier(max_depth=max_depth)
        scores = cross_val_score(estimator=clf, X=x, y=y, cv=cv, n_jobs=n_jobs)
        mean_score = scores.mean()
        results.append((max_depth, mean_score))
        # Print the score itself, not the (depth, score) tuple.
        print("Depth: ", max_depth, "Score: ", mean_score)
    return results

In [13]:
# Depth sweep for the decision tree; the progress lines are printed by the
# function itself, and the trailing semicolon stops the cell from echoing
# whatever the call returns.
runDecisionTree(x=train_x, y=train_y);

Running Decision Tree....
Depth:  3 Score:  (3, 0.7932195567574175)
Depth:  4 Score:  (4, 0.7977316840052764)
Depth:  5 Score:  (5, 0.7882743870724094)
Depth:  6 Score:  (6, 0.7862990785424604)
Depth:  7 Score:  (7, 0.785309900449658)
Depth:  8 Score:  (8, 0.7757166094238281)
Depth:  9 Score:  (9, 0.7776888871280684)
Depth:  10 Score:  (10, 0.7745785572428616)
Depth:  11 Score:  (11, 0.7745837635381051)
Depth:  12 Score:  (12, 0.7768405420126813)
Depth:  13 Score:  (13, 0.7779707354180116)
Depth:  14 Score:  (14, 0.7783958981742659)
Depth:  15 Score:  (15, 0.7789665440470219)
Depth:  16 Score:  (16, 0.7796742535698135)
Depth:  17 Score:  (17, 0.7761450390376046)
Depth:  18 Score:  (18, 0.7723416853988806)
Depth:  19 Score:  (19, 0.7750239890447346)


In [103]:
def run_ann(x,y):
    """Fit a two-hidden-layer MLP on a 90/10 hold-out split and predict the rest.

    Args:
        x: feature matrix, one sample per row.
        y: labels aligned with the rows of ``x``.

    Returns:
        (predictions, y_test): predicted and true labels of the 10% of the
        samples that were held out (split seeded with ``random_state=0``).
    """
    print("Running ANN....")
    # train_test_split comes from sklearn.model_selection; the old
    # sklearn.cross_validation module no longer exists in current releases.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, random_state=0)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(20, 8), random_state=1)
    clf.fit(x_train, y_train)
    return clf.predict(x_test), y_test

In [104]:
# Hold-out evaluation of the MLP classifier.
preds, test = run_ann(train_x, train_y)

# Per-class precision / recall / F-score / support, followed by accuracy.
print(precision_recall_fscore_support(test, preds))
print(accuracy_score(test, preds))

# Number of samples with label 1, 2, 3: ground truth first, predictions second.
print(*[np.sum(test == label) for label in (1, 2, 3)])
print(*[np.sum(preds == label) for label in (1, 2, 3)])

Running ANN....
(array([0.80263158, 1.        , 0.68421053]), array([0.97181373, 0.31428571, 0.23214286]), array([0.87915743, 0.47826087, 0.34666667]), array([816,  35, 224]))
0.7962790697674419
816 35 224
988 11 76


In [70]:
# Number of entries in `test` equal to 1. `test` is reassigned by every
# evaluation cell, so the value shown depends on which of them ran last.
np.sum(test==1)

773

In [16]:
def runSVM_CV(x, y, cv=5, random_state=None):
    """Cross-validated predictions of a linear SVM on shuffled data.

    Args:
        x: feature array, one sample per row (indexed with an integer
            permutation, so it must support NumPy fancy indexing).
        y: label array aligned with ``x``.
        cv: number of folds passed to ``cross_val_predict``.
        random_state: seed for the shuffle.  ``None`` (default) draws from
            NumPy's global random state, exactly as before; pass an int to
            make the shuffle, and therefore the folds, reproducible.

    Returns:
        (predictions, y_shuffled): out-of-fold predictions and the true
        labels in the same shuffled order.
    """
    print("Running SVM....")
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Shuffle once so folds are not tied to the order the files were stacked.
    perm = rng.permutation(len(x))
    x = x[perm]
    y = y[perm]
    clf = svm.SVC(kernel='linear', C=1, max_iter=1000000)
    return cross_val_predict(clf, x, y, cv=cv), y

In [17]:
# Cross-validated evaluation of the linear SVM over the whole data set.
preds, test = runSVM_CV(train_x, train_y)

# Per-class precision / recall / F-score / support, followed by accuracy.
print(precision_recall_fscore_support(test, preds))
print(accuracy_score(test, preds))

# Number of samples with label 1, 2, 3: ground truth first, predictions second.
print(*[np.sum(test == label) for label in (1, 2, 3)])
print(*[np.sum(preds == label) for label in (1, 2, 3)])

Running SVM....
(array([0.58740301, 0.98461538, 0.71989697]), array([0.86783547, 0.159601  , 0.58077922]), array([0.7005988 , 0.27467811, 0.64289822]), array([1483,  401, 1925]))
0.6482016277238121
1483 401 1925
2191 65 1553


In [14]:
def run_ann_CV(x, y, cv=5, random_state=None):
    """Cross-validated predictions of a two-hidden-layer MLP on shuffled data.

    Args:
        x: feature array, one sample per row (indexed with an integer
            permutation, so it must support NumPy fancy indexing).
        y: label array aligned with ``x``.
        cv: number of folds passed to ``cross_val_predict``.
        random_state: seed for the shuffle.  ``None`` (default) draws from
            NumPy's global random state, exactly as before; pass an int to
            make the shuffle, and therefore the folds, reproducible.  The
            classifier itself is always seeded with ``random_state=1``.

    Returns:
        (predictions, y_shuffled): out-of-fold predictions and the true
        labels in the same shuffled order.
    """
    print("Running ANN....")
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Shuffle once so folds are not tied to the order the files were stacked.
    perm = rng.permutation(len(x))
    x = x[perm]
    y = y[perm]
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(20, 8), random_state=1)
    return cross_val_predict(clf, x, y, cv=cv), y

In [15]:
# Cross-validated evaluation of the MLP classifier over the whole data set.
preds, test = run_ann_CV(train_x, train_y)

# Per-class precision / recall / F-score / support, followed by accuracy.
print(precision_recall_fscore_support(test, preds))
print(accuracy_score(test, preds))

# Number of samples with label 1, 2, 3: ground truth first, predictions second.
print(*[np.sum(test == label) for label in (1, 2, 3)])
print(*[np.sum(preds == label) for label in (1, 2, 3)])

Running ANN....
(array([0.66391022, 0.65326633, 0.73656755]), array([0.75792313, 0.32418953, 0.73350649]), array([0.70780856, 0.43333333, 0.73503384]), array([1483,  401, 1925]))
0.699921239170386
1483 401 1925
1693 199 1917


In [31]:
# Out-of-fold predictions for both models; each call shuffles the data
# itself, so each model gets its own matching (preds, truth) pair.
preds_svm, test_svm = runSVM_CV(train_x, train_y)
preds_ann, test_ann = run_ann_CV(train_x, train_y)

# pr_*  : per-class precision / recall / F-score / support
# mpr_* : the same metrics micro-averaged over all classes
pr_svm = precision_recall_fscore_support(test_svm, preds_svm)
mpr_svm = precision_recall_fscore_support(test_svm, preds_svm, average='micro')
pr_ann = precision_recall_fscore_support(test_ann, preds_ann)
mpr_ann = precision_recall_fscore_support(test_ann, preds_ann, average='micro')

Running SVM....
Running ANN....


In [32]:
# Write a side-by-side SVM vs. ANN summary of the cross-validated results.
OUTPUT_FILE_NAME = "ANALYSIS/summary_cal_handcrafted.csv"

# newline='' is what the csv module asks for on files it writes to; without
# it, platforms that translate line endings emit a blank line after each row.
with open(OUTPUT_FILE_NAME, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Feature","SVM","ANN"])

    writer.writerow([])
    for label in (1, 2, 3):
        writer.writerow(["Num True Label %d" % label,
                         np.sum(test_svm == label), np.sum(test_ann == label)])

    writer.writerow([])
    for label in (1, 2, 3):
        writer.writerow(["Num Predicted Label %d" % label,
                         np.sum(preds_svm == label), np.sum(preds_ann == label)])

    writer.writerow([])
    writer.writerow(["Accuracy",accuracy_score(test_svm,preds_svm),accuracy_score(test_ann, preds_ann)])
    writer.writerow(["Micro Precision",mpr_svm[0],mpr_ann[0]])
    writer.writerow(["Micro Recall",mpr_svm[1],mpr_ann[1]])
    writer.writerow(["Micro F1",mpr_svm[2],mpr_ann[2]])

    # pr_svm / pr_ann are (precision, recall, fscore, support) tuples whose
    # members are arrays with one entry per class, so the FIRST index selects
    # the metric and the SECOND the class.  The class position label - 1
    # assumes labels 1, 2 and 3 all occur in the data (classes are reported
    # in sorted order) -- verify if the label set ever changes.
    for metric_idx, metric_name in enumerate(("Precision", "Recall", "F1")):
        writer.writerow([])
        for label in (1, 2, 3):
            writer.writerow(["%s for Label %d" % (metric_name, label),
                             pr_svm[metric_idx][label - 1],
                             pr_ann[metric_idx][label - 1]])

# Clustering

In [55]:
# Partition the training features into four clusters; the fixed seed keeps
# the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(train_x)

In [56]:
# Cluster assignments stored on the fitted KMeans model.
lbls = kmeans.labels_

In [57]:
# Size of each of the four clusters (cluster ids 0..3).
tuple(np.sum(lbls == cluster) for cluster in range(4))

(3649, 2, 7, 151)