In [25]:
import re
import numpy as np
import math
import os
import random
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV

In [13]:
BASE_DIR = "C:/Users/hadri/Documents/GitHub/crypto_code_detection/data/"


crypto_library_df = pd.read_json(BASE_DIR + "crypto-library/crypto_library_data.json")
crypto_competitions_df = pd.read_json(BASE_DIR + "crypto_competitions/crypto_competitions_data.json")
code_jam_df = pd.read_json(BASE_DIR + "code-jam/code-jam_data.json")
others_df = pd.read_json(BASE_DIR + "others/others_data.json")

full_df = pd.concat([crypto_library_df, crypto_competitions_df, 
                     code_jam_df, others_df], 
                    ignore_index=True)

def transform_df(df): 
    """This transform the df formed with the json into the format we want, 
    which is file_name and content as two separate columns instead of being
    unites under data"""
    df['file_name'] = [df.data[i]['file_name'] for i in range(df.shape[0])]
    df['is_header'] = [df.file_name[i].split('.')[-1][0] == 'h' for i in range(df.shape[0])]
    df['content'] = [df.data[i]['content'] for i in range(df.shape[0])]
    df.drop('data', axis=1, inplace=True)
    return(df)
    
full_df = transform_df(full_df)

In [14]:
X=full_df[['data_source', 'file_name', 'is_header', 'content']]
Y=full_df[['label']]
xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [15]:
file = open('feature_tuple_list.txt','r') 
F=file.read()
feature_tuple_list=list(map(lambda x : tuple(x.split(';')), F.split('\n')))[:-1] #remove trailing line
feature_names=list(map(lambda x : x[0].strip() , feature_tuple_list))

In [16]:
def build_vector(row,feature_tuple_list=None):
    if feature_tuple_list is None :
        file = open('feature_tuple_list.txt','r') 
        F=file.read()
        feature_tuple_list=list(map(lambda x : tuple(x.split(';')), F.split('\n')))[:-1] #remove trailing line
    list_of_lines=row[3].split('\n')
    feature_counts=[0]*len(feature_tuple_list)
    feature_list=[f[1] for f in feature_tuple_list]
    for line in list_of_lines : 
        for i,f in enumerate(feature_list) :
            #need to strip to remove spaces + lowercase to match on our lowercase expression
            if re.search(f.strip(), line.lower()) : feature_counts[i]+=len(re.findall(f.strip(), line.lower()))
                
    return feature_counts

In [17]:
xTrain_vec=xTrain.apply(lambda row: build_vector(row), axis=1)
xTrain_vec_df=xTrain_vec.to_frame()
xTrain_vec_df.columns=['vec']
xTrain_vec_df=xTrain_vec_df.vec.apply(pd.Series)
xTrain_vec_df.columns=feature_names
#for xgboost :
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
xTrain_vec_df.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in xTrain_vec_df.columns.values]

In [39]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2),
 'n_estimators': range(10,200,10)
 
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( max_depth=5, learning_rate=0.2,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='recall',iid=False, cv=5, verbose=1)
gsearch1.fit(xTrain_vec_df, yTrain)

Fitting 5 folds for each of 228 candidates, totalling 1140 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done 1140 out of 1140 | elapsed: 61.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.2, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=27, silent=None,
                                     subsample=0.8, verbosity=1),
             iid=False, n_jobs=None,
             param_grid={'max_depth': range(3, 10, 2),
                         'min_child_weight': range(1, 6, 2),
                         'n_estimators': range(10,

In [18]:
model = XGBClassifier()
model.fit(xTrain_vec_df, yTrain)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [19]:
xTest_vec=xTest.apply(lambda row: build_vector(row), axis=1)
xTest_vec_df=xTest_vec.to_frame()
xTest_vec_df.columns=['vec']
xTest_vec_df=xTest_vec_df.vec.apply(pd.Series)
xTest_vec_df.columns=feature_names
#for xgboost :
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
xTest_vec_df.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in xTest_vec_df.columns.values]

In [20]:
yPred = model.predict(xTest_vec_df)

In [21]:
M=confusion_matrix(yTest,yPred.reshape(np.array(yTest).shape))

In [27]:
print('confusion matrix : ')
print('               ')
print('      predicted value')
print('actual', list(M[0]))
print('value ', list(M[1]))
print('              ')
print('Recall for positives = TP/(TP+FN) = ' + str(round(100*M[1][1]/(M[1][1]+M[1][0]),2)) + '%' + ' = ' + 'Conditional probability that we output 1 when the file is indeed crypto')
print('Recall for negatives = TN/(TN+FP) = '+ str(round(100*M[0][0]/(M[0][0]+M[0][1]),2)) + '%')
print('Precision for both classes = (TP+TN)/(all) = ' + str(round(100*(M[1][1]+M[0][0])/(sum([M[i][j] for i in [0,1] for j in [0,1]])),2)) + '%')

confusion matrix : 
               
      predicted value
actual [1896, 18]
value  [61, 175]
              
Recall for positives = TP/(TP+FN) = 74.15% = Conditional probability that we output 1 when the file is indeed crypto
Recall for negatives = TN/(TN+FP) = 99.06%
Precision for both classes = (TP+TN)/(all) = 96.33%


In [63]:
xTest_crypto=xTest[(xTest.truelabel==1)]

In [64]:
xTest_crypto_FN=xTest_crypto[xTest_crypto.predlabel==0]

In [66]:
xTest_crypto_FN.to_csv('FN_model_1.csv')

In [67]:
xTest_crypto_FN

Unnamed: 0,data_source,file_name,is_header,content,truelabel,predlabel
124,crypto_library,files/nettle/memops.h,True,/* memops.h\n\n Copyright (C) 2016 Niels Möl...,1,0
1237,crypto_competitions,catena.c,False,#include <string.h>\n#include <stdio.h>\n#incl...,1,0
154,crypto_library,files/nettle/rsa-pss-sha512-verify.c,False,/* rsa-pss-sha512-verify.c\n\n Verifying sig...,1,0
272,crypto_library,files/nettle/chacha-set-key.c,False,/* chacha-set-key.c\n\n Copyright (C) 2014 N...,1,0
247,crypto_library,files/nettle/ecc-point.c,False,"/* ecc-point.c\n\n Copyright (C) 2013, 2014 ...",1,0
...,...,...,...,...,...,...
196,crypto_library,files/nettle/ripemd160-internal.h,True,/* ripemd160-internal.h\n\n RIPEMD-160 hash ...,1,0
1163,crypto_competitions,blake2-kat.h,True,/*\n BLAKE2 reference source code package - ...,1,0
639,crypto_library,files/libgcrypt/mac-hmac.c,False,/* mac-hmac.c - HMAC glue for MAC API\n * Co...,1,0
146,crypto_library,files/nettle/mini-gmp.h,True,"/* mini-gmp, a minimalistic implementation of ...",1,0
