In [105]:
import re
import numpy as np
import math
import os
import random
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [7]:
# Root folder that holds the JSON dumps. It can be overridden with the
# CRYPTO_DETECTION_DATA_DIR environment variable so the notebook is not tied
# to a single machine; the original location remains the default.
BASE_DIR = os.environ.get(
    "CRYPTO_DETECTION_DATA_DIR",
    "C:/Users/hadri/Documents/GitHub/crypto_code_detection/data/",
)
# Paths below are built by plain string concatenation, so make sure the
# folder ends with a separator even when it comes from the environment.
if not BASE_DIR.endswith(("/", "\\")):
    BASE_DIR += "/"


# One JSON file per data source.
crypto_library_df = pd.read_json(BASE_DIR + "crypto-library/crypto_library_data.json")
crypto_competitions_df = pd.read_json(BASE_DIR + "crypto_competitions/crypto_competitions_data.json")
code_jam_df = pd.read_json(BASE_DIR + "code-jam/code-jam_data.json")
others_df = pd.read_json(BASE_DIR + "others/others_data.json")

# Stack the four sources into one frame with a fresh 0..n-1 index.
full_df = pd.concat(
    [crypto_library_df, crypto_competitions_df, code_jam_df, others_df],
    ignore_index=True,
)

def _has_header_extension(file_name):
    """Return True when the text after the last '.' of file_name starts with 'h'."""
    _, dot, extension = file_name.rpartition('.')
    # No dot at all means there is no extension; an empty extension ("name.")
    # simply fails the startswith check instead of raising.
    return bool(dot) and extension.startswith('h')


def transform_df(df):
    """Flatten the nested ``data`` column into separate top-level columns.

    Every entry of ``df['data']`` is read as a mapping with the keys
    ``'file_name'`` and ``'content'``. Three columns are added:

    * ``file_name`` -- the entry's ``'file_name'`` value,
    * ``is_header`` -- True when the file extension starts with ``'h'``,
    * ``content``   -- the entry's ``'content'`` value.

    The ``data`` column is then dropped. The frame is modified in place and
    also returned, so ``df = transform_df(df)`` and a bare call both work.

    Rows are processed by position, so the frame does not need a default
    0..n-1 index.
    """
    records = df['data'].tolist()
    file_names = [record['file_name'] for record in records]

    # Plain lists are assigned positionally, independent of the index labels.
    df['file_name'] = file_names
    df['is_header'] = [_has_header_extension(name) for name in file_names]
    df['content'] = [record['content'] for record in records]

    df.drop('data', axis=1, inplace=True)
    return df
    
# Replace the nested 'data' column with file_name / is_header / content columns.
# NOTE: transform_df drops 'data' from the frame it receives, so this cell
# cannot be re-run on its own; re-run the loading code above first.
full_df = transform_df(full_df)

In [14]:
# Feature side: the descriptive columns plus the raw file content.
X = full_df.loc[:, ['data_source', 'file_name', 'is_header', 'content']]
# Target side, kept as a one-column DataFrame.
Y = full_df.loc[:, ['label']]

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [68]:
# Substrings looked for in every line, in the order of the output vector:
# '>>', '^', '<<', then the crypto-related keywords.
_KEYWORD_MARKERS = (
    '>>', '^', '<<', 'key', 'encrypt', 'decrypt', 'prime', 'symmetric',
    'cipher', 'crypto', 'password', 'bits', 'generator',
)

# Library imports in C and C++: "include", any two characters, then the
# library name. Compiled once instead of on every line of every file.
_INCLUDE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        'include..crypto', 'include..openssl', 'include..gnutls',
        'include..cryptlib', 'include..gcrypt', 'include..sodium',
        'include..nettle', 'include..nss', 'include..wolfssl',
        'include..mbedtls',
    )
)


def build_vector(row):
    """Turn one source file into a list of 23 integer features.

    Parameters
    ----------
    row : pandas.Series or sequence
        A Series carrying the file text under the label ``'content'``, or any
        positional sequence whose element 3 is the file text.

    Returns
    -------
    list of int
        The first 13 values follow ``_KEYWORD_MARKERS``, the last 10 follow
        ``_INCLUDE_PATTERNS``. Each value is the number of *lines* that
        contain the marker / match the pattern (a marker appearing twice on
        one line counts once).
    """
    if isinstance(row, pd.Series):
        # Look the text up by label: integer keys on a label-indexed Series
        # are a deprecated positional fallback in pandas.
        content = row['content'] if 'content' in row.index else row.iloc[3]
    else:
        content = row[3]

    keyword_counts = [0] * len(_KEYWORD_MARKERS)
    include_counts = [0] * len(_INCLUDE_PATTERNS)

    for line in content.split('\n'):
        for position, marker in enumerate(_KEYWORD_MARKERS):
            if marker in line:
                keyword_counts[position] += 1
        for position, pattern in enumerate(_INCLUDE_PATTERNS):
            if pattern.search(line):
                include_counts[position] += 1

    return keyword_counts + include_counts

In [82]:
# One feature list per training row (a Series of lists, indexed like xTrain).
xTrain_vec = xTrain.apply(build_vector, axis=1)

# Column names, in the order build_vector emits its values.
a = ['number of >>', 'number of ^', 'number of <<', 'number of mentions of key',
     'number of mentions of encrypt', 'number of mentions of decrypt', 'number of mentions of prime',
     'number of mentions of (a)symmetric', 'number of mentions of cipher', 'number of mentions of crypto',
     'number of mentions of password', 'number of mentions of bits', 'number of mentions of generator',
     'library import of cryptolib', 'library import of openssl', 'library import of gnutls',
     'library import of cryptlib', 'library import of gcrypt', 'library import of sodium',
     'library import of nettle', 'library import of nss', 'library import of wolfssl',
     'library import of mbedtls']

# Expand the lists into columns in a single constructor call;
# .apply(pd.Series) would build one Series object per row.
xTrain_vec_df = pd.DataFrame(xTrain_vec.tolist(), index=xTrain_vec.index, columns=a)

# for xgboost: replace '[', ']' and '<' in the feature names with '_'
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
xTrain_vec_df.columns = [regex.sub("_", col) for col in xTrain_vec_df.columns]

In [97]:
# Train an XGBoost classifier, constructed without arguments (library defaults),
# on the training feature matrix and its labels.
model = XGBClassifier()
# Kept as the cell's last expression so the notebook echoes the fitted estimator.
model.fit(xTrain_vec_df, yTrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [112]:
# One feature list per test row (a Series of lists, indexed like xTest).
xTest_vec = xTest.apply(build_vector, axis=1)

# Column names, in the order build_vector emits its values.
a = ['number of >>', 'number of ^', 'number of <<', 'number of mentions of key',
     'number of mentions of encrypt', 'number of mentions of decrypt', 'number of mentions of prime',
     'number of mentions of (a)symmetric', 'number of mentions of cipher', 'number of mentions of crypto',
     'number of mentions of password', 'number of mentions of bits', 'number of mentions of generator',
     'library import of cryptolib', 'library import of openssl', 'library import of gnutls',
     'library import of cryptlib', 'library import of gcrypt', 'library import of sodium',
     'library import of nettle', 'library import of nss', 'library import of wolfssl',
     'library import of mbedtls']

# Expand the lists into columns in a single constructor call;
# .apply(pd.Series) would build one Series object per row.
xTest_vec_df = pd.DataFrame(xTest_vec.tolist(), index=xTest_vec.index, columns=a)

# for xgboost: replace '[', ']' and '<' in the feature names with '_'
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
xTest_vec_df.columns = [regex.sub("_", col) for col in xTest_vec_df.columns]

In [114]:
# Display the test feature matrix to eyeball the extracted counts.
xTest_vec_df

Unnamed: 0,number of >>,number of ^,number of __,number of mentions of key,number of mentions of encrypt,number of mentions of decrypt,number of mentions of prime,number of mentions of (a)symmetric,number of mentions of cipher,number of mentions of crypto,...,library import of cryptolib,library import of openssl,library import of gnutls,library import of cryptlib,library import of gcrypt,library import of sodium,library import of nettle,library import of nss,library import of wolfssl,library import of mbedtls
10280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9703,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2853,2,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8145,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5186,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9649,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5271,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10597,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
# Predict a label for every row of the test feature matrix with the fitted model.
yPred = model.predict(xTest_vec_df)

In [117]:
# accuracy_score takes (y_true, y_pred). yTest is a one-column DataFrame and
# yPred comes straight from model.predict, so flatten both to 1-D instead of
# reshaping the predictions to match the frame.
accuracy = accuracy_score(np.ravel(yTest), np.ravel(yPred))
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 95.63%
