In [14]:
import re
import numpy as np
import math
import os
import random
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
BASE_DIR = "C:/Users/hadri/Documents/GitHub/crypto_code_detection/data/"


crypto_library_df = pd.read_json(BASE_DIR + "crypto-library/crypto_library_data.json")
crypto_competitions_df = pd.read_json(BASE_DIR + "crypto_competitions/crypto_competitions_data.json")
code_jam_df = pd.read_json(BASE_DIR + "code-jam/code-jam_data.json")
others_df = pd.read_json(BASE_DIR + "others/others_data.json")

full_df = pd.concat([crypto_library_df, crypto_competitions_df, 
                     code_jam_df, others_df], 
                    ignore_index=True)

def transform_df(df): 
    """This transform the df formed with the json into the format we want, 
    which is file_name and content as two separate columns instead of being
    unites under data"""
    df['file_name'] = [df.data[i]['file_name'] for i in range(df.shape[0])]
    df['is_header'] = [df.file_name[i].split('.')[-1][0] == 'h' for i in range(df.shape[0])]
    df['content'] = [df.data[i]['content'] for i in range(df.shape[0])]
    df.drop('data', axis=1, inplace=True)
    return(df)
    
full_df = transform_df(full_df)

In [3]:
X=full_df[['data_source', 'file_name', 'is_header', 'content']]
Y=full_df[['label']]
xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [128]:
file = open('feature_tuple_list.txt','r') 
F=file.read()
feature_tuple_list=list(map(lambda x : tuple(x.split(';')), F.split('\n')))[:-1]

In [123]:
def build_vector(row,feature_tuple_list=None):
    if feature_tuple_list is None :
        file = open('feature_tuple_list.txt','r') 
        F=file.read()
        feature_tuple_list=list(map(lambda x : tuple(x.split(';')), F.split('\n')))[:-1]
    list_of_lines=row[3].split('\n')
    feature_counts=[0]*len(feature_tuple_list)
    
    for line in list_of_lines : 
        for i,f in enumerate(feature_tuple_list) :
            if re.search(f[1], line.lower()) : feature_counts[i]+=1
                
    return feature_counts

In [130]:
file = open('feature_tuple_list.txt','r') 
F=file.read()
feature_names=list(map(lambda x : tuple(x.split(';'))[0], F.split('\n')))

In [134]:
list_of_lines=xTrain.iloc[0][3].split('\n')

In [135]:
list_of_lines

['// Problem A',
 '',
 '#include <iostream>',
 '#include <fstream>',
 '#include <sstream>',
 '#include <string>',
 '#include <vector>',
 '#include <queue>',
 '#include <stack>',
 '#include <algorithm>',
 '#include <map>',
 '#include <cstdio>',
 '#include <cstdlib>',
 '#include <set>',
 '',
 'using namespace std;',
 '',
 'int main () {',
 '',
 '    ifstream in("A.in");',
 '    ofstream out("A.out");',
 '    int cases;',
 '    in >> cases;',
 '    long long n, A, B, C, D, x0, y0, M;',
 '    for (int c = 0; c < cases; c++) {',
 '        in >> n >> A >> B >> C >> D >> x0 >> y0 >> M;',
 '        vector <long long> XX, YY;',
 '        int X = x0;',
 '        int Y = y0;',
 '        XX.push_back(X);',
 '        YY.push_back(Y);',
 '        for (int i = 1; i <= n - 1; i++) {',
 '            X = (A * X + B) % M;',
 '            Y = (C * Y + D) % M;',
 '            XX.push_back(X);',
 '            YY.push_back(Y);',
 '        }',
 '        int count = 0;',
 '        for (int i = 0; i < XX.size()

In [132]:
xTrain

Unnamed: 0,data_source,file_name,is_header,content
4549,code-jam_,code-jam_32017_akercito_24437_0_extracted_main...,False,// Problem A\n\n#include <iostream>\n#include ...
3910,code-jam_,code-jam_32017_snguyen_24437_0_extracted_A1.cpp,False,#include <vector>\n#include <list>\n#include <...
6747,others,BAXMLParserBase.h,True,/*\n Copyright 2011 Dmitry Stadnik. All rights...
1427,code-jam_,code-jam_5224486_rng..58_5733089514881024_0_ex...,False,#include <iostream>\n#include <sstream>\n#incl...
6925,others,CaptureThread.h,True,/*********************************************...
...,...,...,...,...
9225,others,pkgi_style.h,True,#pragma once\n\n#define VITA_WIDTH 960\n#defi...
4859,code-jam_,code-jam_32002_Vasyl_24446_0_extracted_C.cpp,False,#include <iostream>\n#include <sstream>\n#incl...
3264,code-jam_,code-jam_6224486_Rezwan4029_5670465267826688_1...,False,"/*\n Rezwan_4029 , AUST\n*/\n\n#include <bi..."
9845,others,SKLabelNode+HLLabelNodeAdditions.h,True,//\n// SKLabelNode+HLLabelNodeAdditions.h\n//...


In [136]:
for i,f in enumerate(feature_tuple_list) : print(i,f)

0 ('number of >> ', ' >>')
1 ('number of ^ ', ' ^')
2 ('number of << ', ' <<')
3 ('number of mentions of key ', ' key')
4 ('number of mentions of encrypt ', ' encrypt')
5 ('number of mentions of decrypt ', ' decrypt')
6 ('number of mentions of prime ', ' prime')
7 ('number of mentions of (a)symmetric ', ' symmetric')
8 ('number of mentions of cipher ', ' cipher')
9 ('number of mentions of crypto ', ' crypto')
10 ('number of mentions of password ', ' password')
11 ('nb of pwd ', ' pwd')
12 ('number of mentions of bits ', ' bits')
13 ('nb of byte ', ' byte')
14 ('number of mentions of generator ', ' generator')
15 ('number of mentions of hash ', ' hash')
16 ('number of mentions of salt ', ' salt')
17 ('number of mentions of garlic ', ' garlic')
18 ('nb of rsa ', ' rsa')
19 ('nb of signature ', ' signature')
20 ('nb of congruent ', ' congruent')
21 ('nb of shuffle ', ' shuffle')
22 ('nb of xor ', ' xor ')
23 ('nb of permutation ', ' permutation')
24 ('nb of nonce ', ' nonce')
25 ('nb of r

In [131]:
xTrain_vec=xTrain.apply(lambda row: build_vector(row), axis=1)
xTrain_vec_df=xTrain_vec.to_frame()
xTrain_vec_df.columns=['vec']
xTrain_vec_df=xTrain_vec_df.vec.apply(pd.Series)
xTrain_vec_df.columns=feature_names
#for xgboost :
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
xTrain_vec_df.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in xTrain_vec_df.columns.values]

IndexError: ('tuple index out of range', 'occurred at index 4549')

In [6]:
model = XGBClassifier()
model.fit(xTrain_vec_df, yTrain)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [7]:
xTest_vec=xTest.apply(lambda row: build_vector(row), axis=1)
xTest_vec_df=xTest_vec.to_frame()
xTest_vec_df.columns=['vec']
xTest_vec_df=xTest_vec_df.vec.apply(pd.Series)
xTest_vec_df.columns=feature_names
#for xgboost :
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
xTest_vec_df.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in xTest_vec_df.columns.values]

In [8]:
xTest_vec_df

Unnamed: 0,number of >>,number of ^,number of __,number of mentions of key,number of mentions of encrypt,number of mentions of decrypt,number of mentions of prime,number of mentions of (a)symmetric,number of mentions of cipher,number of mentions of crypto,...,library import of cryptolib,library import of openssl,library import of gnutls,library import of cryptlib,library import of gcrypt,library import of sodium,library import of nettle,library import of nss,library import of wolfssl,library import of mbedtls
10280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9703,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2853,2,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8145,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5186,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9649,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5271,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10597,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
yPred = model.predict(xTest_vec_df)

In [17]:
M=confusion_matrix(yTest,yPred.reshape(np.array(yTest).shape))

In [48]:
print('confusion matrix : ')
print('               ')
print('      predicted value')
print('actual', list(M[0]))
print('value ', list(M[1]))
print('              ')
print('Recall for positives = TP/(TP+FN) = ' + str(round(100*M[1][1]/(M[1][1]+M[1][0]),2)) + '%' + ' = ' + 'Conditional probability that we output 1 when the file is indeed crypto')
print('Recall for negatives = TN/(TN+FP) = '+ str(round(100*M[0][0]/(M[0][0]+M[0][1]),2)) + '%')
print('Precision for both classes = (TP+TN)/(all) = ' + str(round(100*(M[1][1]+M[0][0])/(sum([M[i][j] for i in [0,1] for j in [0,1]])),2)) + '%')

confusion matrix : 
               
      predicted value
actual [1898, 16]
value  [78, 158]
              
Recall for positives = TP/(TP+FN) = 66.95% = Conditional probability that we output 1 when the file is indeed crypto
Recall for negatives = TN/(TN+FP) = 99.16%
Precision for both classes = (TP+TN)/(all) = 95.63%


In [50]:
xTest

Unnamed: 0,data_source,file_name,is_header,content
10280,others,trace.cpp,False,/*\n Dynamic tracing and IDA integration\n ...
9703,others,SDL_stbimage.h,True,/*\n * A small header-only library to load an ...
2853,code-jam_,code-jam_2437488_InfinityBlue_2749486_0_extrac...,False,#include <iostream>\n#include <cstdio>\n#inclu...
8145,others,invert.cpp,False,#include <plugin.hpp>\n#include <output.hpp>\n...
8006,others,HPCA.h,True,\n#ifndef deepnl_HPCA_H\n#define deepnl_HPCA_H...
...,...,...,...,...
5186,code-jam_,code-jam_6224486_sts22_5686275109552128_0_extr...,False,"/*\n ""I have nothing to loose\n so,nothing t..."
9649,others,sasearch.c,False,/*\n * sasearch.c for libdivsufsort\n * Copyri...
5271,code-jam_,code-jam_1460488_sdya_1285485_0_extracted_Solu...,False,#include <iostream>\n#include <string>\n#inclu...
10597,others,WhiteRectangleDetector.cpp,False,// -*- mode:c++; tab-width:2; indent-tabs-mode...


In [51]:
yTest

Unnamed: 0,label
10280,0
9703,0
2853,0
8145,0
8006,0
...,...
5186,0
9649,0
5271,0
10597,0


In [49]:
yPred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [52]:
xTest['truelabel']=yTest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
xTest['predlabel']=yPred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [63]:
xTest_crypto=xTest[(xTest.truelabel==1)]

In [64]:
xTest_crypto_FN=xTest_crypto[xTest_crypto.predlabel==0]

In [66]:
xTest_crypto_FN.to_csv('FN_model_1.csv')

In [67]:
xTest_crypto_FN

Unnamed: 0,data_source,file_name,is_header,content,truelabel,predlabel
124,crypto_library,files/nettle/memops.h,True,/* memops.h\n\n Copyright (C) 2016 Niels Möl...,1,0
1237,crypto_competitions,catena.c,False,#include <string.h>\n#include <stdio.h>\n#incl...,1,0
154,crypto_library,files/nettle/rsa-pss-sha512-verify.c,False,/* rsa-pss-sha512-verify.c\n\n Verifying sig...,1,0
272,crypto_library,files/nettle/chacha-set-key.c,False,/* chacha-set-key.c\n\n Copyright (C) 2014 N...,1,0
247,crypto_library,files/nettle/ecc-point.c,False,"/* ecc-point.c\n\n Copyright (C) 2013, 2014 ...",1,0
...,...,...,...,...,...,...
196,crypto_library,files/nettle/ripemd160-internal.h,True,/* ripemd160-internal.h\n\n RIPEMD-160 hash ...,1,0
1163,crypto_competitions,blake2-kat.h,True,/*\n BLAKE2 reference source code package - ...,1,0
639,crypto_library,files/libgcrypt/mac-hmac.c,False,/* mac-hmac.c - HMAC glue for MAC API\n * Co...,1,0
146,crypto_library,files/nettle/mini-gmp.h,True,"/* mini-gmp, a minimalistic implementation of ...",1,0


In [70]:
for i in range(len(xTest_crypto_FN)) :
    print('***********************')
    print('file number {}'.format(i))
    print(xTest_crypto_FN.iloc[i][3])
    print('***********************')    

***********************
file number 0
/* memops.h

   Copyright (C) 2016 Niels Möller

   This file is part of GNU Nettle.

   GNU Nettle is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.

   GNU Nettle is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program