In [1]:
import os
import sys

import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
import shutil
from threading import Thread

from tqdm import tqdm

__dir__ = "/home/justpainm8/notebooks/ml/code_classif/"


def get_language_files(path, ext={"cpp": ("cpp", "hpp"), "py": ("py",), "pas": ("pas",)}):
    counts = {ex: 0 for ex in ext.keys()}
    for root, dirs, files in tqdm(os.walk(path)):
        for f in files:
            exdir = f.split('.')[-1]
            if exdir not in ext:
                continue
            for e in ext[exdir]:
                if f.endswith(e):
                    counts[exdir] += 1
                    shutil.copy(os.path.join(root, f), os.path.join(__dir__, exdir))
                    break
    return counts

In [3]:
counts = get_language_files("/home/justpainm8/notebooks/ml/code_classif/code")
counts

11636it [00:02, 4461.04it/s]


{'cpp': 14010, 'pas': 3502, 'py': 3603}

In [4]:
files = {ext: [] for ext in counts}
for fdir in counts:
    print(fdir)
    files[fdir] = os.listdir(fdir)
files

py
cpp
pas


{'cpp': ['t_5_007.cpp',
  'test_assert_fail_is_empty.cpp',
  'remove_varargs.cpp',
  'test_equal.cpp',
  't_2_004.cpp',
  'named_mutex_test.cpp',
  'log-formatter-test.cpp',
  'test_arithmetic_tommath.cpp',
  'perf_saxpy.cpp',
  'multifunction.cpp',
  'filt_attr.cpp',
  'perf_stl_set_union.cpp',
  'MsmComposite.cpp',
  'vector10_c.cpp',
  'test_has_template.cpp',
  'bfs-example.cpp',
  'test_assert_is_tuple.cpp',
  'mp_transform_q.cpp',
  'make_functions_test.cpp',
  'is_output_streamable_test.cpp',
  'comp_ellint_2.cpp',
  'books.cpp',
  'github_15.cpp',
  'lambda_tests1b2p.cpp',
  'test_erasure.cpp',
  'length_geo.cpp',
  'decl_exit_static_inv_none.cpp',
  'array_fail_spa_wpa_ma.cpp',
  'test_7868.cpp',
  'no_cxx17_inline_variables_pass.cpp',
  'test_shared_mutex_timed_locks.cpp',
  '11166-remove-race.cpp',
  'no_cxx98_binders_pass.cpp',
  'assign_points.cpp',
  'sf_sin_pi_incl_test.cpp',
  'EumlInternal.cpp',
  'test_typeof2.cpp',
  'unique_copy.cpp',
  'mpl_interop_test3.cpp',
  'a

In [5]:
def make_data_list(files):
    text, labels = [], []
    for f in files:
        text.append(np.array(files[f]))
        labels.append(np.array([f] * len(files[f])))
    return np.hstack(text), np.hstack(labels)


def as_one_hot(x, classes=sorted(list(counts.keys()))):
    return [1 if x == c else 0 for c in classes]


def as_class_label(x, classes=sorted(list(counts.keys()))):
    return np.array(as_one_hot(x, classes)).argmax()


def from_label_to_class(label, classes=sorted(list(counts.keys()))):
    return classes[int(label)]



data = make_data_list(files)
df = pd.DataFrame(index=None, columns=['source', 'target'])
df['source'], df['target'] = data
df['target'] = df['target'].apply(lambda x: as_class_label(x))
df

Unnamed: 0,source,target
0,datastructures.py,2
1,views_broken.py,2
2,darwin-4.2.1.py,2
3,compat.py,2
4,back_reference.py,2
5,deconstruct.py,2
6,exit_status.py,2
7,test_dictsortreversed.py,2
8,core_varnames.py,2
9,build.py,2


In [6]:
class FileLoader(object):
    
    def __init__(self, n, path=__dir__):
        self.n = n
        self.path = path
        self.tq = tqdm(range(n))
        
    def load(self, filename):
        path = os.path.join(self.path, filename.split('.')[-1])
        with open(os.path.join(path, filename), 'r', encoding='utf-8', errors="ignore") as f:
            self.tq.update()
            return "\n".join(f.readlines())
loader = FileLoader(len(df['source'].values))
df['source'] = df['source'].apply(loader.load)

 97%|█████████▋| 14418/14815 [00:01<00:00, 7081.82it/s] 

In [7]:
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


XGB_params = dict(
    n_estimators=128,
    n_jobs=8,
)

In [9]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df['source'].values, 
    df['target'].values, 
    test_size=0.3
)

In [12]:
from sklearn.pipeline import Pipeline


text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', xgb.XGBClassifier(**XGB_params))
])

In [13]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, text_clf.predict(X_train)))

0.9996142719382836


  if diff:


In [15]:
from sklearn.externals import joblib
s = joblib.dump(text_clf, 'xgb_text_clf_cpp_py_pas_0.99.pkl')

In [17]:
py_code = '''import os
if __name__ == "__main__":
    print(*range(10), sep='faggot')'''
cpp_code = '''#include <iostream>
<class T>
T doubleIt(T t) {
    return t * 2;
}

void main() {
    std::cout << "Fuck you" << " and also doubleIt(10) = " << doubleIt(10);
    return 0;
}
'''
pas_code = """const N = 4;
var
  f1: text;
  j,i,k,l,endline : integer;
  
begin
assign(f1,'D:\in.txt');
reset(f1);
  while not Eoln(f1) do begin
    Readln(f1,endline);
    //Считываем 4 раза по 4 элемента
    inc(j);
  end;
  close(f1);
end"""
code_samples = [py_code, cpp_code, pas_code]
for code in code_samples:
    print('|{}|\n'.format('-' * 20))
    print('\033[92m\n{}\n\033[0m'.format(code))
    proba, pred = text_clf.predict_proba([code]), text_clf.predict([code])
    print(' ' * 3, proba.max(), from_label_to_class(pred))
    print('|{}|\n\n'.format('-' * 20))

|--------------------|

[92m
import os
if __name__ == "__main__":
    print(*range(10), sep='faggot')
[0m


  if diff:


    0.9986695 py
|--------------------|


|--------------------|

[92m
#include <iostream>
<class T>
T doubleIt(T t) {
    return t * 2;
}

void main() {
    std::cout << "Fuck you" << " and also doubleIt(10) = " << doubleIt(10);
    return 0;
}

[0m


  if diff:


    0.97811574 cpp
|--------------------|


|--------------------|

[92m
const N = 4;
var
  f1: text;
  j,i,k,l,endline : integer;
  
begin
assign(f1,'D:\in.txt');
reset(f1);
  while not Eoln(f1) do begin
    Readln(f1,endline);
    //Считываем 4 раза по 4 элемента
    inc(j);
  end;
  close(f1);
end
[0m
    0.9886529 pas
|--------------------|




  if diff:
