In [1]:
from bs4 import BeautifulSoup
from lib.import_lang_data import *
from lib.lang_featurizers import *
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier

In [2]:
# read in assignment's test files
X_assignment_test, y_assignment_test = read_tests()

# read corpus into lang_data, lang_results
languages = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'ghc' 'java', 'javascript',
             'ocaml', 'perl', 'php', 'hack', 'py', 'python3', 'jruby', 'yarv', 'rb',
             'scala', 'racket', 'tcl']
lang_data, lang_results = read_polyglot(languages)
lang_info = pd.DataFrame(lang_results)
lang_info[0].value_counts()

gcc           58
scala         43
csharp        41
yarv          39
clojure       38
python3       36
ocaml         35
perl          34
jruby         34
sbcl          34
racket        29
php           29
hack          26
javascript    25
tcl            9
c              1
dtype: int64

In [3]:
lang_info = match_extensions(lang_info)
lang_results = list(lang_info[0])
lang_info[0].value_counts()

Ruby           73
C              59
PHP            55
Scala          43
C#             41
Clojure        38
Python         36
OCaml          35
Perl           34
Common Lisp    34
Scheme         29
JavaScript     25
TCL             9
dtype: int64

In [4]:
lang_featurizer = make_union(
    BagOfWordsFeaturizer(52),
    FunctionFeaturizer(presence_nil,
                       presence_nil_caps,
                       presence_null,
                       presence_none,
#                        presence_start_double_semicolons,
#                        presence_start_hashes,
#                        presence_bar_hash,
                       presence_paren_define,
                       percent_start_and_end_parenthesis,
                       longest_run_of_parenthesis,
                       longest_run_of_curly_braces,
                       single_closing_braces_per_line,
                       presence_function_js,
                       presence_while,
                       presence_do,
                       presence_var,
                       presence_for_js,
                       presence_plus_equals,
                       presence_js_case_open_square,
#                        final_semicolons_per_line,
                       presence_void,
                       presence_public,
                       presence_bool,
                       presence_struct,
                       presence_new,
                       presence_this_dot,
#                        presence_int,
                       presence_module_line,
                       presence_extend_line,
                       presence_require_line,
#                        presence_end,
                       presence_multiple_end,
                       presence_def_no_colon,
                       presence_at,
                       presence_double_at,
#                        presence_puts,
#                        presence_puts_not_proc,
                       presence_elif,
                       presence_dot_times,
                       presence_paren_defn,
                       presence_paren_ns,
                       percent_consecutive_closing_paren,
                       presence_taskloop,
                       presence_runtask,
                       presence_from_import_line,
                       presence_import_line,
                       presence_print_paren,
                       presence_dot_join,
                       presence_dot_format,
                       presence_dot_values,
                       presence_dunder_name,
                       presence_dunder_init,
                       presence_def_colon,
                       presence_let,
#                        presence_snake_case,
                       presence_naked_colon,
                       presence_naked_lt_minus,
                       percent_dollar_lower,
                       presence_dollar_minus_gt,
                       presence_function_php,
                       presence_gt_question,
                       presence_elseif,
                       presence_proc,
                       percent_curly_braces,
                       )
)

X_train, X_test, y_train, y_test = train_test_split(lang_data, lang_results)

pipe = make_pipeline(lang_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
print('R^2 score: {}\n'.format(pipe.score(X_test, y_test)))
print(classification_report(y_test, pipe.predict(X_test)))

R^2 score: 0.984375

             precision    recall  f1-score   support

          C       1.00      1.00      1.00        12
         C#       1.00      1.00      1.00        11
    Clojure       1.00      1.00      1.00         7
Common Lisp       1.00      1.00      1.00         6
 JavaScript       1.00      0.83      0.91         6
      OCaml       1.00      1.00      1.00        11
        PHP       0.94      1.00      0.97        16
       Perl       1.00      0.89      0.94         9
     Python       1.00      1.00      1.00        12
       Ruby       0.93      1.00      0.97        14
      Scala       1.00      1.00      1.00        14
     Scheme       1.00      1.00      1.00         6
        TCL       1.00      1.00      1.00         4

avg / total       0.99      0.98      0.98       128



## Now to test with the assignment's tests

In [5]:
print('R^2 score: {}\n'.format(pipe.score(X_assignment_test, y_assignment_test)))
print(classification_report(y_assignment_test, pipe.predict(X_assignment_test)))
print(confusion_matrix(y_assignment_test, pipe.predict(X_assignment_test)))

R^2 score: 0.59375

             precision    recall  f1-score   support

          C       0.00      0.00      0.00         0
    Clojure       1.00      0.25      0.40         4
Common Lisp       0.00      0.00      0.00         0
    Haskell       0.00      0.00      0.00         3
       Java       0.00      0.00      0.00         2
 JavaScript       1.00      1.00      1.00         4
      OCaml       0.50      1.00      0.67         2
        PHP       0.00      0.00      0.00         3
       Perl       0.00      0.00      0.00         0
     Python       1.00      0.75      0.86         4
       Ruby       0.38      1.00      0.55         3
      Scala       1.00      0.50      0.67         2
     Scheme       1.00      1.00      1.00         3
        TCL       0.67      1.00      0.80         2

avg / total       0.64      0.59      0.56        32

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 2 0 0 0 1 0 0 0]
 [0 0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Conclusion

It is not easy to accurately identify the programming language used to create a snippet. It takes checking for several specific syntax structures for each language. I did not have time to write enough of these to sufficiently identify the assignment's test snippets, but I was able to get my training set to very accurately identify its test set.