In [1]:
import csv
from lib.import_lang_data import *
from lib.lang_featurizers import *
# import glob
# import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

In [2]:
# read data into lang_data, lang_results
languages = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'ghc' 'java', 'javascript',
             'ocaml', 'perl', 'php', 'hack', 'py', 'python3', 'jruby', 'yarv', 'rb',
             'scala', 'racket']
lang_data, lang_results = read_polyglot(languages)
lang_info = pd.DataFrame(lang_results)
lang_info[0].value_counts()

gcc           58
scala         43
csharp        41
yarv          39
clojure       38
python3       36
ocaml         35
jruby         34
perl          34
sbcl          34
php           29
racket        29
hack          26
javascript    25
c              1
dtype: int64

In [3]:
lang_info = match_extensions(lang_info)
lang_results = list(lang_info[0])
lang_info[0].value_counts()

Ruby           73
C              59
PHP            55
Scala          43
C#             41
Clojure        38
Python         36
OCaml          35
Common Lisp    34
Perl           34
Scheme         29
JavaScript     25
dtype: int64

In [4]:
lang_featurizer = make_union(
    BagOfWordsFeaturizer(20),
    FunctionFeaturizer(num_nil,
                       num_nil_caps,
                       num_null,
                       num_none,
                       num_start_double_semicolons,
                       num_start_hashes,
                       num_bar_hash,
                       percentage_of_punctuation)
)

X_train, X_test, y_train, y_test = train_test_split(lang_data, lang_results)

pipe = make_pipeline(lang_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.80158730158730163

In [5]:
print(classification_report(pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

          C       0.56      0.75      0.64        12
         C#       1.00      0.56      0.71        18
    Clojure       0.86      0.86      0.86         7
Common Lisp       0.88      1.00      0.93         7
 JavaScript       0.60      0.60      0.60         5
      OCaml       1.00      1.00      1.00        10
        PHP       0.86      0.71      0.77        17
       Perl       1.00      0.71      0.83         7
     Python       0.83      1.00      0.91        10
       Ruby       1.00      0.95      0.97        19
      Scala       0.36      0.62      0.45         8
     Scheme       0.86      1.00      0.92         6

avg / total       0.85      0.80      0.81       126



## Now to test with the assignment tests

In [6]:
def read_tests():
    X = []
    y = None
    files = glob.glob('test/*')
    for file in files:
        with open(file) as f:
            X.append(f.read())

    with open('test.csv') as f:
        y = f.read()
    
    inter = y.split('\n')
    
    y_out = []
    for pair in inter:
        y_out.append(pair.split(','))
    
    y_out = list(pd.DataFrame(y_out).pop(1))[:-1]
    
    y_df = match_extensions(pd.DataFrame(y_out))
    y_out = list(y_df[0])
    
    return X, y_out

In [7]:
X_assignment_test, y_assignment_test = read_tests()

In [8]:
pipe.score(X_assignment_test, y_assignment_test)

0.03125

In [9]:
print(classification_report(pipe.predict(X_assignment_test), y_assignment_test))

             precision    recall  f1-score   support

          C       0.00      0.00      0.00         1
         C#       0.00      0.00      0.00         9
    Clojure       0.00      0.00      0.00         0
    Haskell       0.00      0.00      0.00         0
       Java       0.00      0.00      0.00         0
 JavaScript       0.25      0.08      0.12        12
      OCaml       0.00      0.00      0.00         4
        PHP       0.00      0.00      0.00         4
     Python       0.00      0.00      0.00         0
       Ruby       0.00      0.00      0.00         0
      Scala       0.00      0.00      0.00         2
     Scheme       0.00      0.00      0.00         0
        TCL       0.00      0.00      0.00         0

avg / total       0.09      0.03      0.05        32



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
