In [2179]:
import glob
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import itertools
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
import re
from textblob import TextBlob
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from polyglot import *

In [2180]:
def read_code(directory, lang):
    """Load every benchmark source file for one language.

    Parameters
    ----------
    directory : str
        Benchmark sub-directory under ``benchmarks/benchmarksgame/bench``.
    lang : str
        Filename suffix to match (e.g. ``'.go'``); also used as the label.

    Returns
    -------
    list of (str, str)
        One ``(file_contents, lang)`` tuple per matching file, in the order
        ``glob.glob`` yields the paths.
    """
    pattern = 'benchmarks/benchmarksgame/bench/{}/*{}'.format(directory, lang)
    samples = []
    for path in glob.glob(pattern):
        with open(path) as handle:
            samples.append((handle.read(), lang))
    return samples

In [2181]:
# Filename suffixes of the benchmark sources to load; each suffix doubles as
# the class label for the files it matches.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python ('.hs' '.ifc' -> '.hs.ifc'), which would
# match no files and drop both languages from the training set.
# All suffixes carry the leading dot so the labels are uniform; the glob in
# read_code is '*<suffix>', so '.csharp' matches the same files 'csharp' did.
languages = [
    '.gcc', '.csharp', '.sbcl',
    '.clojure', '.ats', '.dart',
    '.erlang', '.fpascal', '.fsharp',
    '.gnat', '.go', '.gpp', '.hack', '.hs',
    '.ifc', '.java', '.javascript',
    '.jruby', '.ocaml', '.oz', '.perl',
    '.php', '.python3', '.racket', '.rust',
    '.scala', '.vw', '.yarv',
]

def _load_benchmark(directory):
    """Read the sources of every language in `languages` for one benchmark.

    Returns a 3-tuple ``(pairs, sources, labels)`` where ``pairs`` is the flat
    list of ``(source_text, suffix)`` tuples produced by `read_code`, and
    ``sources`` / ``labels`` are its two columns split apart.
    """
    pairs = list(itertools.chain.from_iterable(
        read_code(directory, lang) for lang in languages))
    sources = [source for source, _ in pairs]
    labels = [label for _, label in pairs]
    return pairs, sources, labels


# One load per benchmark program.
all_langs, langs, exts = _load_benchmark('fasta')
all_langs_fr, langs_fr, exts_fr = _load_benchmark('fastaredux')
all_langs_b, langs_b, exts_b = _load_benchmark('binarytrees')
all_langs_m, langs_m, exts_m = _load_benchmark('meteor')

# Training set: source texts and their suffix labels, concatenated in the
# same benchmark order so x_train[i] pairs with y_train[i].
x_train = langs + langs_fr + langs_b + langs_m
y_train = exts + exts_fr + exts_b + exts_m

In [2182]:
def get_test(directory='test', count=32):
    """Read the numbered test snippets ``<directory>/1`` .. ``<directory>/<count>``.

    Parameters
    ----------
    directory : str, optional
        Folder holding the numbered snippet files. Defaults to ``'test'``.
    count : int, optional
        Number of snippets to read, starting at file ``1``. Defaults to 32.

    Returns
    -------
    list of (str, int)
        ``(file_contents, file_number)`` tuples in ascending file number.

    Raises
    ------
    OSError
        If any of the expected files cannot be opened.
    """
    snippets = []
    for number in range(1, count + 1):
        with open('{}/{}'.format(directory, number)) as handle:
            snippets.append((handle.read(), number))
    return snippets

# Answer key for the test snippets. test.csv is read without a header row, so
# its columns are addressed by integer position (0, 1, ...).
ans = pd.read_csv('test.csv', header=None)

In [2183]:
# Map the language names used in the answer key onto the filename-suffix
# labels the training data is tagged with.
# NOTE(review): '.scm' and '.tcl' are not among the suffixes in `languages`,
# so a model trained on that list cannot predict them - confirm intent.
_name_to_suffix = {
    'clojure': '.clojure',
    'python': '.python3',
    'javascript': '.javascript',
    'ruby': '.jruby',
    'haskell': '.hs',
    'scheme': '.scm',
    'java': '.java',
    'scala': '.scala',
    'tcl': '.tcl',
    'php': '.php',
    'ocaml': '.ocaml',
}
repl_ans = ans.replace(_name_to_suffix)


In [2184]:
# Test inputs are the snippet texts only; the file number from get_test() is dropped.
x_test = [snippet for snippet, _ in get_test()]
# Test labels come from column 1 of the suffix-mapped answer key.
y_test = list(repl_ans[1])


In [2185]:
class FunctionFeaturizer(TransformerMixin):
    """Transformer that turns each sample into a vector of function outputs.

    Every callable passed to the constructor is applied to every sample; the
    results, in constructor order, form that sample's feature vector.
    """

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Nothing to learn; returns `self` as the scikit-learn interface expects."""
        return self

    def transform(self, X):
        """Return an array with one row per sample and one column per featurizer."""
        return np.array([
            [featurize(sample) for featurize in self.featurizers]
            for sample in X
        ])

In [2186]:
class BagOfWordsFeaturizer(TransformerMixin):
    """Bag-of-words transformer over lower-cased, lemmatized TextBlob tokens.

    Parameters
    ----------
    num_words : int or None, optional
        If truthy, keep only the `num_words` most frequent tokens seen during
        `fit`; otherwise keep every distinct token.
    """

    def __init__(self, num_words=None):
        self.num_words = num_words

    @staticmethod
    def _tokens(text):
        """Lower-case `text` and return its lemmatized TextBlob words."""
        return [word.lemmatize() for word in TextBlob(text.lower()).words]

    def fit(self, X, y=None):
        """Learn the vocabulary from the iterable of strings `X`; returns `self`."""
        counts = Counter()
        for text in X:
            counts.update(self._tokens(text))
        if self.num_words:
            self._vocab = [word for word, _ in counts.most_common(self.num_words)]
        else:
            # First-seen order keeps the column layout deterministic.
            self._vocab = list(counts)
        return self

    def transform(self, X):
        """Return one count vector (a list of ints) per string in `X`.

        Column ``i`` holds the number of occurrences of ``self._vocab[i]``;
        tokens outside the vocabulary are ignored.
        """
        # Built once per call so each token lookup is O(1) rather than a
        # linear list.index scan.
        column_of = {word: column for column, word in enumerate(self._vocab)}
        vectors = []
        for text in X:
            vector = [0] * len(self._vocab)
            # Tokens are lemmatized exactly as in fit(); otherwise inflected
            # forms would never match the lemmatized vocabulary.
            for word, count in Counter(self._tokens(text)).items():
                column = column_of.get(word)
                if column is not None:
                    vector[column] = count
            vectors.append(vector)
        return vectors

In [2187]:
def percentage_of_parens(text):
    """Return the fraction of characters in `text` that are '(' or ')'.

    Parameters
    ----------
    text : str
        Source text to inspect.

    Returns
    -------
    float
        Value in ``[0, 1]``. Empty text yields ``0.0`` instead of raising
        ``ZeroDivisionError``, so an empty file cannot abort feature extraction.
    """
    total_length = len(text)
    if total_length == 0:
        return 0.0
    paren_count = text.count('(') + text.count(')')
    return paren_count / total_length

# Hand-written per-snippet features, in the column order the classifiers see.
# NOTE(review): of these, only percentage_of_parens is defined in the visible
# part of this notebook; the others must already exist in the kernel for this
# cell to run - confirm where they are defined.
_feature_functions = [
    percentage_of_parens,
    percentage_of_bracks,
    percentage_of_semi,
    percentage_of_dollar,
    percentage_of_hyphen,
    percentage_of_arrow,
    presence_of_end,
    presence_of_def,
    presence_of_elif,
    presence_of_elsif,
    presence_of_return,
    presence_of_defun,
    presence_of_object,
    #presence_of_public,
    presence_of_func,
    presence_of_fun,
    presence_of_static,
    #percentage_of_ast,
    presence_of_struct,
    presence_of_let,
]

f = FunctionFeaturizer(*_feature_functions)

In [2188]:
# Combined feature extractor: counts for the 50 most common words, joined
# with the hand-written features in `f`.
bag_of_words = BagOfWordsFeaturizer(50)
code_featurizer = make_union(bag_of_words, f)

In [2189]:
# Baseline model: hand-written features fed into a single decision tree.
# random_state is pinned so Restart & Run All reproduces the same tree and
# the same scores.
pipe = make_pipeline(f, DecisionTreeClassifier(random_state=42))
pipe.fit(x_train, y_train)
# Accuracy on the data the tree was fitted on; this is a sanity check, not
# an estimate of how well the model generalizes.
pipe.score(x_train, y_train)

NameError: name 're' is not defined

In [None]:
#pipe.fit(x_test, y_test)
#pipe.score(x_test, y_test)
#vectorizer = CountVectorizer()
#pipe1 = make_pipeline(code_featurizer, MultinomialNB())
#pipe1.fit(x_train, y_train)
#pipe1.score(x_train, y_train)
#vectorizer.fit(x_train, y_train)
#vectorizer.score(x_train, y_train)

In [None]:
# Same hand-written features, classified by a random forest.
# random_state is pinned so the bootstrap samples and feature subsets, and
# therefore the reported score, are reproducible across runs.
random_tree = make_pipeline(f, RandomForestClassifier(random_state=42))

random_tree.fit(x_train, y_train)
# Mean accuracy on the test snippets.
random_tree.score(x_test, y_test)

In [None]:
# Mean accuracy of the decision-tree pipeline (`pipe`) on the test snippets.
pipe.score(x_test, y_test)

In [None]:
#x_train = vectorizer.transform(x_train)
#x_test = vectorizer.transform(x_test)

In [None]:
#classifier = MultinomialNB()

In [None]:

#classifier.fit(x_train, y_train)

In [None]:
#classifier.score(x_train, y_train)

In [None]:
#classifier.score(x_test, y_test)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, random_tree.predict(x_test)))
