In [11]:
import csv
import glob
import numpy as np
import re
from collections import Counter
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

In [44]:
def read_polyglot(extension_list):
    for extension in extension_list:
        files = glob.glob('data/*.{}'.format(extension))
        X = []
        y = []
        for file in files:
            y.append(extension)
            with open(file, encoding='windows-1252') as f:
                X.append(f.read())
    return X, y

In [45]:
# read data into lang_data, lang_results
languages = ['gcc', 'c', 'csharp' 'sbcl', 'clojure', 'ghc' 'java', 'javascript',
             'ocaml', 'perl', 'php', 'hack', 'py', 'python3', 'jruby', 'yarv', 'rb',
             'scala', 'racket']
lang_data, lang_results = read_polyglot(languages)

In [48]:
print(lang_data[:1])
print(lang_results[:1])

['#lang racket/base\n\n;;; The Computer Language Benchmarks Game\n;;; http://benchmarksgame.alioth.debian.org/\n\n;;; Derived from the Chicken variant by Sven Hartrumpf\n;;; contributed by Matthew Flatt\n\n(require racket/cmdline)\n\n(struct node (left val right))\n\n;; Instead of (define-struct leaf (val)):\n(define (leaf val) (node #f val #f))\n(define (leaf? l) (not (node-left l)))\n(define (leaf-val l) (node-val l))\n\n(define (make item d)\n  (if (= d 0)\n      (leaf item)\n      (let ((item2 (* item 2))\n            (d2 (- d 1)))\n        (node (make (- item2 1) d2) \n              item \n              (make item2 d2)))))\n\n(define (check t)\n  (if (leaf? t)\n      (leaf-val t)\n      (+ (node-val t) (- (check (node-left t)) \n                         (check (node-right t))))))\n\n(define (main n)\n  (let* ((min-depth 4)\n         (max-depth (max (+ min-depth 2) n)))\n    (let ((stretch-depth (+ max-depth 1)))\n      (printf "stretch tree of depth ~a\\t check: ~a\\n"\n          

In [6]:
def longest_run_of_capital_letters(text):
    '''Find the longest run of capital letters and return their length'''
    text = re.sub(r'\W', '', text)
    result = re.findall(r'[A-Z]+', text)
    if result:
        return len(sorted(result, key=len, reverse=True)[0])
    return 0

In [7]:
def percentage_of_punctuation(text):
    total_length = len(text)
    text = re.sub(r'[\w\s]', '', text)
    punct_length = len(text)
    
    return punct_length / total_length

In [3]:
class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        '''All SciKit-Learn–Compatible transformers and classifiers have the same interface'''
        return self
    
    def transform(self, X):
        feature_vectors = []
        for x in X:
            feature_vector = [f(x) for f in self.featurizers]
            feature_vectors.append(feature_vector)
            
        return np.array(feature_vectors)

In [4]:
class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors

In [None]:
lang_featurizer = make_union(
    BagOfWordsFeaturizer(20),
    FunctionFeaturizer(len,
                       longest_run_of_capital_letters,
                       percentage_of_punctuation)
)

X_train, X_test, y_train, y_test = train_test_split(lang_data, lang_results)

pipe = make_pipeline(lang_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

In [None]:
print(classification_report(pipe.predict(X_test), y_test))