In [1]:
import csv
import glob
import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

In [2]:
def read_polyglot(extension_list):
    X = []
    y = []
    for extension in extension_list:
        files = glob.glob('data/*.{}'.format(extension))
        for file in files:
            y.append(extension)
            with open(file, encoding='windows-1252') as f:
                X.append(f.read())
    return X, y

In [16]:
# read data into lang_data, lang_results
languages = ['gcc', 'c', 'csharp' 'sbcl', 'clojure', 'ghc' 'java', 'javascript',
             'ocaml', 'perl', 'php', 'hack', 'py', 'python3', 'jruby', 'yarv', 'rb',
             'scala', 'racket']
lang_data, lang_results = read_polyglot(languages)
lang_info = pd.DataFrame(lang_results)

In [93]:
def match_extensions(df):
    '''Match extensions with the name of their programming language
    '''
    df = df.replace(to_replace='gcc', value='C')
    df = df.replace(to_replace='c', value='C')
    df = df.replace(to_replace='csharp', value='C#')
    df = df.replace(to_replace='sbcl', value='Common Lisp')
    df = df.replace(to_replace='clojure', value='Clojure')
    df = df.replace(to_replace='ghc', value='Haskell')
    df = df.replace(to_replace='haskell', value='Haskell')
    df = df.replace(to_replace='java', value='Java')
    df = df.replace(to_replace='javascript', value='JavaScript')
    df = df.replace(to_replace='js', value='JavaScript')
    df = df.replace(to_replace='ocaml', value='OCaml')
    df = df.replace(to_replace='perl', value='Perl')
    df = df.replace(to_replace='php', value='PHP')
    df = df.replace(to_replace='hack', value='PHP')
    df = df.replace(to_replace='python', value='Python')
    df = df.replace(to_replace='python3', value='Python')
    df = df.replace(to_replace='py', value='Python')
    df = df.replace(to_replace='yarv', value='Ruby')
    df = df.replace(to_replace='rb', value='Ruby')
    df = df.replace(to_replace='ruby', value='Ruby')
    df = df.replace(to_replace='jruby', value='Ruby')
    df = df.replace(to_replace='yarv', value='Ruby')
    df = df.replace(to_replace='scala', value='Scala')
    df = df.replace(to_replace='racket', value='Scheme')
    df = df.replace(to_replace='scheme', value='Scheme')
    return df

In [18]:
lang_info = match_extensions(lang_info)
lang_results = list(lang_info)
lang_info[0].value_counts()

Ruby          73
C             59
PHP           55
Scala         43
Clojure       38
Python        36
OCaml         35
Perl          34
Scheme        29
JavaScript    25
dtype: int64

In [6]:
def longest_run_of_capital_letters(text):
    '''Find the longest run of capital letters and return their length'''
    text = re.sub(r'\W', '', text)
    result = re.findall(r'[A-Z]+', text)
    if result:
        return len(sorted(result, key=len, reverse=True)[0])
    return 0

In [7]:
def percentage_of_punctuation(text):
    total_length = len(text)
    text = re.sub(r'[\w\s]', '', text)
    punct_length = len(text)
    
    return punct_length / total_length

In [8]:
class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        '''All SciKit-Learn–Compatible transformers and classifiers have the same interface'''
        return self
    
    def transform(self, X):
        feature_vectors = []
        for x in X:
            feature_vector = [f(x) for f in self.featurizers]
            feature_vectors.append(feature_vector)
            
        return np.array(feature_vectors)

In [9]:
class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors

In [10]:
lang_featurizer = make_union(
    BagOfWordsFeaturizer(20),
    FunctionFeaturizer(percentage_of_punctuation)
)

X_train, X_test, y_train, y_test = train_test_split(lang_data, lang_results)

pipe = make_pipeline(lang_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.79439252336448596

In [11]:
print(classification_report(pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

          C       0.71      1.00      0.83        10
    Clojure       0.75      0.86      0.80         7
 JavaScript       0.50      0.50      0.50         4
      OCaml       0.73      0.89      0.80         9
        PHP       0.71      0.59      0.65        17
       Perl       0.50      0.57      0.53         7
     Python       0.78      0.88      0.82         8
       Ruby       1.00      0.91      0.95        22
      Scala       1.00      0.76      0.87        17
     Scheme       0.83      0.83      0.83         6

avg / total       0.81      0.79      0.80       107



In [99]:
def read_tests():
    X = []
    y = None
    files = glob.glob('test/*')
    for file in files:
        with open(file) as f:
            X.append(f.read())

    with open('test.csv') as f:
        y = f.read()
    
    inter = y.split('\n')
    
    y_out = []
    for pair in inter:
        y_out.append(pair.split(','))
    
    y_out = list(pd.DataFrame(y_out).pop(1))[:-1]
    
    y_df = match_extensions(pd.DataFrame(y_out))
    y_out = list(y_df[0])
    
    return X, y_out

In [100]:
X_assignment_test, y_assignment_test = read_tests()

In [101]:
len(X_assignment_test)

32

In [102]:
y_assignment_test

['Clojure',
 'Clojure',
 'Clojure',
 'Clojure',
 'Python',
 'Python',
 'Python',
 'Python',
 'JavaScript',
 'JavaScript',
 'JavaScript',
 'JavaScript',
 'Ruby',
 'Ruby',
 'Ruby',
 'Haskell',
 'Haskell',
 'Haskell',
 'Scheme',
 'Scheme',
 'Scheme',
 'Java',
 'Java',
 'Scala',
 'Scala',
 'tcl',
 'tcl',
 'PHP',
 'PHP',
 'PHP',
 'OCaml',
 'OCaml']

In [103]:
pipe.score(X_assignment_test, y_assignment_test)

0.125

In [104]:
print(classification_report(pipe.predict(X_assignment_test), y_assignment_test))

             precision    recall  f1-score   support

    Clojure       0.25      0.50      0.33         2
    Haskell       0.00      0.00      0.00         0
       Java       0.00      0.00      0.00         0
 JavaScript       0.00      0.00      0.00         1
      OCaml       0.00      0.00      0.00         1
        PHP       0.00      0.00      0.00         0
       Perl       0.00      0.00      0.00         2
     Python       0.25      0.14      0.18         7
       Ruby       0.67      0.12      0.20        17
      Scala       0.00      0.00      0.00         1
     Scheme       0.00      0.00      0.00         1
        tcl       0.00      0.00      0.00         0

avg / total       0.42      0.12      0.17        32



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
