In [1]:
import csv
from lib.import_lang_data import *
# import glob
import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

In [2]:
# def read_polyglot(extension_list):
#     X = []
#     y = []
#     for extension in extension_list:
#         files = glob.glob('data/*.{}'.format(extension))
#         for file in files:
#             y.append(extension)
#             with open(file, encoding='windows-1252') as f:
#                 X.append(f.read())
#     return X, y

In [3]:
# read data into lang_data, lang_results
languages = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'ghc' 'java', 'javascript',
             'ocaml', 'perl', 'php', 'hack', 'py', 'python3', 'jruby', 'yarv', 'rb',
             'scala', 'racket']
lang_data, lang_results = read_polyglot(languages)
lang_info = pd.DataFrame(lang_results)
lang_info[0].value_counts()

gcc           58
scala         43
csharp        41
yarv          39
clojure       38
python3       36
ocaml         35
sbcl          34
jruby         34
perl          34
php           29
racket        29
hack          26
javascript    25
c              1
dtype: int64

In [4]:
# def match_extensions(df):
#     '''Match extensions with the name of their programming language
#     '''
#     df[0] = df[0].str.replace(r'^gcc$', 'C')
#     df[0] = df[0].str.replace(r'^c$','C')
#     df[0] = df[0].str.replace(r'csharp', 'C#')
#     df[0] = df[0].str.replace(r'tcl', 'TCL')
#     df[0] = df[0].str.replace(r'sbcl', 'Common Lisp')
#     df[0] = df[0].str.replace(r'clojure', 'Clojure')
#     df[0] = df[0].str.replace(r'ghc', 'Haskell')
#     df[0] = df[0].str.replace(r'haskell', 'Haskell')
#     df[0] = df[0].str.replace(r'javascript', 'JavaScript')
#     df[0] = df[0].str.replace(r'java', 'Java')
#     df[0] = df[0].str.replace(r'js', 'JavaScript')
#     df[0] = df[0].str.replace(r'ocaml', 'OCaml')
#     df[0] = df[0].str.replace(r'perl', 'Perl')
#     df[0] = df[0].str.replace(r'php', 'PHP')
#     df[0] = df[0].str.replace(r'hack', 'PHP')
#     df[0] = df[0].str.replace(r'python3', 'Python')
#     df[0] = df[0].str.replace(r'python', 'Python')
#     df[0] = df[0].str.replace(r'py', 'Python')
#     df[0] = df[0].str.replace(r'yarv', 'Ruby')
#     df[0] = df[0].str.replace(r'rb', 'Ruby')
#     df[0] = df[0].str.replace(r'ruby', 'Ruby')
#     df[0] = df[0].str.replace(r'jruby', 'Ruby')
#     df[0] = df[0].str.replace(r'yarv', 'Ruby')
#     df[0] = df[0].str.replace(r'scala', 'Scala')
#     df[0] = df[0].str.replace(r'racket', 'Scheme')
#     df[0] = df[0].str.replace(r'scheme', 'Scheme')
#     return df

In [5]:
lang_info = match_extensions(lang_info)
lang_results = list(lang_info[0])
lang_info[0].value_counts()

C              59
PHP            55
Scala          43
C#             41
Ruby           39
Clojure        38
Python         36
OCaml          35
jRuby          34
Perl           34
Common Lisp    34
Scheme         29
JavaScript     25
dtype: int64

In [6]:
def longest_run_of_capital_letters(text):
    '''Find the longest run of capital letters and return their length'''
    text = re.sub(r'\W', '', text)
    result = re.findall(r'[A-Z]+', text)
    if result:
        return len(sorted(result, key=len, reverse=True)[0])
    return 0

In [7]:
def num_nil(text):
    result = re.findall(r'\W+nil\W+', text)
    if result:
        return len(result)
    else:
        return 0

In [8]:
def percentage_of_punctuation(text):
    total_length = len(text)
    text = re.sub(r'[\w\s]', '', text)
    punct_length = len(text)
    
    return punct_length / total_length

In [9]:
class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        '''All SciKit-Learn–Compatible transformers and classifiers have the same interface'''
        return self
    
    def transform(self, X):
        feature_vectors = []
        for x in X:
            feature_vector = [f(x) for f in self.featurizers]
            feature_vectors.append(feature_vector)
            
        return np.array(feature_vectors)

In [10]:
class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors

In [11]:
lang_featurizer = make_union(
    BagOfWordsFeaturizer(20),
    FunctionFeaturizer(num_nil,
                       percentage_of_punctuation)
)

X_train, X_test, y_train, y_test = train_test_split(lang_data, lang_results)

pipe = make_pipeline(lang_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.62698412698412698

In [12]:
print(classification_report(pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

          C       0.71      0.83      0.77        12
         C#       0.91      0.77      0.83        13
    Clojure       0.70      0.58      0.64        12
Common Lisp       0.62      0.71      0.67         7
 JavaScript       0.50      0.50      0.50         6
      OCaml       0.75      1.00      0.86         3
        PHP       0.86      0.67      0.75        18
       Perl       0.12      0.33      0.18         3
     Python       0.70      0.70      0.70        10
       Ruby       0.60      0.38      0.46        16
      Scala       0.57      0.62      0.59        13
     Scheme       0.86      0.75      0.80         8
      jRuby       0.10      0.20      0.13         5

avg / total       0.68      0.63      0.64       126



## Now to test with the assignment tests

In [13]:
def read_tests():
    X = []
    y = None
    files = glob.glob('test/*')
    for file in files:
        with open(file) as f:
            X.append(f.read())

    with open('test.csv') as f:
        y = f.read()
    
    inter = y.split('\n')
    
    y_out = []
    for pair in inter:
        y_out.append(pair.split(','))
    
    y_out = list(pd.DataFrame(y_out).pop(1))[:-1]
    
    y_df = match_extensions(pd.DataFrame(y_out))
    y_out = list(y_df[0])
    
    return X, y_out

In [14]:
X_assignment_test, y_assignment_test = read_tests()

In [15]:
pipe.score(X_assignment_test, y_assignment_test)

0.03125

In [16]:
print(classification_report(pipe.predict(X_assignment_test), y_assignment_test))

             precision    recall  f1-score   support

          C       0.00      0.00      0.00         1
    Clojure       0.00      0.00      0.00         7
Common Lisp       0.00      0.00      0.00         2
    Haskell       0.00      0.00      0.00         0
       Java       0.00      0.00      0.00         0
 JavaScript       0.00      0.00      0.00         0
      OCaml       0.00      0.00      0.00         2
        PHP       0.00      0.00      0.00         0
       Perl       0.00      0.00      0.00         3
     Python       0.25      0.20      0.22         5
       Ruby       0.00      0.00      0.00         3
      Scala       0.00      0.00      0.00         1
     Scheme       0.00      0.00      0.00         0
        TCL       0.00      0.00      0.00         0
      jRuby       0.00      0.00      0.00         8

avg / total       0.04      0.03      0.03        32



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
