# Polyglot

This Ipython notebook is a showcase of Polyglot. It is a program that attempts to identify a programming language given a code snippet. It uses the sklearn library and custom selectrion criteria to generate the best guess for the given snippet.

In [58]:
import glob
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import itertools
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
import re
from textblob import TextBlob
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from polyglot_lib import *
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [59]:
def read_code(directory, lang):
    text = []
    files = glob.glob('benchmarks/benchmarksgame/bench/{}/*{}'.format(directory, lang))    
    for file in files:
        with open(file,) as f:
            text.append((f.read(), lang))
    return text

In [60]:
languages = ['.gcc', '.csharp', '.sbcl',
             '.clojure', '.ats',
            '.go', '.hack', '.hs'
            '.java', '.javascript',
            '.jruby', '.ocaml', '.perl', '.tcl'
            '.php', '.python3', '.racket', '.rust',
            '.scala', '.scm', '.vw']

all_langs = [read_code('fasta', lang) for lang in languages]
all_langs  = list(itertools.chain(*all_langs))
langs = [x[0] for x in all_langs]
exts = [x[1] for x in all_langs]

all_langs_fr = [read_code('fastaredux', lang) for lang in languages]
all_langs_fr  = list(itertools.chain(*all_langs_fr))
langs_fr = [x[0] for x in all_langs_fr]
exts_fr = [x[1] for x in all_langs_fr]

all_langs_b = [read_code('binarytrees', lang) for lang in languages]
all_langs_b  = list(itertools.chain(*all_langs_b))
langs_b = [x[0] for x in all_langs_b]
exts_b = [x[1] for x in all_langs_b]

all_langs_m = [read_code('meteor', lang) for lang in languages]
all_langs_m  = list(itertools.chain(*all_langs_m))
langs_m = [x[0] for x in all_langs_m]
exts_m = [x[1] for x in all_langs_m]

all_langs_n = [read_code('knucleotide', lang) for lang in languages]
all_langs_n  = list(itertools.chain(*all_langs_n))
langs_n = [x[0] for x in all_langs_n]
exts_n = [x[1] for x in all_langs_n]

all_langs_r = [read_code('revcomp', lang) for lang in languages]
all_langs_r  = list(itertools.chain(*all_langs_r))
langs_r = [x[0] for x in all_langs_r]
exts_r = [x[1] for x in all_langs_r]

all_langs_rd = [read_code('regexdna', lang) for lang in languages]
all_langs_rd  = list(itertools.chain(*all_langs_rd))
langs_rd = [x[0] for x in all_langs_rd]
exts_rd = [x[1] for x in all_langs_rd]

all_langs_md = [read_code('mandelbrot', lang) for lang in languages]
all_langs_md  = list(itertools.chain(*all_langs_md))
langs_md = [x[0] for x in all_langs_md]
exts_md = [x[1] for x in all_langs_md]

all_langs_s = [read_code('spectralnorm', lang) for lang in languages]
all_langs_s  = list(itertools.chain(*all_langs_s))
langs_s = [x[0] for x in all_langs_s]
exts_s = [x[1] for x in all_langs_s]

all_langs_body = [read_code('nbody', lang) for lang in languages]
all_langs_body  = list(itertools.chain(*all_langs_body))
langs_body = [x[0] for x in all_langs_body]
exts_body = [x[1] for x in all_langs_body]


all_langs_t = [read_code('threadring', lang) for lang in languages]
all_langs_t  = list(itertools.chain(*all_langs_t))
langs_t = [x[0] for x in all_langs_t]
exts_t = [x[1] for x in all_langs_t]



x = langs+langs_fr+langs_b+langs_m+langs_n+langs_r+langs_rd+langs_md+langs_s
x = x+langs_body+langs_t
y = exts+exts_fr+exts_b+exts_m+exts_n+exts_r+exts_rd+exts_md+exts_s
y = y+exts_body+exts_t

In [61]:
def get_test():
    text = []
    for file in range(32):
        with open('test/{}'.format(file+1)) as f:
            text.append((f.read(), file+1))
    return text

ans = pd.read_csv('test.csv', header = None)

In [62]:
repl_ans = ans.replace({'clojure':'.clojure', 'python':'.python3',
                       'javascript':'.javascript', 'ruby':'.jruby',
                       'haskell':'.hs', 'scheme':'.scm', 'java':'.java',
                       'scala':'.scala', 'tcl':'.tcl', 'php':'.php',
                       'ocaml':'.ocaml'})


In [63]:
x_class = [x[0] for x in get_test()]
y_class = list(repl_ans[1])


In [64]:
class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        '''All SciKit-learn compatible transformers and classifiers have the same
        interface. `fit` should always return the same object (self)'''
        return self
    
    def transform(self, X):
        '''Given a list of original data, return a list of feature vectors'''
        feature_vectors = []
        for x in X:
            feature_vector = [f(x) for f in self.featurizers]
            feature_vectors.append(feature_vector)
        
        return np.array(feature_vectors)

In [65]:
class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors

In [66]:
f = FunctionFeaturizer(percentage_of_parens,
                      percentage_of_bracks,
                      percentage_of_semi,
                      percentage_of_dollar,
                      percentage_of_hyphen,
                      percentage_of_arrow,
                      presence_of_end,
                      presence_of_def,
                      presence_of_elif,
                      presence_of_elsif,
                      presence_of_return,
                      presence_of_defun,
                      presence_of_object,
                      presence_of_public,
                      presence_of_func,
                      presence_of_fun,
                      presence_of_static,
                      percentage_of_ast,
                      presence_of_struct,
                      presence_of_let,
                      presence_of_at,
                      )

In [67]:
code_featurizer = make_union(
    BagOfWordsFeaturizer(70),
    f
)


In [68]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

# Testing Our Data

At this point we have seperated our data into test and training data and have created selection criteria. We will make a pipeline with a Random Forest classifier, which uses multiple decision trees, and with our custom featurizer. The first score shows our test sample from our intake data, the second is from the test files given. 

From the results, it is easy to see that while our program is pretty great at working with the test data from our training data, it is not as good when working with outside test data. 

In [69]:
classifier = MultinomialNB()
vectorizer = CountVectorizer()
vectorizer.fit(x_train)
vectorizer.transform(x_train)


<307x6745 sparse matrix of type '<class 'numpy.int64'>'
	with 38067 stored elements in Compressed Sparse Row format>

In [70]:
random_tree = make_pipeline(code_featurizer, RandomForestClassifier())
random_tree.fit(x_train, y_train)
random_tree.score(x_test, y_test)


0.95145631067961167

In [71]:
random_tree.score(x_class, y_class)


0.5

In [72]:
print(classification_report(random_tree.predict(x_test), y_test))


             precision    recall  f1-score   support

       .ats       1.00      1.00      1.00         4
   .clojure       1.00      1.00      1.00         8
    .csharp       1.00      0.92      0.96        12
       .gcc       1.00      0.92      0.96        13
        .go       0.88      1.00      0.93         7
      .hack       0.90      1.00      0.95         9
.javascript       1.00      0.75      0.86         4
     .jruby       1.00      0.83      0.91         6
     .ocaml       1.00      1.00      1.00         7
      .perl       1.00      1.00      1.00         6
   .python3       0.86      1.00      0.92         6
    .racket       1.00      0.75      0.86         4
      .rust       0.50      1.00      0.67         1
      .sbcl       1.00      1.00      1.00         4
     .scala       1.00      1.00      1.00        10
       .scm       0.00      0.00      0.00         0
        .vw       1.00      1.00      1.00         2

avg / total       0.97      0.95      0.96  

  'recall', 'true', average, warn_for)


In [73]:
print(classification_report(random_tree.predict(x_class), y_class))


             precision    recall  f1-score   support

   .clojure       1.00      0.67      0.80         6
        .go       0.00      0.00      0.00         2
      .hack       0.00      0.00      0.00         4
        .hs       0.00      0.00      0.00         0
      .java       0.00      0.00      0.00         0
.javascript       0.50      1.00      0.67         2
     .jruby       1.00      0.43      0.60         7
     .ocaml       0.50      1.00      0.67         1
      .perl       0.00      0.00      0.00         3
       .php       0.00      0.00      0.00         0
   .python3       0.50      1.00      0.67         2
     .scala       1.00      0.67      0.80         3
       .scm       0.67      1.00      0.80         2
       .tcl       0.00      0.00      0.00         0

avg / total       0.62      0.50      0.51        32



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [74]:
python = '''class BagOfWordsFeaturizer(TransformerMixin):,
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self'''

In [75]:
def get_lang(text):
    ans = random_tree.predict([text])
    print("Your language is probably {}.".format(ans))

In [76]:
get_lang(python)

Your language is probably ['.python3'].
