In [1]:
import glob
import os
import random
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
import re
import scipy
import pandas as pd

## Link languages and extensions

In [2]:
# Map each language to the file extensions that identify its sample programs.
# Every extension carries its leading dot, because the lookup key used later
# is the suffix returned by os.path.splitext (e.g. ".py").
extentions = {
    "c": ['.gcc', '.c'],
    "c#": ['.cs', '.csharp'],
    "common lisp": ['.sbcl'],
    "clojure": ['.clj', '.clojure'],
    # '.lhs' previously lacked its dot ('lhs') and so could never match.
    "haskell": ['.hs', '.lhs', '.ghc'],
    'java': ['.java', '.class', '.jar'],
    'javascript': ['.js', '.javascript'],
    'ocaml': ['.ocaml'],
    'perl': ['.pl', '.pm', '.t', '.pod', '.perl'],
    'php': ['.hack', '.php'],
    'python': ['.py', '.python3'],
    'ruby': ['.jruby', '.yarv'],
    'scala': ['.scala', '.sc'],
    'scheme': ['.racket'],
}

# Inverted lookup: extension (with dot) -> language name.
langs = {
    extension: language
    for language, language_extensions in extentions.items()
    for extension in language_extensions
}

## Find sample files, identify their language, and choose the relevant ones

In [3]:
# Walk the benchmark tree recursively and keep only the paths whose extension
# maps to a known language. The two lists stay aligned: languages[i] is the
# label for relevant_files[i].
all_files = glob.iglob('./benchmarksgame-2014-08-31/benchmarksgame/bench/**',recursive=True)
relevant_files, languages = [], []
for candidate_path in all_files:
    extension = os.path.splitext(candidate_path)[1]
    if extension not in langs:
        continue
    relevant_files.append(candidate_path)
    languages.append(langs[extension])

## Exploration without Pipeline
### Words only
#### Vectorizing everything

In [4]:
# Word-level counts. input="filename" makes the vectorizer open each path
# itself; decode_error="ignore" keeps it from raising on undecodable bytes.
word_counter = CountVectorizer(
    input="filename",
    decode_error="ignore",
)
counted_by_words = word_counter.fit_transform(relevant_files)

In [25]:
# Largest single-token count in the first document. Slice the sparse matrix
# first so only that one row is examined, instead of densifying every row.
counted_by_words[0].max()

23

#### Classifying on everything

In [5]:
# Fit Naive Bayes on the word counts and show accuracy on the very same
# samples (a training-set score, so it is an optimistic figure).
classifier_by_words = MultinomialNB().fit(counted_by_words, languages)
classifier_by_words.score(counted_by_words, languages)

0.98634812286689422

#### Cross-validation (single train/test split)

In [6]:
# Hold out part of the word-count data and compare training vs. test accuracy.
# random_state is pinned so that Restart & Run All reproduces the same split
# and scores; any recorded scores from an unseeded run will differ slightly.
train_x, test_x, train_y, test_y = train_test_split(
    counted_by_words, languages, random_state=0
)
test_classifier_by_words = MultinomialNB()
test_classifier_by_words.fit(train_x, train_y)
print("Training Score: ", test_classifier_by_words.score(train_x, train_y))
print("Test Score: ", test_classifier_by_words.score(test_x, test_y))

Training Score:  0.986332574032
Test Score:  0.87074829932


### Single Characters
#### Vectorizing Everything

In [7]:
# Character-level counts: every regex match of `.` becomes one token. In
# Python's `re`, `.` does not match a newline, so line breaks are not tokens.
character_counter = CountVectorizer(
    input="filename",
    decode_error="ignore",
    tokenizer=re.compile(r'.').findall,
)
counted_by_characters = character_counter.fit_transform(relevant_files)

#### Classifying on everything

In [8]:
# Fit Naive Bayes on the single-character counts and show accuracy on the
# same samples it was trained on.
classifier_by_characters = MultinomialNB().fit(counted_by_characters, languages)
classifier_by_characters.score(counted_by_characters, languages)

0.80887372013651881

#### Cross-validation (single train/test split)

In [9]:
# Hold out part of the single-character data and compare training vs. test
# accuracy. random_state is pinned so that re-running the notebook reproduces
# the same split; recorded scores from an unseeded run will differ slightly.
train_x, test_x, train_y, test_y = train_test_split(
    counted_by_characters, languages, random_state=0
)
test_classifier_by_characters = MultinomialNB()
test_classifier_by_characters.fit(train_x, train_y)
print("Training Score: ", test_classifier_by_characters.score(train_x, train_y))
print("Test Score: ", test_classifier_by_characters.score(test_x, test_y))

Training Score:  0.788154897494
Test Score:  0.734693877551


### Pairs of characters
#### Vectorizing everything

In [10]:
# Character-pair counts: tokens are single characters (regex `.`, which skips
# newlines) and ngram_range=(2,2) keeps only pairs of adjacent tokens.
character_2_counter = CountVectorizer(
    input="filename",
    decode_error="ignore",
    ngram_range=(2,2),
    tokenizer=re.compile(r'.').findall,
)
counted_by_2characters = character_2_counter.fit_transform(relevant_files)

#### Classifying on everything

In [11]:
# Fit Naive Bayes on the character-pair counts and show accuracy on the same
# samples it was trained on.
classifier_by_2_characters = MultinomialNB().fit(counted_by_2characters, languages)
classifier_by_2_characters.score(counted_by_2characters, languages)

0.96587030716723554

#### Cross-validation (single train/test split)

In [12]:
# Hold out part of the character-pair data and compare training vs. test
# accuracy. random_state is pinned so that re-running the notebook reproduces
# the same split; recorded scores from an unseeded run will differ slightly.
train_x, test_x, train_y, test_y = train_test_split(
    counted_by_2characters, languages, random_state=0
)
test_classifier_by_2_characters = MultinomialNB()
test_classifier_by_2_characters.fit(train_x, train_y)
print("Training Score: ", test_classifier_by_2_characters.score(train_x, train_y))
print("Test Score: ", test_classifier_by_2_characters.score(test_x, test_y))

Training Score:  0.95444191344
Test Score:  0.850340136054


###  All three together
#### Combining previous data

In [13]:
# Concatenate the three feature matrices column-wise so each row describes one
# file by its word, character and character-pair counts.
# format="csr" is requested explicitly: hstack otherwise returns a COO matrix,
# which cannot be row-indexed (e.g. all_vectors[1]).
all_vectors = scipy.sparse.hstack(
    [counted_by_words, counted_by_characters, counted_by_2characters],
    format="csr",
)

#### Classifying on everything

In [14]:
# Train one Naive Bayes model on the stacked word, character and
# character-pair counts, then show its accuracy on that same training data.
all_classifier = MultinomialNB().fit(all_vectors, languages)
all_classifier.score(all_vectors, languages)

0.97610921501706482

#### Cross-validation (single train/test split)

In [15]:
# Hold out part of the combined feature matrix and compare training vs. test
# accuracy. random_state is pinned so that re-running the notebook reproduces
# the same split; recorded scores from an unseeded run will differ slightly.
train_x, test_x, train_y, test_y = train_test_split(
    all_vectors, languages, random_state=0
)
test_all_classifier = MultinomialNB()
test_all_classifier.fit(train_x, train_y)
print("Training Score: ", test_all_classifier.score(train_x, train_y))
print("Test Score: ", test_all_classifier.score(test_x, test_y))

Training Score:  0.970387243736
Test Score:  0.945578231293


#### Find probabilities

In [27]:
# NOTE(review): all_dense is not read anywhere in the visible notebook; it is
# kept only in case something outside this view uses it. Consider dropping it,
# since it materialises the full feature matrix in memory.
all_dense = all_vectors.todense()

# Use the fully-qualified option name: the bare 'precision' pattern is
# ambiguous on newer pandas (it also matches the styler precision option).
pd.set_option('display.precision', 2)

# One row per training file, one column per class, indexed by the true label.
probabilities = pd.DataFrame(
    all_classifier.predict_proba(all_vectors),
    columns=all_classifier.classes_,
)
probabilities["Actual"] = languages
probabilities = probabilities.set_index("Actual")
probabilities

Unnamed: 0_level_0,c,c#,clojure,common lisp,haskell,java,javascript,ocaml,perl,php,python,ruby,scala,scheme
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
clojure,0.00e+00,0.00e+00,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
clojure,0.00e+00,0.00e+00,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
clojure,0.00e+00,0.00e+00,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
c#,3.09e-52,1.16e-30,0.00e+00,0.00e+00,0.00e+00,1.00e+00,0.00e+00,0.00e+00,0.00e+00,2.36e-292,0.00e+00,1.19e-322,6.29e-171,0.00e+00
c#,2.58e-122,1.62e-56,0.00e+00,0.00e+00,0.00e+00,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,4.24e-197,0.00e+00
c,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
c,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
c,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
c,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
c,1.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00


## Compare with Test data from assignment
#### Requires the test files in `./test_data/` under the working directory

### Read test files

In [17]:
# Paths of the assignment's test files: ./test_data/1.txt through 32.txt,
# in numeric order.
test_files = [
    "./test_data/{}.txt".format(file_number)
    for file_number in range(1, 33)
]

### Vectorize test files for each type of data and combine

In [18]:
# Re-use the vectorizers fitted on the training files (transform only, no
# refit) so the test columns line up with the training features, then stack
# the three matrices in the same order as the training matrix.
test_words, test_chars, test_2chars = (
    fitted_vectorizer.transform(test_files)
    for fitted_vectorizer in (word_counter, character_counter, character_2_counter)
)
test_all = scipy.sparse.hstack([test_words, test_chars, test_2chars])

### Make predictions and display

In [19]:
# Predicted language for each test file, in the same order as test_files.
predicts = all_classifier.predict(test_all)

In [20]:
# Show each test file next to its predicted language, one tuple per line.
for test_path, predicted_language in zip(test_files, predicts):
    print((test_path, predicted_language))

('./test_data/1.txt', 'clojure')
('./test_data/2.txt', 'clojure')
('./test_data/3.txt', 'clojure')
('./test_data/4.txt', 'clojure')
('./test_data/5.txt', 'python')
('./test_data/6.txt', 'python')
('./test_data/7.txt', 'python')
('./test_data/8.txt', 'python')
('./test_data/9.txt', 'scala')
('./test_data/10.txt', 'python')
('./test_data/11.txt', 'scala')
('./test_data/12.txt', 'scala')
('./test_data/13.txt', 'ruby')
('./test_data/14.txt', 'haskell')
('./test_data/15.txt', 'ruby')
('./test_data/16.txt', 'haskell')
('./test_data/17.txt', 'haskell')
('./test_data/18.txt', 'haskell')
('./test_data/19.txt', 'common lisp')
('./test_data/20.txt', 'scheme')
('./test_data/21.txt', 'scheme')
('./test_data/22.txt', 'c')
('./test_data/23.txt', 'c')
('./test_data/24.txt', 'scala')
('./test_data/25.txt', 'scala')
('./test_data/26.txt', 'perl')
('./test_data/27.txt', 'perl')
('./test_data/28.txt', 'c')
('./test_data/29.txt', 'php')
('./test_data/30.txt', 'php')
('./test_data/31.txt', 'ocaml')
('./test

#### That doesn't seem very good. How bad is it?
Let's check the score.
#### We need the files in the same order as the key

In [21]:
# Answer key: the true language of ./test_data/1.txt .. 32.txt, comma-separated
# and in file order (32 labels). It includes 'tcl', for which no extension is
# registered in `extentions`, so the classifier was never trained on that label
# and those two entries cannot be predicted correctly.
actual = 'clojure,clojure,clojure,clojure,python,python,python,python,javascript,javascript,javascript,javascript,ruby,ruby,ruby,haskell,haskell,haskell,scheme,scheme,scheme,java,java,scala,scala,tcl,tcl,php,php,php,ocaml,ocaml'

In [22]:
# Turn the comma-separated answer key into a list of labels. The isinstance
# guard makes the cell safe to re-run: once `actual` is already a list,
# calling .split on it would raise AttributeError.
if isinstance(actual, str):
    actual = actual.split(',')

In [23]:
# Accuracy of the combined-feature classifier on the assignment's test files,
# scored against the answer key in `actual`.
all_classifier.score(test_all,actual)

0.65625

That's better than I thought, but still not great. Also, I seem to be missing a lot of the JavaScript IDs.
I suspect that adding tf-idf to the system would improve the performance, as might adding more data.

Next Steps:
* Put together a pipeline with FeatureUnion to perform this modeling.
* Add tfidf to the pipeline
* look for more data
* consider additional metrics
* look into better visualization
* off-load functions to a script
* unit-tests