# Naive Bayes (the easy way)

Prepare data set (can be skipped, it is already there, see source folder)

In [2]:
import os
import shutil

source_root = '/Users/apismenskiy/git'
directory_path = os.path.join(source_root, 'source')
output_path = 'source'
java_path = os.path.join(output_path, 'java')
cpp_path = os.path.join(output_path, 'cpp')
scala_path = os.path.join(output_path, 'scala')
js_path = os.path.join(output_path, 'javascript')
py_path = os.path.join(output_path, 'python')
text_path = os.path.join(output_path, 'plaintext')

# we can use this code to create a new data sets if we want to rebuild the model
# def find_files_with_extension(directory, extension):
#     file_list = []
# 
#     for root, dirs, files in os.walk(directory):
#         for file in files:
#             if file.endswith(extension):
#                 file_list.append(os.path.join(root, file))
# 
#     return file_list

# def create_and_move_files(directory, extension, output_folder_name):
#     found_files = find_files_with_extension(directory, extension)
# 
#     if not found_files:
#         print(f"No files with '{extension}' extension found in the specified directory.")
#         return
# 
#     if not os.path.exists(output_folder_name):
#         os.makedirs(output_folder_name)
#         print(f"Created folder '{output_folder_name}' to store files.")
# 
#     for file_path in found_files:
#         new_file_path = os.path.join(output_folder_name, os.path.basename(file_path))
#         shutil.move(file_path, new_file_path)
#         print(f"Moved '{file_path}' to '{new_file_path}'")



# create_and_move_files(os.path.join(source_root, 'tika'), 'java', java_path)
# create_and_move_files(os.path.join(source_root, 'tesseract'), 'cpp', cpp_path)
# create_and_move_files(os.path.join(source_root, 'playframework'), 'scala', scala_path)
# create_and_move_files(os.path.join(source_root, 'jquery'), 'js', js_path)
# create_and_move_files(os.path.join(source_root, 'scikit-learn'), 'py', py_path)


We'll cheat by using sklearn.naive_bayes to train a source code classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [95]:
import os
import io
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import re

def remove_comments(code):
    # Remove single-line comments starting with "//" or "#"
    code = re.sub(r'(\/\/[^\n]*|#[^\n]*)', '', code)

    # Remove multi-line comments enclosed within '/*' and '*/'
    code = re.sub(r'\/\*[\s\S]*?\*\/', '', code)

    return code

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            # remove comments
            # todo in the real world we may need to add more robust logic for tokenization
            # ie for prgramming languages we want to exclude string and numeric literals, names (field, method, variables) etc and may be leave types, operands,  braces, brackets, spaces and reserved words? 
            message = remove_comments(message)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = pd.concat([data, dataFrameFromDirectory(cpp_path, "cpp")])
data = pd.concat([data, dataFrameFromDirectory(java_path, "java")])
data = pd.concat([data, dataFrameFromDirectory(js_path, "javascript")])
data = pd.concat([data, dataFrameFromDirectory(py_path, "python")])
data = pd.concat([data, dataFrameFromDirectory(scala_path, "scala")])
data = pd.concat([data, dataFrameFromDirectory(text_path, "text")])

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


Let's have a look at that DataFrame:

In [96]:
train_data.head()

Unnamed: 0,message,class
source/java/StreamObjectHeaderEnd8bit.java,package org.apache.tika.parser.microsoft.oneno...,java
source/java/OpenDocumentBodyHandler.java,import static org.apache.tika.sax.XHTMLContent...,java
source/plaintext/huge_5560.txt,=American Foreign Relations=\n\n\n\n I. Co...,text
source/python/plot_permutation_importance.py,"In this example, we will compare the impurity-...",python
source/plaintext/huge_2069.txt,"In the fresh morning air were now heard, not t...",text


In [97]:
test_data.head()

Unnamed: 0,message,class
source/plaintext/huge_2698.txt,"\n\n""And who could it be who was her confedera...",text
source/java/AutoDetectParserTest.java,import static java.nio.charset.StandardCharset...,java
source/plaintext/huge_2854.txt,\n\n=The Triumph of Industry.=--The wreck of t...,text
source/plaintext/huge_2741.txt,_The Acts against Manufactures._--The second g...,text
source/plaintext/huge_3643.txt,An adjutant galloped up from the fleches with ...,text


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained source code detector ready to go! It's just that easy.

In [126]:

# Custom tokenizer that tokenizes based on braces, brackets, and spaces. Those are the most common programming characters, we want to count them as separate tokens.
programming_chars = ['(', ')','{', '}', '[', ']', ' ', '=', '+', '-', '*', '/', '#', '!', '^', '?', '"', "'", ';', '.']
def custom_tokenizer(text):
    tokens = []
    current_token = ""
    for char in text:
        if char in programming_chars:
            if current_token:
                tokens.append(current_token)
            tokens.append(char)
            current_token = ""
        else:
            current_token += char
    if current_token and len(current_token.strip()) > 0:
        tokens.append(current_token)
    return tokens

In [127]:
vectorizer = CountVectorizer(tokenizer=custom_tokenizer)
#vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(train_data['message'].values)

classifier = MultinomialNB()
targets = train_data['class'].values
classifier.fit(counts, targets)



In [128]:
# Example text
# text = " Math.round(Math.random() * 250)"
# 
# # Fit and transform using the custom tokenizer
# X = vectorizer.fit_transform([text])
# 
# # Resulting vocabulary and matrix
# print(vectorizer.get_feature_names_out())
# print(X.toarray())

Let's try it out:

In [129]:
test_messages = vectorizer.transform(test_data['message'].values)
predictions = classifier.predict(test_messages)

And check the accuracy:

In [130]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

true_labels = test_data['class'].values
accuracy = accuracy_score(true_labels, predictions)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1-score
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Create a confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9952558454761098
Precision: 0.995218550678817
Recall: 0.9952558454761098
F1-Score: 0.9951118386216987
Confusion Matrix:
 [[  60    0    0    1    0    0]
 [   0  323    0    0    0    0]
 [   0    0   30    0    2    6]
 [   0    0    0  160    0    0]
 [   0    0    0    0  143    1]
 [   0    0    1    1    2 2221]]


In [131]:
examples = [
    # scala
    '''package org.threeten.bp

import java.util.NavigableMap
import org.threeten.bp.zone.ZoneMap

object Platform {
type NPE = NullPointerException
type DFE = IndexOutOfBoundsException
type CCE = ClassCastException

/**
* Returns `true` if and only if the code is executing on a JVM. Note: Returns `false` when
* executing on any JS VM.
*/
final val executingInJVM = true

def setupLocales(): Unit = {}

def zoneMap(m: scala.collection.immutable.TreeMap[Int, String]): NavigableMap[Int, String] =
ZoneMap(m)
}''',
    # java
    ''' public static void run() {

ProfileCredentialsProvider awsCredentialsProvider = ProfileCredentialsProvider.create();

CLIENT = TextractClient.builder()
        .region(region)
        .credentialsProvider(awsCredentialsProvider)
        .build();

String absolutePath = getAbsolutePath();
CATEGORIES.forEach(category -> {

    String path = absolutePath + DLMTR + DATA_ROOT + DLMTR + category;
    Set<Path> ocrFiles = getOcrFiles(path);
    System.out.println(path + ": Found image files: " + ocrFiles);''',
    # python
    '''class Polygon:
    def sides_no(self):
        pass

class Triangle(Polygon):
    def area(self):
        pass

obj_polygon = Polygon()
obj_triangle = Triangle()

print(type(obj_triangle) == Triangle)   	# true
print(type(obj_triangle) == Polygon)    	# false

print(isinstance(obj_polygon, Polygon)) 	# true
print(isinstance(obj_triangle, Polygon))	# true''',

    # cpp
    '''#include <iostream>
    #include <iostream>
using namespace std;

int main() {
int n;

cout << "Enter an integer: ";
cin >> n;

if ( n % 2 == 0)
cout << n << " is even.";
else
cout << n << " is odd.";

return 0;
}''',

# javascript
'''
console.log("Hello World");

var canvas = document.getElementById("canvas");
var c = canvas.getContext("2d");
var tx = window.innerWidth;
var ty = window.innerHeight;
canvas.width = tx;
canvas.height = ty;
//c.lineWidth= 5;
//c.globalAlpha = 0.5;

var mousex = 0;
var mousey = 0;

addEventListener("mousemove", function() {
  mousex = event.clientX;
  mousey = event.clientY;
});


var grav = 0.99;
c.strokeWidth=5;
function randomColor() {
  return (
    "rgba(" +
    Math.round(Math.random() * 250) +
    "," +
    Math.round(Math.random() * 250) +
    "," +
    Math.round(Math.random() * 250) +
    "," +
    Math.ceil(Math.random() * 10) / 10 +
    ")"
  );
}


''',
# plain text
            '''World War II or the Second World War, often abbreviated as WWII or WW2, was a global conflict lasted from 1939 to 1945. The vast majority of the world's countries, including all of the great powers, fought as part of two opposing military alliances: the Allies and the Axis. Many participants threw their economic, industrial, and scientific capabilities behind this total war, blurring the distinction between civilian and military resources. Aircraft played a major role, enabling the strategic bombing of population centres and the delivery of the only two nuclear weapons ever used in war. World War II was by far the deadliest conflict in history, resulting in an estimated 70 to 85 million fatalities, mostly among civilians. Tens of millions died due to genocides (including the Holocaust), starvation, massacres, and disease. In the wake of the Axis defeat, Germany and Japan were occupied, and war crimes tribunals were conducted against German and Japanese leaders.'''
            ]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['scala', 'java', 'python', 'cpp', 'java', 'text'], dtype='<U10')

In [137]:
e = [
    # some text mixed with code
    # python
    '''
    Python Scope
A variable is only available from inside the region it is created. This is called scope.

Local Scope
A variable created inside a function belongs to the local scope of that function, and can only be used inside that function.

ExampleGet your own Python Server
A variable created inside a function is available inside that function:

def myfunc():
  x = 300
  print(x)

myfunc()
Function Inside Function
As explained in the example above, the variable x is not available outside the function, but it is available for any function inside the function:
    ''', 
    #java
'''
Read a File
In the previous chapter, you learned how to create and write to a file.

In the following example, we use the Scanner class to read the contents of the text file we created in the previous chapter:

ExampleGet your own Java Server
import java.io.File;  // Import the File class
import java.io.FileNotFoundException;  // Import this class to handle errors
import java.util.Scanner; // Import the Scanner class to read text files

public class ReadFile {
  public static void main(String[] args) {
    try {
      File myObj = new File("filename.txt");
      Scanner myReader = new Scanner(myObj);
      while (myReader.hasNextLine()) {
        String data = myReader.nextLine();
        System.out.println(data);
      }
      myReader.close();
    } catch (FileNotFoundException e) {
      System.out.println("An error occurred.");
      e.printStackTrace();
    }
  }
}
The output will be:

Files in Java might be tricky, but it is fun enough!
''', 
    #javascript
'''Example
Errors Will Happen!
When executing JavaScript code, different errors can occur.

Errors can be coding errors made by the programmer, errors due to wrong input, and other unforeseeable things.

Example
In this example we misspelled "alert" as "adddlert" to deliberately produce an error:

let historybutton = document.getElementById('historybutton');
let history = document.getElementById('history');
let bar1 = document.getElementById('bar1');
let bar2 = document.getElementById('bar2');
let dis=document.getElementById('answer');

function showHistory() {
    let calcHistory = JSON.parse(localStorage.getItem("calcHistory")) || [];
    let len = calcHistory.length;

    history.innerHTML = '';


    bar1.style.display = 'block';
    bar2.style.display = 'block';
''', 
# text 
    '''In the realm of probabilities, where mysteries reside,
Lies a theorem elegant, with secrets to confide.
It's the wisdom of Bayes, a noble guide,
To discern truth's path, where uncertainties hide.

In the heart of data, where chaos takes its toll,
Bayes' theorem unfolds, like a story to extol.
It starts with a prior, a belief from days of old,
Then updates with evidence, as the story's told.

Posterior probabilities, the theorem does reveal,
They're the answers we seek, with a truth to seal.
With each new observation, the truth becomes ideal,
Bayesian inference, a powerful, robust ordeal.

Conditional probabilities, like threads in a weave,
Bayes' theorem connects them, so we may perceive,
How beliefs evolve, as new data does achieve,
A rational framework, for our minds to believe.

In medical diagnosis or in weather's forecast,
In spam email detection, or stock market amassed,
Bayes' theorem shines, as a beacon unsurpassed,
Guiding us through uncertainty, from first to the last.

So let us raise a toast to Bayes' timeless grace,
In the world of probabilities, it finds its place.
With evidence and reason, it helps us embrace,
The art of making decisions in this complex space.

In the realm of numbers, where doubt may appear,
Bayes' theorem stands strong, removing our fear.
With its Bayesian insight, crystal clear,
We navigate life's uncertainties, drawing near.''', 
    'hello world',
    'we use scala to write code', 
    'python is a great programming language',
   ]

e2c = vectorizer.transform(e)
p = classifier.predict(e2c)
p

array(['python', 'java', 'java', 'text', 'scala', 'scala', 'text'],
      dtype='<U10')