# Importing Required Modules

In [53]:
import en_core_web_sm
import os
import pandas as pd
import random
import re
import spacy
import string 
import spacy
from spacy.util import minibatch

In [2]:
# Load spaCy's small English pipeline. preprocess() calls this module-level
# `nlp` object to tokenise text and read each token's stop-word flag and lemma.
nlp = spacy.load('en_core_web_sm')

### Creating a DataFrame from the files present in the provided path. It has two columns: "file_type", the type of the file (e.g. py, csv, txt, json), and "content", the actual data/content of the file. Change the value of the `path` variable to point at your own files.

In [13]:
def preprocess(text):
    """Tokenise ``text`` with the module-level ``nlp`` pipeline and return lemmas.

    A token contributes its lemma only when it is not flagged as a stop word,
    is not punctuation, is not a bare tab/newline, and consists purely of
    alphabetic characters.

    Args:
        text: Raw text to process.

    Returns:
        list: Lemma strings of the surviving tokens, in document order.
    """
    lemmas = []
    for token in nlp(text):
        surface = str(token)
        if token.is_stop:
            continue
        # Punctuation and whitespace tokens are rejected explicitly, mirroring
        # the filter order used for the training data.
        if surface in string.punctuation or surface in ("\t", "\n"):
            continue
        if surface.isalpha():
            lemmas.append(token.lemma_)
    return lemmas


In [15]:
# Walk the training directory and collect one (file_type, content) record per
# usable file. `file_data` is later turned into the training DataFrame.
path = "sample_files2/"
file_data = {"file_type": [], "content": []}
for root, _dirs, files in os.walk(path):
    for file_name in files:
        # Decide on the bare file name, not on the joined path, so that a dot
        # or the text "json" inside a directory name can neither change which
        # files are used nor leak into the recorded extension. This matches
        # how the prediction cell filters its test files.
        if "." not in file_name or "json" in file_name:
            continue

        file_path = os.path.join(root, file_name)
        # The context manager closes the handle even if read() raises.
        # ISO-8859-1 maps every byte to a character, so decoding cannot fail
        # on arbitrary source files.
        with open(file_path, encoding="ISO-8859-1") as data:
            content = data.read()
        content = ' '.join(preprocess(content))

        # The text after the last dot of the file name is the class label.
        type_of_file = file_name.split(".")[-1]
        file_data["file_type"].append(type_of_file)
        file_data["content"].append(content)

## Storing all the available file types in a list to create the label for our training model

In [16]:
# One row per training file: its extension and its preprocessed content.
data_df = pd.DataFrame(file_data)
# Distinct extensions, sorted so that the label order is the same on every
# run. Iterating a plain set of strings can change order between interpreter
# sessions (string hashing is randomised), which would undo the fixed random
# seeds used for training.
file_types = sorted(set(data_df["file_type"]))

In [17]:
# Show the distinct file extensions; these become the classifier's labels.
print(file_types)

In [18]:
# Preview the first rows of the training DataFrame (file_type, content).
data_df.head()

Unnamed: 0,file_type,content
0,java,import scanner public class JavaExample public...
1,java,import public class SimpleExample extend Frame...
2,java,import scanner public class BinarySearch publi...
3,java,import scanner public class JavaExample public...
4,java,public class deadlockdemo public void synchron...


## Building a Bag of Words model

In [19]:
# Create an empty model
# Load the pretrained small English pipeline. Note that this rebinds the
# global `nlp` that preprocess() also reads.
nlp = en_core_web_sm.load()

# Build a text categorizer with mutually exclusive classes and the
# bag-of-words ("bow") architecture.
textcat = nlp.create_pipe(
    "textcat",
    config=dict(exclusive_classes=True, architecture="bow"),
)

# Append the categorizer to the loaded pipeline.
nlp.add_pipe(textcat)

**Adding labels to our TextCategorizer. The labels are simply all the available file types.**

In [20]:
# Register every known file extension as a category of the text categorizer.
for file_type in file_types:
    textcat.add_label(file_type)



## Training a Text Categorizer Model

In [21]:
# Training inputs: the preprocessed content of every file.
train_texts = data_df["content"].values

# Training targets: for each file, a {"cats": {...}} dict that maps every
# known file type to True when it is that file's type and False otherwise.
train_labels = [
    {"cats": {candidate: actual == candidate for candidate in file_types}}
    for actual in data_df["file_type"]
]

In [22]:
# Inspect the first five label dicts to check the one-hot "cats" encoding.
print(train_labels[:5])

**Combine the texts and labels into a single list.**

In [23]:
# Pair every text with its label dict; the training loop shuffles and batches
# this list of (text, labels) tuples.
train_data = [(text, cats) for text, cats in zip(train_texts, train_labels)]

In [24]:
# Inspect the first (text, labels) training pair.
print(train_data[:1])

## Looping over several epochs, re-shuffling the training data at the beginning of each epoch.

In [26]:
# Seed Python's RNG (used by random.shuffle) and spaCy's RNGs so that a run is
# repeatable.
random.seed(1)
spacy.util.fix_random_seed(1)

# Train only the text categorizer. The pipeline was loaded from a pretrained
# model, so its other components are disabled for the duration of training:
# otherwise begin_training() would initialise them as well and update() would
# try to train them on annotations that contain nothing but "cats".
other_pipes = [pipe_name for pipe_name in nlp.pipe_names if pipe_name != "textcat"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for epoch in range(10):
        # Fresh dict per epoch so the printed value is this epoch's loss and
        # not a running total over all epochs so far.
        losses = {}
        random.shuffle(train_data)
        # Iterate through minibatches of 8 (text, labels) pairs.
        for batch in minibatch(train_data, size=8):
            # update() wants separate sequences of texts and labels, so
            # un-zip the list of pairs.
            texts, labels = zip(*batch)
            nlp.update(texts, labels, sgd=optimizer, losses=losses)
        print(losses)

## Making Predictions. Set the `paths` variable to the directories that contain the test files.

In [28]:
import sklearn

In [51]:
# Directories whose files will be classified.
paths = [r"test_files//"]

expected_file_data = {"file_type": [], "content": []}
# Names of the files that were actually loaded, in the same order as the
# entries of expected_file_data. The comparison cell uses `files` as its
# "File Name" column, so it must line up one-to-one with the predictions;
# skipped entries and files from every directory in `paths` are handled here.
files = []

for path in paths:
    for file_name in sorted(os.listdir(path)):
        print(file_name)

        # Skip entries without an extension and anything with "json" in its name.
        if "." not in file_name or "json" in file_name:
            continue

        # The context manager closes the handle even if read() raises;
        # os.path.join works whether or not `path` ends with a separator.
        with open(os.path.join(path, file_name), encoding="ISO-8859-1") as data:
            content = data.read()
        # NOTE(review): the training text was passed through preprocess()
        # (lemmas, stop words removed), whereas test text only has its
        # newlines replaced. Confirm this train/test mismatch is intended.
        content = content.replace("\n", " ")

        type_of_file = file_name.split(".")[-1]
        # Notebooks are expected to be classified as Python.
        if type_of_file == "ipynb":
            type_of_file = "py"
        expected_file_data["file_type"].append(type_of_file.strip())
        expected_file_data["content"].append(content)
        files.append(file_name)

texts = expected_file_data["content"]
# Only tokenise: the docs are handed straight to the text categorizer.
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _unused = textcat.predict(docs)

print(scores)

## Comparing Predictions

In [52]:
# From the scores, find the label with the highest score/probability
# For every document take the column with the highest score and translate
# that index into the categorizer's label string.
predicted_labels = scores.argmax(axis=1)
predicted = [textcat.labels[index] for index in predicted_labels]

# Side-by-side table of file name, predicted type and expected type.
result = pd.DataFrame(
    {
        "File Name": files,
        "Predicted file type": predicted,
        "Expected file type": expected_file_data["file_type"],
    }
)
result

Unnamed: 0,File Name,Predicted file type,Expected file type
0,SARS-prediction-with-B-cell-data.ipynb,py,py
1,bubble_Sort.java,java,java
2,converting_to_sentence.py,py,py
