# Pre-processing the NICT JLE dataset

## TODO: explain the regular expressions (the patterns used to match the tags)

In [6]:
import re
from os import listdir

def generateCleanFile(filename):
    """Extract the content of every ``<B>`` line of a transcript.

    Reads ``NICT_JLE_4.1/LearnerOriginal/<filename>`` and writes
    ``preProcessedData/<filename>``. For each line that contains ``<B>``,
    the first 3 characters (the ``<B>`` marker) and the last 4 characters
    (the ``</B>`` marker) are dropped. The extracted pieces are written
    back to back, without newlines, so the output file is a single line.

    Args:
        filename: Name of the transcript file, relative to both directories.

    Raises:
        OSError: If the input file cannot be read or the output file
            cannot be created (e.g. ``preProcessedData/`` does not exist).
    """
    source_path = "NICT_JLE_4.1/LearnerOriginal/" + filename
    target_path = "preProcessedData/" + filename
    # Context managers guarantee both handles are closed even on error.
    with open(source_path, "r", encoding="latin1") as source, \
            open(target_path, "w", encoding="latin1") as target:
        for line in source:
            if '<B>' in line:
                # Strip the newline first so that a final line without a
                # trailing "\n" does not lose a real character to the
                # fixed-width slice.
                text = line[:-1] if line.endswith("\n") else line
                target.write(text[3:-4])

def generateCleanline(filename):
    """Strip every tag (and the text it encloses) from a pre-processed file.

    Reads ``preProcessedData/<filename>`` line by line, passes each line
    through ``removeAllInternalTags`` and writes the lower-cased result to
    ``preProcessedDataLine/<filename>``.

    Args:
        filename: Name of the file, relative to both directories.

    Raises:
        OSError: If the input file cannot be read or the output file
            cannot be created.
    """
    source_path = "preProcessedData/" + filename
    target_path = "preProcessedDataLine/" + filename
    # Context managers guarantee both handles are closed even on error.
    with open(source_path, "r", encoding="latin1") as source, \
            open(target_path, "w", encoding="latin1") as target:
        for line in source:
            content = removeAllInternalTags(line)
            target.write(content.lower())


def removeAllInternalTags(line):
    """Remove every tag pair together with the text it encloses.

    A tag is anything matching ``<...>``; a closing tag is a tag whose
    second character is ``/``. The line is processed repeatedly: the first
    closing tag is located and everything from the opening tag immediately
    before it up to the end of the closing tag is deleted. Tag names are
    not compared, so a closing tag is always paired with the nearest
    preceding opening tag. Nested pairs are therefore removed inside-out.

    Unbalanced markup no longer raises ``IndexError``: a closing tag with
    no opening tag before it, or opening tags that are never closed, lose
    only their markup and the surrounding text is kept.

    Args:
        line: The text to clean.

    Returns:
        The line without any ``<...>`` markup.
    """
    tag_pattern = re.compile(r'<(.*?)>')
    while True:
        # Re-scan after every deletion because all offsets shift.
        spans = [m.span() for m in tag_pattern.finditer(line)]
        if not spans:
            return line

        first_closer = next(
            (i for i, (start, _) in enumerate(spans) if line[start + 1] == '/'),
            None,
        )
        if first_closer is None:
            # Only unmatched opening tags remain: drop the markup itself.
            line = tag_pattern.sub('', line)
            continue

        closer_start, closer_end = spans[first_closer]
        if first_closer > 0:
            # Delete from the nearest preceding opening tag onwards.
            cut_from = spans[first_closer - 1][0]
        else:
            # Stray closing tag: there is nothing to pair it with.
            cut_from = closer_start
        line = line[:cut_from] + line[closer_end:]

def removeInternalTags(line, tags=()):
    """Remove only the selected tags (and their content) from a line.

    Intended to keep the significant tags that help predict the SST level
    of each participant while discarding the others.

    For every name in ``tags``, each ``<tag>...</tag>`` pair is deleted
    together with the text it encloses, then any remaining lone ``<tag>``
    or ``</tag>`` is deleted. An opening tag may carry attributes
    (``<tag attr="x">``). Tag names are matched literally, so names that
    contain regex metacharacters (for example ``.``) are safe. Finally,
    every run of whitespace is collapsed into a single space.

    Args:
        line: The text to clean.
        tags: Iterable of tag names to remove. Defaults to no tags.

    Returns:
        The cleaned line with whitespace runs collapsed.
    """
    for tag in tags:
        name = re.escape(tag)
        # Optional attributes after the tag name; the leading \s keeps
        # "F" from matching a longer tag name such as "FX".
        opener = "<" + name + r"(?:\s[^>]*)?>"
        doubleTagPattern = opener + "(.*?)</" + name + ">"
        singleTagPattern = "</?" + name + r"(?:\s[^>]*)?>"
        line = re.sub(doubleTagPattern, "", line)
        line = re.sub(singleTagPattern, "", line)
    return re.sub(r"\s+", " ", line)


# Every transcript of the learner corpus goes through both stages.
files = listdir("NICT_JLE_4.1/LearnerOriginal")

# Stage 1: keep only the content of the <B> lines.
for f in files:
    generateCleanFile(f)

# Stage 2: strip the remaining tags and lower-case the text.
for f in files:
    generateCleanline(f)


# Processing the dataset: extract features with the Bag of Words

Explanation: the bag-of-words (BoW) algorithm and stop words


A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:

    A vocabulary of known words.
    A measure of the presence of known words.

It is called a “bag” of words, because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document.

In [24]:
import nltk
from nltk.corpus import stopwords
from os import listdir
import numpy
import re

def createVocab():
    """Build the vocabulary of the whole pre-processed corpus.

    Tokenizes every file found in ``preProcessedDataLine/`` and collects
    the resulting words.

    Returns:
        A set containing every distinct token returned by ``tokenize``
        across all files.

    Raises:
        OSError: If the directory or one of its files cannot be read.
    """
    vocab = set()
    for filename in listdir("preProcessedDataLine/"):
        # The context manager closes the file even if tokenizing fails.
        with open("preProcessedDataLine/" + filename, "r", encoding="latin1") as f:
            vocab.update(tokenize(f))
    return vocab

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def word_extraction(sentence):
    """Split a sentence into lower-cased words, dropping stop words.

    Every non-word character is replaced by a space, the text is
    lower-cased and split on whitespace, and English stop words are
    removed. Lower-casing happens before the stop-word test so that
    capitalised stop words such as "The" are filtered as well.

    Args:
        sentence: The text to split.

    Returns:
        A list of lower-cased words in their original order, duplicates
        included.
    """
    ignore = _english_stopwords()
    words = re.sub(r"[^\w]", " ", sentence).lower().split()
    return [w for w in words if w not in ignore]

def tokenize(sentences):
    """Return the sorted list of distinct words found in ``sentences``.

    Args:
        sentences: Iterable of strings (for example an open text file,
            which yields its lines).

    Returns:
        A sorted list of the unique words produced by ``word_extraction``
        over all sentences; an empty list if there are none.
    """
    unique_words = set()
    for sentence in sentences:
        unique_words.update(word_extraction(sentence))
    # Sort once at the end instead of after every sentence.
    return sorted(unique_words)

def generate_bow(filename, vocab):
    """Compute the bag-of-words count vector of one pre-processed file.

    Counts are accumulated over every line of
    ``preProcessedDataLine/<filename>``.

    Args:
        filename: Name of the file inside ``preProcessedDataLine/``.
        vocab: The known words. A ``set`` or ``frozenset`` is sorted first
            so that the column order is the same on every run; any other
            iterable keeps its own order.

    Returns:
        A numpy array with one entry per vocabulary position, holding the
        number of occurrences of that word in the file. An empty file
        yields an all-zero vector.

    Raises:
        OSError: If the file cannot be read.
    """
    # Set iteration order for strings can differ between interpreter runs
    # (hash randomization), which would shuffle the feature columns.
    if isinstance(vocab, (set, frozenset)):
        ordered_vocab = sorted(vocab)
    else:
        ordered_vocab = list(vocab)

    # Map each word to all of its positions so a lookup replaces the scan
    # of the whole vocabulary; a repeated word is still counted at every
    # position it occupies.
    positions = {}
    for i, word in enumerate(ordered_vocab):
        positions.setdefault(word, []).append(i)

    # Created once, before the loop, so counts accumulate across lines.
    vector = numpy.zeros(len(ordered_vocab))
    with open("preProcessedDataLine/" + filename, "r", encoding="latin1") as f:
        for line in f:
            for w in word_extraction(line):
                for i in positions.get(w, ()):
                    vector[i] += 1
    return vector

    

# Build the corpus vocabulary, then show the BoW vector of one transcript.
vocab = createVocab()
sample_bow = generate_bow("file01281.txt", vocab)
print(sample_bow)


# TODO : save the BoW output in a variable as our input for the classifier 



[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

# Training the classifier to Predict SST

# Accuracy and Confusion matrix