## Raw data pre-processing 
Data was collected from 11 MOOCs in the fields of computer science, robotics, mathematics and physics; a total of 563 TXT files are processed.

#### Preprocessing steps: 

- sentence by sentence split
- lower case
- noise removal 
    - SOME punctuation STAYS 
        - point, single space and comma
    - SOME stopwords stay 
        - between,we,i,in,here,that,you,it,this,there,few,if,so,to,a,an,is,until,while
    - mention removal
- word normalization
    - tokenization 
    - lemmatization 
    - stemming 
- word standardization
    - regex

#### Overall logic:
1. Recursively traverse all folders and files
2. When a file is found, save its name into a file list
3. For each file in the file list, apply all the **preprocessing steps** and save the result as a new file in the same folder, with "\_PREPROCESSED" added at the end of the name

#### Next steps: 
1. Apply TF-IDF
2. Try Wikipedia linking
3. Try linking with WordNet
4. Try Bag of Words
5. Try other algorithms? 
6. Define a clear dictionary with words for each category
7. Other Classification algorithms?

In [281]:
# Import all necessary modules for EVERYTHING here

import math
from textblob import TextBlob as tb
import nltk
from nltk.corpus import wordnet as wn
from beautifultable import BeautifulTable
import os
import sys
import os.path
import string
import time

# ---- for TF-IDF & NLTK
import math
from textblob import TextBlob as tb
import nltk
from nltk.corpus import wordnet as wn
from beautifultable import BeautifulTable
#nltk.download('punkt')
#nltk.download('wordnet')
from pathlib import Path

import re, string, unicodedata
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from nltk import sent_tokenize, word_tokenize, pos_tag
# nltk.download('averaged_perceptron_tagger')
from tabulate import tabulate

In [290]:
# ------------------------------------------------ METHODS  --------------------------------------------------------

# Whitelist of single characters the pipeline keeps when stripping noise:
# the 26 lowercase ASCII letters plus comma, full stop and space.
# Despite its name the set is therefore not limited to punctuation.
allowedPunct = set(string.ascii_lowercase) | {",", ".", " "}

#stopWords = stopwords.words('english')
#print(stopWords)

# function #1
def splitSentences(iFile, oPathNoExt):
    """Rewrite a "_Raw.txt" file as one sentence per line.

    iFile      -- open, readable text file. Files whose name does not end in
                  "_Raw.txt" are ignored and nothing is written.
    oPathNoExt -- path of the input file without its extension. Everything
                  from the first ".en" onwards is replaced by ".en_sent.txt"
                  to build the output path.

    Returns None; the result is the file written next to the input.
    """
    if not iFile.name.endswith("_Raw.txt"):
        return

    print("[Splitting sentences on file:] " + iFile.name)
    sentPath = oPathNoExt.partition(".en")[0] + ".en_sent.txt"

    # the with-statement closes the output file for us, also on errors
    with open(sentPath, "w") as oFile:
        print(oFile.name)
        # the raw text is full of line breaks, so flatten them to spaces
        # before looking for sentence boundaries
        flatText = " ".join(iFile.read().split("\n"))
        oFile.writelines(sentence + "\n" for sentence in sent_tokenize(flatText))
    

# function #2 (optional)
# only leaves comma, space, dot and lowercase letters
# read input file, save to output file
def removePunctuation(iFile, oPathNoExt, allowed=None):
    """Copy a "_sent.txt" file, keeping only whitelisted characters.

    iFile      -- open, readable text file. Files whose name does not end in
                  "_sent.txt" are ignored and nothing is written.
    oPathNoExt -- path of the input file without its extension; the output is
                  written to oPathNoExt + "_noPunct.txt".
    allowed    -- optional container of single characters to keep. Defaults
                  to the module-level allowedPunct whitelist.

    Line breaks are always kept so the one-sentence-per-line layout of the
    input survives. Any other character that is not in `allowed` is dropped,
    which includes uppercase letters with the default whitelist, so the text
    should be lower-cased beforehand.

    Returns None.
    """
    # decide on the file that was actually passed in, like the other steps do,
    # instead of on whatever global path the caller happens to have set
    if not iFile.name.endswith("_sent.txt"):
        return
    if allowed is None:
        allowed = allowedPunct

    print("[Punctuation removal on file: ] " + iFile.name)
    with open(oPathNoExt + "_noPunct.txt", "w") as oFile:
        # filter character by character; iterating the file object itself
        # would yield whole lines, which never match a single-character set
        oFile.write("".join(ch for ch in iFile.read() if ch == "\n" or ch in allowed))
      
# function #3
def sentTokenize(iFile, oPathNoExt):
    """Split a "_sent.txt" file into tokens, one token per output line.

    iFile      -- open, readable text file. Files whose name does not end in
                  "_sent.txt" are ignored and nothing is written.
    oPathNoExt -- path of the input file without its extension. Everything
                  from the first ".en" onwards is replaced by
                  ".en_tokens.txt" to build the output path.

    Returns None.
    """
    if not iFile.name.endswith("_sent.txt"):
        return

    print("[Tokenizing file: ] " + iFile.name)
    tokenPath = oPathNoExt.partition(".en")[0] + ".en_tokens.txt"

    with open(tokenPath, "w") as oFile:
        # the whole file is tokenized in one go
        words = word_tokenize(iFile.read())
        oFile.writelines(word + "\n" for word in words)
    

# function #4 - part-of-speech tagging
def POStag(iFile, oPathNoExt):
    """Tag the tokens of a "_tokens.txt" file with their part of speech.

    iFile      -- open, readable text file holding whitespace-separated
                  tokens. Files whose name does not end in "_tokens.txt" are
                  ignored and nothing is written.
    oPathNoExt -- path of the input file without its extension. Everything
                  from the first ".en" onwards is replaced by
                  ".en_tokPOStag.txt" to build the output path.

    Every kept (token, tag) pair is written on its own line in its str()
    form. Returns None.
    """
    if not iFile.name.endswith("_tokens.txt"):
        return

    print("[Part of Speech tagging: ] " + iFile.name)
    taggedPath = oPathNoExt.partition(".en")[0] + ".en_tokPOStag.txt"
    words = iFile.read().split()

    # tokens that are left out of the output: comma, full stop and the two
    # square brackets, plus every single lowercase letter
    # NOTE(review): because the set also holds the lowercase alphabet,
    # one-letter words such as "a" and "i" are dropped as well - confirm
    # that this is intended and not just punctuation filtering
    skipped = set(string.ascii_lowercase) | {",", ".", "[", "]"}

    with open(taggedPath, "w") as oFile:
        for pair in pos_tag(words):
            # pair is (token, tag); only the token decides whether it is kept
            if pair[0] not in skipped:
                oFile.write(str(pair) + "\n")

In [299]:
# ----------------------------------------------- PROGRAM  -------------------------------------------------------

# LOCATION OF TEST FILES: /media/sf_Shared_Folder/TEST/RAW
# path = "/media/sf_Shared_Folder/Courses/Coursera Downloads Processed"
# path = "/media/sf_Shared_Folder/TEST/RAW"
path = "/media/sf_Shared_Folder/Coursera Downloads PreProcessed"

# number of TXT files that were handed to the processing functions
counter = 0

# os.walk already separates plain files from sub-directories, so the file
# names it yields can be used directly
for root, subdirs, files in os.walk(path):

    for fileName in files:

        # only TXT files are processed, everything else is disregarded
        if not fileName.endswith(".txt"):
            continue

        filePath = os.path.join(root, fileName)
        counter += 1

        # absolute path without the ".txt" extension; the processing functions
        # derive their output file names (same folder, different ending) from it
        fileExtRemoved = os.path.splitext(os.path.abspath(filePath))[0]

        # IMPORTANT ENCODING! UTF8 DOESN'T WORK
        # The with-statement closes the file even when a step fails, and a
        # failing open() is reported as such instead of being masked by a
        # cleanup call on a handle that never existed.
        with open(filePath, "r", encoding="ISO-8859-1") as curFile:
            # Call each processing function here and pass it the file.
            # First argument: current input file
            # Second argument: path without extension of the current file
            # Each function checks the file name ending itself and only acts
            # on its own kind of file ("_Raw.txt", "_sent.txt", "_tokens.txt"),
            # so the stream is read at most once per file.
            # NOTE(review): files written during this pass are not part of the
            # listing that is being iterated, so every stage appears to need
            # its own run over the folder tree - confirm.
            #removePunctuation(curFile, fileExtRemoved)
            splitSentences(curFile, fileExtRemoved)
            sentTokenize(curFile, fileExtRemoved)
            POStag(curFile, fileExtRemoved)

print("\nTotal number of {} {} files found.".format(counter, "TXT"))


Total number of 556 TXT files found.


## TF-IDF implementation
### --------------------------------------------------------------------------------------------------------------------------------------

TF-IDF is implemented in order to check whether the terms extracted from LOs have anything in common with the terms that would be extracted by manual MOOC analysis, and to compare which of the two methods brings better results in the classification part.

Below is the main TF-IDF implementation without any text provided to it yet.

##### Term frequency
\\( tf(t,d) = 0.5 + 0.5 \cdot \frac{f_{t,d}}{\max\lbrace f_{t',d} : t' \in d \rbrace} \\) 

##### Inverse document frequency
\\( idf(t,D) = \log \frac{N}{\lvert \lbrace d \in D : t \in d \rbrace \rvert} \\)

##### Computing tf-idf
\\( tfidf(t,d,D) = tf(t,d) * idf(t,D) \\)

In [None]:
# doc is the text (blob) in which to look for the term
def tf(term, doc):
    """Return the term frequency of `term` in `doc`.

    term -- the word to count.
    doc  -- object exposing a `words` sequence (e.g. a TextBlob).

    The value is the plain ratio between the number of occurrences of `term`
    and the total number of words in the document. This is the raw relative
    frequency, not the augmented "0.5 + 0.5 * ..." form.

    An empty document yields 0.0 instead of a division by zero.
    """
    words = doc.words
    total = len(words)
    if total == 0:
        return 0.0
    return words.count(term) / total

def docsWithTermIn(term, doclist):
    """Return how many documents in `doclist` contain `term` in their words."""
    matches = 0
    for candidate in doclist:
        if term in candidate.words:
            matches += 1
    return matches

def idf(term, doclist):
    """Return the inverse document frequency of `term` over `doclist`.

    Computed as log(N / (1 + n)), where N is the number of documents and n
    the number of documents containing the term. The "+ 1" keeps the
    denominator non-zero for unseen terms. It also makes the result negative
    for a term that occurs in every document.

    Raises ValueError (math domain error) when `doclist` is empty.
    """
    corpusSize = len(doclist)
    smoothedHits = docsWithTermIn(term, doclist) + 1
    return math.log(corpusSize / smoothedHits)

def tfidf(term,doc,doclist):
    """Return the TF-IDF weight of `term` for `doc` within `doclist`.

    The weight is the product of the term frequency in `doc` and the inverse
    document frequency across all documents of `doclist`.
    """
    frequency = tf(term, doc)
    rarity = idf(term, doclist)
    return frequency * rarity

### Running TF-IDF with data

#### TODO: Fix the input, it takes strings, and not files right now

In [None]:
# Planned workflow (not implemented yet, the documents below are hard-coded):
#   1. traverse each folder and sub-folder
#   2. collect every TXT file in a list
#   3. read the contents of each collected file into a string, one string
#      per document
#   4. process each string by tokenization, lemmatization etc.
#   5. perform tf-idf on the documents

# Sample document 1, wrapped in a TextBlob (tb).
# Source according to the original note:
# 01-understanding-research-data/01_research-data-defined.en.txt
# NOTE(review): the excerpt talks about functions, not research data -
# confirm that the file name above is the right source.
document1 = tb("""[sound] so, who knows what a function is, right
but i know what it does
it takes an input value, and produces an output value
and we've got a whole bunch of functions, right
and we can take these functions and start asking questions about them
""")

# Sample document 2.
# Source according to the original note:
# 01-understanding-research-data/02_types-of-data-and-metadata.en.txt
# The fused words ("whereyou'll", "moduleand", "tothe") are part of the text
# as stored here and are kept unchanged.
document2 = tb("""
take a look at the further reading, whereyou'll find additional resources to learn more about file formats, compression,normalization, and data transformations
you may wish to move on to the next moduleand return to these references later
we recommend that you move on tothe module about documentation and data citation.""")

# Sample document 3.
# Source according to the original note:
# 01-understanding-research-data/03_research-data-lifecycle.en.txt
document3 = tb("""her like lego pieces in a big lego drawing
and, most importantly, are there traps
are there issues that arise because of these switches that we don't fully know how to deal with
now, this module will deal with all of this, """)




# ------------------------------------ TF-IDF --------------------------------------------------------

# exportedList collects the top-ranked terms of every document, in ranking order.
# ownList is a hand-picked set of (partly domain-specific) terms used to test
# the WordNet lookup independently of the extracted terms.
exportedList = []
ownList = {"data management","database","example","iot","lifecycle","bloom","filter","integrity",
           "java","pattern","design pattern","svm","Support vector machine","knn","k-nearest neighbors","machine learning"}

doclist = [document1, document2, document3]
#doclist = [document4, document5, document6]
# display names, index-aligned with doclist
docnames = ["01_research-data-defined.en.txt","02_types-of-data-and-metadata.en.txt","03_research-data-lifecycle.en.txt"]
# how many of the best-scoring terms are shown and exported per document
topNwords = 15

for i, doc in enumerate(doclist):
    print("\nTop {} terms in document {} | {}".format(topNwords, i + 1, docnames[i]))
    # score every word of the document against the whole collection and
    # rank the words from the highest to the lowest TF-IDF weight
    scores = {term: tfidf(term, doc, doclist) for term in doc.words}
    sortedTerms = sorted(scores.items(),key=lambda x: x[1], reverse=True)

    # start a fresh table for every document; with one shared table the rows
    # of the previous documents would be printed again under each new heading
    table = BeautifulTable()
    table.column_headers = ["TERM", "TF-IDF"]

    for term, score in sortedTerms[:topNwords]:
        table.append_row([term, round(score, 5)])
        exportedList.append(term)

    print(table)



# ----------------------------------------- NLTK, WORDNET -------------------------------------------
def _printSynsets(words):
    """Print the WordNet synsets (name and definition) of every word in `words`.

    Words for which WordNet returns no synset are reported as "NO SYNSETS".
    """
    for word in words:
        # look the word up once and reuse the result for the test and the loop
        synsets = wn.synsets(word)
        if not synsets:
            print("\n", word, ": NO SYNSETS\n")
            continue
        print("\n", word)
        for ss in synsets:
            print("- ",ss.name()," | ",ss.definition())


print("\n\n------- EXPORTED TERMS in WORDNET ----------")
# a term can rank in several documents; dict.fromkeys drops the repeats while
# keeping the first-seen order, so every term is printed only once
_printSynsets(dict.fromkeys(exportedList))

print("\n\n------- CUSTOM TERMS in WORDNET (also domain specific) ----------")
# ownList is a set and would be visited in arbitrary order; sorting makes the
# output reproducible between runs
# NOTE(review): several custom terms contain spaces; WordNet lemma names use
# underscores for multi-word entries, so these may need converting - verify
_printSynsets(sorted(ownList))
    