# <center>NLP for Topic Modeling </center>
---
## Noun Extraction: 
Each function below takes a folder name. <br />
It returns one list per file in that folder, containing only the nouns found in that file (a bag of words). <br />
* PARAM: foldername : str
* RETURNS: res : list[list[string]]

1. nltk
2. spaCy
3. pattern
4. almawadie


In [1]:
import os
from glob import glob

def get_file_names(foldername):
    """Return the relative paths of the ``.txt`` files directly inside a folder.

    The folder is resolved against the current working directory.

    Args:
        foldername: Folder to scan, relative to the current working directory.

    Returns:
        A sorted list of ``<foldername><sep><file>.txt`` strings, where
        ``<sep>`` is the platform's path separator. The list is empty when the
        folder does not exist or holds no ``.txt`` files.
    """
    # os.path.join picks the platform separator; a hard-coded "\\" only
    # matches anything on Windows.
    pattern = os.path.join(os.getcwd(), foldername, "*.txt")
    # glob yields files in arbitrary order; sorting keeps document indices
    # stable between calls so every extractor sees the same file at index 0.
    return sorted(
        os.path.join(foldername, os.path.basename(path)) for path in glob(pattern)
    )

### nltk (natural language toolkit)

In [2]:
import nltk
def noun_extraction_nltk(foldername):
    """Extract nouns from every text file in a folder using nltk.

    Each file is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Only tokens whose tag is exactly ``'NN'`` are kept, so
    tokens tagged ``'NNS'``, ``'NNP'`` or ``'NNPS'`` are dropped.

    Args:
        foldername: Folder handed to ``get_file_names``.

    Returns:
        One list of noun strings per file, in the order the files are listed.
    """
    bags = []
    for path in get_file_names(foldername):
        with open(path) as handle:
            raw = handle.read()
        tagged = nltk.pos_tag(nltk.word_tokenize(raw))
        bags.append([word for word, pos in tagged if pos == 'NN'])
    return bags

### spaCy

In [3]:
import spacy
# Loaded once at cell level so repeated calls reuse the same pipeline.
nlp = spacy.load("en_core_web_lg")


def noun_extraction_spacy(foldername):
    """Extract nouns from every text file in a folder using spaCy.

    All files are read first and then streamed through ``nlp.pipe`` with the
    components that are not needed for part-of-speech tags switched off.
    Tokens whose coarse tag ``pos_`` equals ``"NOUN"`` are kept.

    Args:
        foldername: Folder handed to ``get_file_names``.

    Returns:
        One list of noun strings per file, in the order the files are listed.
    """
    documents = []
    for path in get_file_names(foldername):
        with open(path) as handle:
            documents.append(handle.read())

    skipped = ["ner", "lemmatizer", "parser", "senter"]
    parsed = nlp.pipe(documents, disable=skipped)
    return [[tok.text for tok in doc if tok.pos_ == "NOUN"] for doc in parsed]

### pattern (what gensim used to use)

In [4]:
from pattern.text.en import tag
def noun_extraction_pattern(foldername):
    """Extract nouns from every text file in a folder using pattern's tagger.

    Tokenization is done with ``nltk.word_tokenize`` (``nltk`` is imported in
    an earlier cell); the token list is then passed to pattern's ``tag`` with
    ``tokenize=False``. Every token whose tag starts with ``"NN"`` is kept.

    Args:
        foldername: Folder handed to ``get_file_names``.

    Returns:
        One list of noun strings per file, in the order the files are listed.
    """
    bags = []
    for path in get_file_names(foldername):
        with open(path) as handle:
            raw = handle.read()
        tagged = tag(nltk.word_tokenize(raw), tokenize=False)
        # item[0] is the token, item[1] its tag.
        bags.append([item[0] for item in tagged if item[1][:2] == "NN"])
    return bags
            

### almawadie (Arabic for topic)

In [5]:
import tools

def noun_extraction_almawadie(foldername):
    """Extract nouns from every text file in a folder using the ``tools`` module.

    The file paths are UTF-8 encoded to ``bytes`` and passed, together with
    the result of ``tools.read_lines()``, to ``tools.doc_to_list``.

    Args:
        foldername: Folder handed to ``get_file_names``.

    Returns:
        Whatever ``tools.doc_to_list`` returns; presumably one list of nouns
        per file, with items as ``bytes`` -- confirm against ``tools``.
    """
    # Presumably the noun vocabulary used for filtering -- confirm in tools.
    known_nouns = tools.read_lines()
    encoded_paths = [path.encode('utf-8') for path in get_file_names(foldername)]
    return tools.doc_to_list(encoded_paths, known_nouns)

## <center> Testing the Noun Extraction</center>

In [9]:
foldername = "scripts_small"

# Run every extractor on the same folder and print a small sample from the
# first document: [1:10] skips the first noun and shows the next nine.
extractors = (
    ("nltk: ", noun_extraction_nltk),
    ("spacy: ", noun_extraction_spacy),
    ("pattern: ", noun_extraction_pattern),
    ("almawadie: ", noun_extraction_almawadie),
)
for label, extract in extractors:
    print(label, extract(foldername)[0][1:10])

nltk:  ['school', 'head', 'building', 'eighteen', 'granny', 'dress', 'cup', 'coffee', 'backpack']
spacy:  ['Taming', 'Revision', 'school', 'Loners', 'Crowd', 'sleep', 'eyes', 'building', 'granny']
pattern:  ['Karen', 'McCullah', 'Lutz', 'amp', 'Kirsten', 'Smith', 'Shrew', 'William', 'Revision']
almawadie:  [b'lutz', b'amp', b'smith', b'william', b'shakespeare', b'revision', b'november', b'welcome', b'high']


In [11]:
# Time each extractor on the same folder. %timeit is an IPython magic, so this
# cell only runs inside IPython/Jupyter.
# Every timed call includes listing and reading the files from disk; the spaCy
# model is loaded in an earlier cell and is therefore not part of its timing.
%timeit noun_extraction_nltk(foldername)
%timeit noun_extraction_spacy(foldername)
%timeit noun_extraction_pattern(foldername)
%timeit noun_extraction_almawadie(foldername)

15.5 s ± 1.28 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
26.1 s ± 1.61 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
19.8 s ± 1.55 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
108 ms ± 27.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
