# Brief introduction to NLTK

NLTK (the Natural Language Toolkit) is a collection of Python libraries for processing text. It provides tokenizers, word stemmers and lemmatizers, part-of-speech taggers, and many other tools.

* **Website**: https://www.nltk.org/ 
* **Free book**: Steven Bird, Ewan Klein, and Edward Loper. *Natural Language Processing with Python -- Analyzing Text with the Natural Language Toolkit*. https://www.nltk.org/book/

In [1]:
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

In [2]:
# Fetch the NLTK data packages this notebook asks for.
# This only needs to be run once; on later runs NLTK just reports that each
# package is already up-to-date.
# Stopword lists, read by stopwords.words('english').
nltk.download('stopwords')
# NOTE(review): presumably the model data behind word_tokenize/sent_tokenize -- confirm.
nltk.download('punkt')
# NOTE(review): presumably the WordNet data needed by WordNetLemmatizer -- confirm.
nltk.download('wordnet')
# NOTE(review): presumably the tagger model used by nltk.pos_tag -- confirm.
nltk.download('averaged_perceptron_tagger')
# NOTE(review): not referenced by any code visible in this notebook excerpt.
nltk.download('tagsets')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nimda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nimda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nimda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nimda\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\nimda\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

### Some helpful functions

In [35]:
# NLTK's English stopword list, loaded once here and used as the default
# `stop` argument of cleanup().
stop_default = stopwords.words('english')

def read_file(f, lower=True, encoding=None):
    """Read a whole text file into a single string.

    Args:
        f: Path of the file to read (anything ``open`` accepts).
        lower: When true (the default), return the text lower-cased.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, which is what this
            function always used; pass e.g. ``"utf-8"`` for files that are
            not stored in the platform encoding.

    Returns:
        The file's contents, lower-cased if ``lower`` is true.
    """
    with open(f, encoding=encoding) as reader:
        text = reader.read()
    return text.lower() if lower else text

def get_tokens(text):
    """Split text on whitespace and return the tokens after cleanup()."""
    return cleanup(text.split())
    
def cleanup(tokens, nopunct=True, nowhite=True, nonum=True, stop=stop_default):
    """Strip unwanted characters from each token, then drop stopwords.

    Each flag adds one character class to the set deleted from every token:
    ``nopunct`` -> ``string.punctuation``, ``nonum`` -> the ASCII digits,
    ``nowhite`` -> ``string.whitespace``. Tokens left empty after stripping
    are discarded. When all three flags are false the tokens are passed on
    untouched. Finally the tokens found in ``stop`` are removed.
    """
    unwanted = "".join(
        chars
        for enabled, chars in (
            (nopunct, string.punctuation),
            (nonum, "0123456789"),
            (nowhite, string.whitespace),
        )
        if enabled
    )
    if unwanted:
        # One translation table deletes every unwanted character in one pass.
        table = str.maketrans("", "", unwanted)
        stripped = (tok.translate(table).strip() for tok in tokens)
        tokens = [tok for tok in stripped if tok]
    return remove_tokens(tokens, stop)

def remove_tokens(tokens, stopwords):
    """Return the tokens that are not stopwords, in their original order.

    Args:
        tokens: Iterable of tokens (strings) to filter.
        stopwords: The tokens to drop. Any iterable works, including a
            one-shot iterator; it is consumed exactly once.

    Returns:
        A new list holding every token that is not in ``stopwords``.
    """
    # Sets, dicts and strings already support a single cheap `in` test and are
    # used as-is; anything else (typically a list) is converted once so the
    # membership check is O(1) per token and a generator is not exhausted
    # after the first lookup.
    if isinstance(stopwords, (set, frozenset, dict, str)):
        excluded = stopwords
    else:
        excluded = frozenset(stopwords)
    return [t for t in tokens if t not in excluded]

Compare this to our earlier method of tokenization.

## NLTK Functions

In [15]:
def word_tokenization_2(sent):
    """Tokenize text with NLTK's word_tokenize and print every token.

    Each token is printed on its own line wrapped in angle brackets, which
    makes the token boundaries easy to see.
    """
    for token in word_tokenize(sent):
        print(f'<{token}>')
            
def stem(tokens):
    """Return the Porter stem of every token, in input order."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def lemmatizer(tokens, pos=('n', 'v', 'a', 'r')):
    """Print the WordNet lemma of every token, once per part-of-speech tag.

    For each tag in ``pos`` the tag and a dashed rule are printed, followed by
    one tab-separated line per token: a marker column (``X`` when the lemma
    differs from the token, empty otherwise), the token, ``->`` and the lemma.

    Args:
        tokens: Iterable of word strings to lemmatize.
        pos: Part-of-speech tags passed one at a time to
            ``WordNetLemmatizer.lemmatize``. The default is a tuple rather
            than a list so the shared default object cannot be mutated.

    Returns:
        None. The result is only printed.
    """
    lem = WordNetLemmatizer()
    # Materialise once so a one-shot iterable is still available for every tag.
    tokens = list(tokens)
    for tag in pos:
        print(tag)
        print('-' * 30)
        for tok in tokens:
            lemma = lem.lemmatize(tok, pos=tag)
            marker = '' if tok == lemma else 'X'
            print(f'{marker}\t{tok}\t->\t{lemma}')
            
def pos(tokens, as_string=False):
    """Tag tokens with nltk.pos_tag.

    Returns the tagger's output unchanged by default. With ``as_string`` set,
    each tagged pair is joined with a double underscore into a single string
    and a list of those strings is returned instead.
    """
    tagged = nltk.pos_tag(tokens)
    if not as_string:
        return tagged
    return ['__'.join(pair) for pair in tagged]

In [36]:
import numpy as np
import pandas as pd
from pathlib import Path

def process_file(file, n=20, lower=True, tokenizer=None):
    """Read a text file and cut it into consecutive chunks of ``n`` tokens.

    Args:
        file: Path of the text file, passed to read_file().
        n: Number of tokens per chunk; must be a positive integer.
        lower: Passed to read_file(); lower-cases the text when true.
        tokenizer: Optional callable mapping the text to a sequence of string
            tokens. When omitted (or falsy), get_tokens() is used.

    Returns:
        A list of strings, each made of ``n`` consecutive tokens joined by
        single spaces. Trailing tokens that do not fill a whole chunk are
        dropped, so the list is empty when the file has fewer than ``n``
        tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    text = read_file(file, lower=lower)

    tokenize = tokenizer or get_tokens
    tokens = tokenize(text)

    # Only whole chunks are kept: round the token count down to a multiple of n.
    usable = (len(tokens) // n) * n
    return [' '.join(tokens[start:start + n]) for start in range(0, usable, n)]

def process_files(files, n = 50):
    """Build one DataFrame of text chunks from several book files.

    ``files`` is a sequence of ``[file name, title, author]`` entries. Each
    file name is resolved against the module-level ``working_dir`` and split
    into chunks of ``n`` tokens by process_file(). The result has one row per
    chunk with the columns ``text``, ``author`` and ``title``; the per-book
    frames are concatenated in the order given.
    """
    def book_frame(filename, title, author):
        # One frame per book: the chunks plus constant author/title columns.
        chunks = process_file(working_dir/filename, n=n)
        frame = pd.DataFrame(chunks, columns=['text'])
        frame['author'] = author
        frame['title'] = title
        return frame

    frames = [book_frame(filename, title, author)
              for filename, title, author in files]
    return pd.concat(frames)

# Directory holding the raw book text files; process_files() resolves every
# file name below against it.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
working_dir = Path("C://Users//nimda//Desktop//CSCI 6380 Data Mining//hw//hw2//raw")

# Books used to fit the classifiers in naive_bayes().
# Each entry is [file name, title, author].
train_files =  [["house_of_the_7_gables.txt","The House of the Seven Gables","Nathaniel Hawthorne"],
            ["moby_dick.txt","Moby Dick","Herman Melville"],
            ["mosses.txt","Mosses from an Old Manse and Other Stories","Nathaniel Hawthorne"],
            ["piazza_tales.txt","The Piazza Tales","Herman Melville"],
            ["return_sherlock.txt","The Return of Sherlock Holmes","Arthur Conan Doyle"],
            ["scarlett_letter.txt","The Scarlet Letter","Nathaniel Hawthorne"],
            ["white_company.txt","The White Company","Arthur Conan Doyle"],
            ["white_jacket.txt","White Jacket","Herman Melville"]]

# Held-out books (one per author) used to score the classifiers in
# naive_bayes(); same [file name, title, author] layout as train_files.
test_files = [  ["baskervilles.txt","The Hound of the Baskervilles","Arthur Conan Doyle"],
                ["blithedale.txt","The Blithedale Romance","Nathaniel Hawthorne"],
                ["typee.txt","Typee","Herman Melville"]]

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import numpy as np

def naive_bayes(n = 25):
    """Compare two naive Bayes classifiers on author prediction.

    The books in ``train_files`` and ``test_files`` are cut into chunks of
    ``n`` tokens, the chunks are turned into word-count vectors with a
    CountVectorizer fitted on the training chunks, and a MultinomialNB and a
    BernoulliNB model are each fitted on the training chunks. The fraction of
    test chunks whose author each model predicts correctly is printed;
    nothing is returned.
    """
    train_df = process_files(train_files, n = n)
    test_df = process_files(test_files, n = n)

    train_authors = train_df['author']
    test_authors = test_df['author']

    # The vocabulary comes from the training text only; the test text is
    # merely transformed with it.
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_df['text'])
    test_counts = vectorizer.transform(test_df['text'])

    def accuracy(model):
        # Fit on the training chunks, then score on the held-out chunks.
        fitted = model.fit(train_counts, train_authors)
        return np.mean(fitted.predict(test_counts) == test_authors)

    mnb = accuracy(MultinomialNB())
    bnb = accuracy(BernoulliNB())

    print(f"N={n} MultinomialNB: {mnb}; BernoulliNB: {bnb}")

In [38]:
# Repeat the classifier comparison for several chunk sizes (tokens per text
# sample); each call prints one line of accuracies.
for n in [10,25,50,250,500,1000]:
    naive_bayes(n)

N=10 MultinomialNB: 0.6599951183793019; BernoulliNB: 0.6679684321861524
N=25 MultinomialNB: 0.7414038657171923; BernoulliNB: 0.7611393692777213
N=50 MultinomialNB: 0.8017908017908018; BernoulliNB: 0.833943833943834
N=250 MultinomialNB: 0.8938775510204081; BernoulliNB: 0.9346938775510204
N=500 MultinomialNB: 0.9139344262295082; BernoulliNB: 0.9590163934426229
N=1000 MultinomialNB: 0.9504132231404959; BernoulliNB: 0.9834710743801653
