# CSCI6380 Data Mining: Assignment #2
### Adnan Kivanc Corut
#### September 24, 2020

## Importing Libraries

In [1]:
from pathlib import Path
import glob
import os
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

nltk.download('stopwords')

stops = stopwords.words('english') + list(string.punctuation) + list(string.digits)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kivanc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Functions

In [15]:
def clean_guthenberg(files):
    """
    This function tries to remove the generic 
    words at the start of the text files from 
    the Project Gutenberg ebooks.
    """    
    for f in files:
        lines = open(f, 'r').readlines()

        strings = ("***", "<<<<<<<<", "* * * * *")
        for i, line in enumerate(lines):
           if any(s in line for s in strings):
                break

        if i < len(lines) - 1:
            with open(f + '_edited', 'w') as f:
                f.write('\n'.join(lines[i + 1:]))


def get_input_list(file_paths):
    """
    This function takes a list of paths of files and 
    returns a 2d list that consists name of the text 
    file and the name of the author as output.
    """

    # Get names of the text files (novels)
    file_names=[]
    for f in file_paths:
        file_names.append(f.rsplit('/', 1)[1])

    # Get names of the author of the novel
    author_names=[]
    for f in file_paths:
        temp= f.rsplit('/', 1)[1]
        temp_2=temp.rsplit('-', 1)[0]
        author_names.append(temp_2.replace('-', ' '))
    
    # Using zip() function, aggregate "file names" and "author names" in a tuple, 
    # then convert it into a list and return
    out_list = [list(x) for x in zip(file_names, author_names)]
    return out_list
    print(type(out_list))


def read_tokenize_clean(in_file, clean=False):
    """
    This function takes a txt file as an input, 
    reads the file, lowercase all words, and 
    then tokenize them using `word_tokenize()`.
    Lastly, if argument `clean` is true it removes 
    stopwords, punctuation, and digits from tokens.
    """

    with open(in_file) as reader:
        # Read txt file and lowercase
        text = reader.read().lower()
        
        # Tokenization
        tokens = word_tokenize(text)
        
        if clean:
        # Remove stopwords, punctuation, and digits if clean=True
            clean_tokens=[]
            for w in tokens: 
                if w not in stops:
                    clean_tokens.append(w)
            return clean_tokens
        return tokens


def perform_stemming(tokens, Snowball=True):
    """
    This function performs stemming on tokens
    to reduce them to their root form. Two different
    stremmer can be selected.
    """

    ## Declare two different s
    p_stemmer = PorterStemmer()
    sb_stemmer = SnowballStemmer("english")

    ## Stemming
    stemmed_words=[]
    if not Snowball:
        for w in tokens:
            stemmed_words.append(p_stemmer.stem(w))
        return stemmed_words
    else:
        for w in tokens:
            stemmed_words.append(sb_stemmer.stem(w))
        return stemmed_words


def reshape_tokens(in_file, n=500, clean=False, stem=False):
    """
    This function takes a text file as an input, and first,
    reads the file, lowercase, tokenize the text. If clean=True 
    it removes stopwords, punctuation, and digits. Then,
    it splits tokens into `n` number of chunks using list 
    comprehension and stores them in a numpy array. Finally, 
    it converts array of tokens into string, and then join them.
    """

    # Read, tokenize and clean the file
    tokens = read_tokenize_clean(in_file, clean)

    # Stemming the tokens
    if stem:
        tokens = perform_stemming(tokens)

    # Split tokens list into n chunks usign list comprehension. 
    # Iterate through tokens list, add tokens to new array called "tokens_array" 
    # by skipping and incrementing n tokens at a time. 
    # Source: https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
    tokens_array = np.asarray([tokens[i:i + n] for i in range(0, len(tokens), n)])

    # Get number of rows of tokens_array
    n_rows = tokens_array.shape[0]

    # Create an empty numpy array
    out_df = np.empty(n_rows, dtype=object) 
    
    # Convert each row of tokens_array into string using `map()`, 
    # and join all the string values
    # Source: https://www.geeksforgeeks.org/python-program-to-convert-a-list-to-string/
    out_df = [' '.join(map(str, tokens_array[i])) for i in range(n_rows)]

    # Return the output
    return out_df  


def make_corpus(input_list, file_paths, n=500, clean=False, stem=False):
    """
    This function takes makes corpus data to feed 
    text classifiers. Takes list of input data with
    the file name and author name, and also takes
    list of paths of files as an argument. It returns
    a pandas data frame with 500 tokens in each row
    with corresponding author in the other column.
    """ 
    corpus = []

    # Iterate through input_list and file_paths.
    # Source: https://www.geeksforgeeks.org/python-iterate-multiple-lists-simultaneously/
    for ([file_names, author_names], path) in zip(input_list, file_paths):
        
        ## Get words as sequence of n tokens
        words = reshape_tokens(path, n, clean, stem)
        
        ## Create a dictionary with words and author names
        data = {'Words':words,
                'Author':author_names}
        
        ## Convert dictionary into a dataframe
        rows_to_add = pd.DataFrame.from_dict(data)
        
    ## Store each data frame in a list in each iteration 
    ## and merge lists of dataframes into a single big corpus.    
        corpus.append(rows_to_add)
    
    ## Return final data frame as corpus data
    corpus = pd.concat(corpus)
    
    return corpus

## Get Input Lists

In [16]:
training_dir = Path("/Users/kivanc/DataMining-ML/Data/text/training_files")
train_files_rgx = os.path.join(training_dir, '*.txt')
train_files_paths = glob.glob(train_files_rgx)

test_dir = Path("/Users/kivanc/DataMining-ML/Data/text/test_files")
test_files_rgx = os.path.join(test_dir, '*.txt')
test_files_paths = glob.glob(test_files_rgx)

training_files = get_input_list(train_files_paths)
test_files = get_input_list(test_files_paths)

## Testing Classifiers and Parameters

### Default Settings (n=500, clean=True, stem=True)

In [30]:
# Data Processing

train_novels = make_corpus(training_files, train_files_paths, n=500, clean=True, stem=True)
#print(train_novels)
test_novels = make_corpus(test_files, test_files_paths, n=500, clean=True, stem=True)
#print(test_novels)

## Check the number of words per row
#print(train_novels['Words'].str.split().str.len())

# Divide data into train data, train target and test data, test target
train_data = train_novels['Words']
train_target = train_novels['Author']
test_data= test_novels['Words']
test_target = test_novels['Author']

# Create a pipeline
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc = np.mean(predicted == test_target)

In [23]:
print('Default Settings (n=500, clean=True, stem=True):', end='\n * ')
print(f"MultinomialNB: {mnb_acc}, BernoulliNB: {bnb_acc}")
print()
print('Default Settings with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc}, BernoulliNB: {bnb_tfidf_acc}")
print()
print('Default Settings with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc}, BernoulliNB: {bnb_fpfalse_acc}")  

Default Settings (n=500, clean=True, stem=True):
 * MultinomialNB: 0.7384441939120632, BernoulliNB: 0.6910935738444194

Default Settings with TF-IDF:
 * MultinomialNB: 0.6031567080045096, BernoulliNB: 0.6910935738444194

Default Settings with fitPrior=False:
 * MultinomialNB: 0.7406989853438557, BernoulliNB: 0.6944757609921083


In [None]:
### No Cleanning and Stemming (n=500, clean=False, stem=False)

In [24]:
# Data Processing

train_novels_raw = make_corpus(training_files, train_files_paths, n=500, clean=False, stem=False)
#print(train_novels)
test_novels_raw = make_corpus(test_files, test_files_paths, n=500, clean=False, stem=False)
#print(test_novels)

# Divide data into train data, train target and test data, test target
train_data = train_novels_raw['Words']
train_target = train_novels_raw['Author']
test_data= test_novels_raw['Words']
test_target = test_novels_raw['Author']

# Create a pipeline
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc_raw = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc_raw = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc_raw = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc_raw = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc_raw = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc_raw = np.mean(predicted == test_target)

In [25]:
print('No Cleanning and Stemming (n=500, clean=False, stem=False):', end='\n * ')
print(f"MultinomialNB: {mnb_acc_raw }, BernoulliNB: {bnb_acc_raw }")
print()
print('No Cleanning and Stemming with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc_raw }, BernoulliNB: {bnb_tfidf_acc_raw }")
print()
print('No Cleanning and Stemming with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc_raw }, BernoulliNB: {bnb_fpfalse_acc_raw }")  

No Cleanning and Stemming (n=500, clean=False, stem=False):
 * MultinomialNB: 0.7551020408163265, BernoulliNB: 0.6908909905425585

No Cleanning and Stemming with TF-IDF:
 * MultinomialNB: 0.4659034345445495, BernoulliNB: 0.6908909905425585

No Cleanning and Stemming with fitPrior=False:
 * MultinomialNB: 0.7565953210552514, BernoulliNB: 0.6958685913389746


### No Cleanning and Stemming with n=50 (n=50, clean=False, stem=False)

In [26]:
# Data Processing

train_novels_raw_50 = make_corpus(training_files, train_files_paths, n=50, clean=False, stem=False)
#print(train_novels)
test_novels_raw_50 = make_corpus(test_files, test_files_paths, n=50, clean=False, stem=False)
#print(test_novels)

# Divide data into train data, train target and test data, test target
train_data = train_novels_raw_50['Words']
train_target = train_novels_raw_50['Author']
test_data= test_novels_raw_50['Words']
test_target = test_novels_raw_50['Author']

# Create a pipeline
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc_raw_50 = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc_raw_50 = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc_raw_50 = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc_raw_50 = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc_raw_50 = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc_raw_50 = np.mean(predicted == test_target)

In [38]:
print('No Cleanning and Stemming with n=50 (n=50, clean=False, stem=False):', end='\n * ')
print(f"MultinomialNB: {mnb_acc_raw_50}, BernoulliNB: {bnb_acc_raw_50}")
print()
print('No Cleanning and Stemming (n=50) with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc_raw_50}, BernoulliNB: {bnb_tfidf_acc_raw_50}")
print()
print('No Cleanning and Stemming (n=50) with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc_raw_50}, BernoulliNB: {bnb_fpfalse_acc_raw_50}")  

No Cleanning and Stemming with n=50 (n=50, clean=False, stem=False):
 * MultinomialNB: 0.6518119734808833, BernoulliNB: 0.6326205074522706

No Cleanning and Stemming (n=50) with TF-IDF:
 * MultinomialNB: 0.5049100244255023, BernoulliNB: 0.6326205074522706

No Cleanning and Stemming (n=50) with fitPrior=False:
 * MultinomialNB: 0.6645232042271073, BernoulliNB: 0.6474253526743432


In [None]:
### No Cleanning and Stemming with n=250 (n=50, clean=False, stem=False)

In [34]:
# Data Processing

train_novels_raw_250 = make_corpus(training_files, train_files_paths, n=250, clean=False, stem=False)
#print(train_novels)
test_novels_raw_250 = make_corpus(test_files, test_files_paths, n=250, clean=False, stem=False)
#print(test_novels)

# Divide data into train data, train target and test data, test target
train_data = train_novels_raw_250['Words']
train_target = train_novels_raw_250['Author']
test_data= test_novels_raw_250['Words']
test_target = test_novels_raw_250['Author']

# Building pipelines
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc_raw_250 = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc_raw_250 = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc_raw_250 = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc_raw_250 = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc_raw_250 = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc_raw_250 = np.mean(predicted == test_target)

In [35]:
print('No Cleanning and Stemming with n=250 (n=250, clean=False, stem=False):', end='\n * ')
print(f"MultinomialNB: {mnb_acc_raw_250}, BernoulliNB: {bnb_acc_raw_250}")
print()
print('No Cleanning and Stemming (n=250) with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc_raw_250}, BernoulliNB: {bnb_tfidf_acc_raw_250}")
print()
print('No Cleanning and Stemming (n=250) with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc_raw_250}, BernoulliNB: {bnb_fpfalse_acc_raw_250}")  

No Cleanning and Stemming with n=250 (n=250, clean=False, stem=False):
 * MultinomialNB: 0.7469489414694894, BernoulliNB: 0.6946450809464508

No Cleanning and Stemming (n=250) with TF-IDF:
 * MultinomialNB: 0.5190535491905355, BernoulliNB: 0.6946450809464508

No Cleanning and Stemming (n=250) with fitPrior=False:
 * MultinomialNB: 0.750186799501868, BernoulliNB: 0.701120797011208


In [None]:
### No Cleanning and Stemming with n=1000 (n=1000, clean=False, stem=False)

In [28]:
# Data Processing

train_novels_raw_1000 = make_corpus(training_files, train_files_paths, n=1000, clean=False, stem=False)
#print(train_novels)
test_novels_raw_1000 = make_corpus(test_files, test_files_paths, n=1000, clean=False, stem=False)
#print(test_novels)

# Divide data into train data, train target and test data, test target
train_data = train_novels_raw_1000['Words']
train_target = train_novels_raw_1000['Author']
test_data= test_novels_raw_1000['Words']
test_target = test_novels_raw_1000['Author']

# Create a pipeline
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc_raw_1000 = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc_raw_1000 = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc_raw_1000 = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc_raw_1000 = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc_raw_1000 = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc_raw_1000 = np.mean(predicted == test_target)

In [41]:
print('No Cleanning and Stemming with n=1000 (n=1000, clean=False, stem=False):', end='\n * ')
print(f"MultinomialNB: {mnb_acc_raw_1000}, BernoulliNB: {bnb_acc_raw_1000}")
print()
print('No Cleanning and Stemming (n=1000) with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc_raw_1000}, BernoulliNB: {bnb_tfidf_acc_raw_1000}")
print()
print('No Cleanning and Stemming (n=1000) with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc_raw_1000}, BernoulliNB: {bnb_fpfalse_acc_raw_1000}")  

No Cleanning and Stemming with n=1000 (n=1000, clean=False, stem=False):
 * MultinomialNB: 0.7514910536779325, BernoulliNB: 0.6719681908548708

No Cleanning and Stemming (n=1000) with TF-IDF:
 * MultinomialNB: 0.3946322067594433, BernoulliNB: 0.6719681908548708

No Cleanning and Stemming (n=1000) with fitPrior=False:
 * MultinomialNB: 0.7514910536779325, BernoulliNB: 0.6719681908548708


In [None]:
### No Cleanning and Stemming with n=5000 (n=5000, clean=False, stem=False)

In [36]:
# Data Processing

train_novels_raw_5000 = make_corpus(training_files, train_files_paths, n=5000, clean=False, stem=False)
#print(train_novels)
test_novels_raw_5000 = make_corpus(test_files, test_files_paths, n=5000, clean=False, stem=False)
#print(test_novels)

# Divide data into train data, train target and test data, test target
train_data = train_novels_raw_5000['Words']
train_target = train_novels_raw_5000['Author']
test_data= test_novels_raw_5000['Words']
test_target = test_novels_raw_5000['Author']

# Create a pipeline
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc_raw_5000 = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc_raw_5000 = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc_raw_5000 = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc_raw_5000 = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc_raw_5000 = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc_raw_5000 = np.mean(predicted == test_target)

In [40]:
print('No Cleanning and Stemming with n=5000 (n=5000, clean=False, stem=False):', end='\n * ')
print(f"MultinomialNB: {mnb_acc_raw_5000}, BernoulliNB: {bnb_acc_raw_5000}")
print()
print('No Cleanning and Stemming (n=5000) with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc_raw_5000}, BernoulliNB: {bnb_tfidf_acc_raw_5000}")
print()
print('No Cleanning and Stemming (n=5000) with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc_raw_5000}, BernoulliNB: {bnb_fpfalse_acc_raw_5000}")  

No Cleanning and Stemming with n=5000 (n=5000, clean=False, stem=False):
 * MultinomialNB: 0.7339901477832512, BernoulliNB: 0.6305418719211823

No Cleanning and Stemming (n=5000) with TF-IDF:
 * MultinomialNB: 0.21182266009852216, BernoulliNB: 0.6305418719211823

No Cleanning and Stemming (n=5000) with fitPrior=False:
 * MultinomialNB: 0.7389162561576355, BernoulliNB: 0.6305418719211823


### Stemming but No Cleaning (n=500, clean=False, stem=True)

In [31]:
# Data Processing

train_novels_stem = make_corpus(training_files, train_files_paths, n=500, clean=False, stem=True)
#print(train_novels)
test_novels_stem = make_corpus(test_files, test_files_paths, n=500, clean=True, stem=True)
#print(test_novels)

# Divide data into train data, train target and test data, test target
train_data = train_novels_stem['Words']
train_target = train_novels_stem['Author']
test_data= test_novels_stem['Words']
test_target = test_novels_stem['Author']

# Create a pipeline
# Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#building-a-pipeline
txt_clf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb = txt_clf_mnb.fit(train_data, train_target)
predicted = txt_clf_mnb.predict(test_data)
mnb_acc_stem = np.mean(predicted == test_target)

txt_clf_bnb = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb = txt_clf_bnb.fit(train_data, train_target)
predicted = txt_clf_bnb.predict(test_data)
bnb_acc_stem = np.mean(predicted == test_target)
        
txt_clf_mnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_mnb', MultinomialNB())])

txt_clf_mnb_tfidf = txt_clf_mnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_mnb_tfidf.predict(test_data)
mnb_tfidf_acc_stem = np.mean(predicted == test_target)

txt_clf_bnb_tfidf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf_bnb', BernoulliNB())])

txt_clf_bnb_tfidf = txt_clf_bnb_tfidf.fit(train_data, train_target)
predicted = txt_clf_bnb_tfidf.predict(test_data)
bnb_tfidf_acc_stem = np.mean(predicted == test_target)

txt_clf_mnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_mnb', MultinomialNB(fit_prior=False))])

txt_clf_mnb_fpfalse = txt_clf_mnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_mnb_fpfalse.predict(test_data)
mnb_fpfalse_acc_stem = np.mean(predicted == test_target)

txt_clf_bnb_fpfalse = Pipeline([('vect', CountVectorizer()),
                     ('clf_bnb', BernoulliNB(fit_prior=False))])

txt_clf_bnb_fpfalse = txt_clf_bnb_fpfalse.fit(train_data, train_target)
predicted = txt_clf_bnb_fpfalse.predict(test_data)
bnb_fpfalse_acc_stem = np.mean(predicted == test_target)

In [33]:
print('Stemming but No Cleaning (n=500, clean=False, stem=True):', end='\n * ')
print(f"MultinomialNB: {mnb_acc_stem}, BernoulliNB: {bnb_acc_stem}")
print()
print('Stemming but No Cleaning with TF-IDF:', end='\n * ')
print(f"MultinomialNB: {mnb_tfidf_acc_stem}, BernoulliNB: {bnb_tfidf_acc_stem}")
print()
print('Stemming but No Cleaning with fitPrior=False:', end='\n * ')
print(f"MultinomialNB: {mnb_fpfalse_acc_stem}, BernoulliNB: {bnb_fpfalse_acc_stem}")  

Stemming but No Cleaning (n=500, clean=False, stem=True):
 * MultinomialNB: 0.7508455467869222, BernoulliNB: 0.7395715896279594

Stemming but No Cleaning with TF-IDF:
 * MultinomialNB: 0.6257046223224352, BernoulliNB: 0.7395715896279594

Stemming but No Cleaning with fitPrior=False:
 * MultinomialNB: 0.7508455467869222, BernoulliNB: 0.7395715896279594
