In [1]:
from pathlib import Path
import glob
import string
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import numpy as np
import matplotlib.pyplot as plt

nltk.download('stopwords')
nltk.download('punkt')

stops = stopwords.words('english') + list(string.punctuation) + list(string.whitespace) + list(string.digits)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kivanc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/kivanc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
working_dir = Path("/Users/kivanc/DataMining-ML/Data/text")

train_files_path = os.path.join(working_dir, "training_files", '*.txt')
train_files_names = glob.glob(train_files_path)

for f in train_files_names:
    lines = open(f, 'r').readlines()

    strings = ("***", "<<<<<<<<")
    for i, line in enumerate(lines):
       if any(s in line for s in strings):
            break

    if i < len(lines) - 1:
        with open(f + '_edited', 'w') as f:
            f.write('\n'.join(lines[i + 1:]))


train_files_path = os.path.join(working_dir, "training_files", '*.txt_edited')
train_files_names = glob.glob(train_files_path)

In [53]:
def get_input_list(file_names):
    fnames=[]
    for f in file_names:
        fnames.append(f.rsplit('/', 1)[1])

    author=[]
    for f in file_names:
        temp= f.rsplit('/', 1)[1]
        temp_2=temp.rsplit('-', 1)[0]
        author.append(temp_2.replace('-', ' '))
    
    out_list = [list(x) for x in zip(fnames, author)]
    return out_list


def read_tokenize_clean(file, clean=True, tokenize=True):
    with open(file) as reader:
        text = reader.read().lower()
        if tokenize:
            tokens= word_tokenize(text)
            if clean:
                clean_tokens=[]
                for w in tokens: 
                    if w not in stops:
                        clean_tokens.append(w)
                return clean_tokens
            return tokens
        return text


def reshape_tokens(file, n=500):
    # read, tokenize and clean the file
    tokens= read_tokenize_clean(file, clean=True, tokenize=True)

    # split tokens into n chunks
    tokens_array = np.array_split(np.array(tokens),n)
    
    out_df = np.empty(len(tokens)//n, dtype=object) 
    for i in range(len(tokens)//n):
        out_df[i] = ' '.join(map(str, tokens_array[i])) 
    return out_df


def make_corpus(input_list, n=500):
    out_df=[]
    for [fnames,author] in input_list:
        input_file = os.path.join(working_dir, fnames)
        words = reshape_tokens(input_file, n)
        df = pd.DataFrame(words, columns=['words'])
        df['author'] = author
        out_df.append(df)
    corpus_data = pd.concat(out_df)
    return corpus_data

training_dir = Path("/Users/kivanc/DataMining-ML/Data/text/training_files")
train_files_names = glob.glob(training_dir)

test_dir = Path("/Users/kivanc/DataMining-ML/Data/text/test_files")
test_files_names = glob.glob(test_dir)

training_files = getInputList(train_files_names)
test_files = etInputList(test_files_names)

training_df = make_corpus(training_files, n=500)
print(training_df)

[['\ufeffthe' 'project' 'gutenberg' ... 'mixture' 'notes' 'bird']
 ['burst' 'laughing' 'rose' ... 'happy' 'fond' 'one']
 ['another' 'gone' 'left' ... 'half-open' 'folding-doors' 'behold']
 ...
 ['creating' 'works' 'public' ... 'gutenberg' '”' 'associated']
 ['appearing' 'work' 'must' ... 'agreement' 'liable' 'actual']
 ['direct' 'indirect' 'consequential' ... 'status' 'compliance'
  'particular']]
                                                words           author
0   copyright c 2002 david wyllie metamorphosis fr...      Kafka Franz
1   back lifted head little could see brown belly ...      Kafka Franz
2   thin compared size rest waved helplessly looke...      Kafka Franz
3   walls collection textile samples lay spread ta...      Kafka Franz
4   fur boa sat upright raising heavy fur muff cov...      Kafka Franz
..                                                ...              ...
83  millstone useless necklace afflictive bear yet...  Melville Herman
84  period peeped behind screen

In [64]:
def naive_bayes(n = 500):
    train_df = process_files(train_files, n = n)
    test_df = process_files(test_files, n = n)
    
    X_train= train_df['text']
    targets_train = train_df['author']
    X_test= test_df['text']
    targets_test = test_df['author']
    
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    X_new_counts = count_vect.transform(X_test)

    clfMNB = MultinomialNB().fit(X_train_counts, targets_train)
    predicted = clfMNB.predict(X_new_counts)    
    mnb = np.mean(predicted == targets_test)

    clfBNB = BernoulliNB().fit(X_train_counts, targets_train)
    predicted = clfBNB.predict(X_new_counts)    
    bnb = np.mean(predicted == targets_test)
    
    print(f"N={n} MultinomialNB: {mnb}; BernoulliNB: {bnb}")
    print(predicted)

naive_bayes()

N=500 MultinomialNB: 0.5572519083969466; BernoulliNB: 0.5343511450381679
['Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Nathaniel Hawthorne' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Nathaniel Hawthorne' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herman Melville' 'Herman Melville' 'Herman Melville'
 'Herman Melville' 'Herm