In [18]:
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

In [19]:
# set random state
RSEED =0

In [20]:
# get all the filenames into a list
folder_path = "../data/gutenberg/"
file_list = glob.glob(folder_path + '*.txt')
len(file_list)

3035

In [21]:
# check if files are UTF-8
check_encoding =False
if(check_encoding == True):
    for filename in file_list:
        try:
            f = codecs.open(filename, encoding='utf-8', errors='strict')
            for line in f:
                pass
            #print ("Valid utf-8")
        except UnicodeDecodeError:
            print (filename,"invalid utf-8")

After checking the codecs on all the files only one was invalid utf-8
../data/gutenberg/Henry David Thoreau___A Week on the Concord and Merrimack Rivers.txt
I have deleted this book.

In [5]:
# code to read file_list.txt and make a data frame with title, author
df = pd.DataFrame (file_list)
df['file_extension'] = df[0]
df[0] = df[0].str.replace('.txt','')
df[0] = df[0].str.replace(folder_path,'')
df['title']=df[0].str.rsplit('_',expand=True,n=1)[1]
df['author']=df[0].str.split('_',expand=True,n=1)[0]
df = df.drop(columns = 0)
df.head()

Unnamed: 0,file_extension,title,author
0,../data/gutenberg/John Stuart Mill___Principle...,Principles Of Political Economy,John Stuart Mill
1,../data/gutenberg/Robert Louis Stevenson___The...,The Works of Robert Louis Stevenson - Swanston...,Robert Louis Stevenson
2,../data/gutenberg/R M Ballantyne___Shifting Wi...,Shifting Winds,R M Ballantyne
3,../data/gutenberg/Bret Harte___Her Letter His ...,Her Letter His Answer & Her Last Letter,Bret Harte
4,../data/gutenberg/Robert Louis Stevenson___The...,The Works of Robert Louis Stevenson - Swanston...,Robert Louis Stevenson


In [6]:
# split file list into train and test into train and test
train_files, test_files = train_test_split(df, test_size=0.2, random_state=RSEED)
train_files = train_files.reset_index()
test_files = test_files.reset_index()

# save file_lists
train_files.to_csv('../data/train_file_list.csv')
test_files.to_csv('../data/test_file_list.csv')

corpus_train = list(train_files['file_extension'])
corpus_test = list(test_files['file_extension'])

In [7]:
train_files.head()

Unnamed: 0,index,file_extension,title,author
0,779,../data/gutenberg/Stephen Leacock___Behind the...,Behind the Beyond,Stephen Leacock
1,1357,../data/gutenberg/Jerome Klapka Jerome___Tommy...,Tommy and Co,Jerome Klapka Jerome
2,2868,../data/gutenberg/Stephen Leacock___Winsome Wi...,Winsome Winnie and other New Nonsense Novels,Stephen Leacock
3,175,../data/gutenberg/Hamlin Garland___The Moccasi...,The Moccasin Ranch,Hamlin Garland
4,121,../data/gutenberg/Charles Dickens___Three Ghos...,Three Ghost Stories,Charles Dickens


### 1. HELPER FUNCTIONS

Note on stemmimg and lemmatization - The snowball stemmer and porter stemmer seem to be good and very similar. lemmatization doesnt seem to reduce/break down the word as much. lancaster results in unreadable words very often.
Options to try  - snowball and wordnet

In [8]:
### possible tweek change the stemmer and token pattern
# function to custom stem 
from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# set up key parameters for vectorizer - ngrams - snowball or lemmatizer
n_gram = 1
stem_type = 'lemma' #change this to 'snow' or 'lemma' or 'porter'

snow = SnowballStemmer('english') # change stemmer here
wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer(mode='NLTK_EXTENSIONS')

def my_tokenizer(doc, tkpat=re.compile('\\b[a-z][a-z][a-z]+\\b')):
    if (stem_type == 'snow'):       
        return [snow.stem(token) for token in tkpat.findall(doc)]
    if (stem_type == 'lemma'):
        return [wordnet_lemmatizer.lemmatize(token) for token in tkpat.findall(doc)]
    if (stem_type == 'porter'):       
        return [porter.stem(token) for token in tkpat.findall(doc)]

In [9]:
def basic_report(v_train,v_test):
    print('-----TRAIN VECTOR INFO ----')
    print(v_train.shape)
    print(type(v_train))
    print('-----TEST VECTOR INFO ----')
    print(v_test.shape)
    print(type(v_test))

In [10]:
def save_to_file(filename,transformed,vectorizer):
    make_df = pd.DataFrame(transformed.toarray(), columns=vectorizer.get_feature_names())
    
    # are we working with train or test vector
    len_vector = make_df.shape[0]
    len_train = train_files.shape[0]
    len_test =test_files.shape[0]
    
    # training vector was passed
    if(len_vector == len_train):
        #print('training')
        make_df.insert(loc=0, column='book_location', value=train_files['file_extension'])
        make_df.insert(loc=0, column='author_name', value=train_files['author'])
        make_df.insert(loc=0, column='book_title', value=train_files['title'])
    
    # testing vector was passed
    if(len_vector == len_test):
        #print('testing')
        make_df.insert(loc=0, column='book_location', value=test_files['file_extension'])
        make_df.insert(loc=0, column='author_name', value=test_files['author'])
        make_df.insert(loc=0, column='book_title', value=test_files['title'])
        
    #print(make_df.head())
    make_df.to_csv('../data/vectors/'+ filename + '.csv',compression='gzip')
    return make_df
    

### 2. BAG OF WORDS -  TERM FREQUENCY
CountVectorizer

In [16]:
# basic Count Vectorizer
tf_vectorizer = CountVectorizer(max_df=0.90, min_df=0.05,ngram_range=(1,n_gram),\
                                analyzer ='word', input = 'filename',\
                                stop_words='english', tokenizer = my_tokenizer)

tf_vectorizer.fit(corpus_train)
tf_vector_train = tf_vectorizer.transform(corpus_train)
tf_vector_test = tf_vectorizer.transform(corpus_test)

In [17]:
import dill 
dill.dump(tf_vectorizer, open('../data/vectors/tf_vectorizer_'+ stem_type + '_' + str(n_gram) +'gm', 'wb'))

In [22]:
basic_report(tf_vector_train,tf_vector_test)

-----TRAIN VECTOR INFO ----
(2428, 18425)
<class 'scipy.sparse.csr.csr_matrix'>
-----TEST VECTOR INFO ----
(607, 18425)
<class 'scipy.sparse.csr.csr_matrix'>


In [23]:
# save train output to file and place in panda
tf_train = save_to_file('cv_'+str(n_gram)+'gm_'+stem_type+'_train',tf_vector_train,tf_vectorizer)
tf_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0,0,2,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,3,0,0


In [24]:
# save test output to file and place in panda
tf_test = save_to_file ('cv_'+str(n_gram)+'gm_'+stem_type+'_test',tf_vector_test,tf_vectorizer)
tf_test.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Great Catherine,George Bernard Shaw,../data/gutenberg/George Bernard Shaw___Great ...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Short Cruises,William Wymark Jacobs,../data/gutenberg/William Wymark Jacobs___Shor...,0,3,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,Richard Dare's Venture,Edward Stratemeyer,../data/gutenberg/Edward Stratemeyer___Richard...,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley,../data/gutenberg/Thomas Henry Huxley___Willia...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Beyond,John Galsworthy,../data/gutenberg/John Galsworthy___Beyond.txt,0,0,1,1,1,3,1,...,0,0,0,1,0,0,0,0,0,0


### 3. BAG OF WORDS - INVERSE TERM FREQUENCY
TfidCountVectorizer

In [10]:
# Inverse Count Vectorizer

in_vectorizer = TfidfVectorizer(max_df=0.85, min_df=0.05,ngram_range=(1,1),\
                                analyzer ='word', input = 'filename',\
                                stop_words='english', tokenizer = my_tokenizer)

in_vectorizer.fit(corpus_train)
in_vector_train = in_vectorizer.transform(corpus_train)
in_vector_test = in_vectorizer.transform(corpus_test)

In [11]:
import dill 
dill.dump(in_vectorizer, open('../data/vectors/tfid_vectorizer_'+ stem_type + '_' + str(n_gram) +'gm','wb'))

In [12]:
basic_report(in_vector_train,in_vector_test)

-----TRAIN VECTOR INFO ----
(2428, 18274)
<class 'scipy.sparse.csr.csr_matrix'>
-----TEST VECTOR INFO ----
(607, 18274)
<class 'scipy.sparse.csr.csr_matrix'>


In [13]:
# save train output to file and place in panda
in_train = save_to_file ('tfid_'+str(n_gram)+'gm_'+stem_type+'_train',in_vector_train,in_vectorizer)
in_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0.0,0.0,0.004126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005299
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0.0,0.0,0.006395,0.002774,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00448,0.0,0.0,0.0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0.0,0.0,0.0,0.0,0.0,0.0,0.005872,...,0.0,0.0,0.0,0.0,0.0,0.00554,0.0,0.0,0.0,0.0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0.0,0.0,0.0,0.013355,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03942,0.0,0.0


In [14]:
# save test output to file and place in panda
in_test = save_to_file ('tfid_'+str(n_gram)+'gm_'+stem_type+'_test',in_vector_test,in_vectorizer)
in_test.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Great Catherine,George Bernard Shaw,../data/gutenberg/George Bernard Shaw___Great ...,0.0,0.005151,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Short Cruises,William Wymark Jacobs,../data/gutenberg/William Wymark Jacobs___Shor...,0.0,0.004831,0.001115,0.0,0.0,0.0,0.0,...,0.001985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Richard Dare's Venture,Edward Stratemeyer,../data/gutenberg/Edward Stratemeyer___Richard...,0.0,0.003514,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley,../data/gutenberg/Thomas Henry Huxley___Willia...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Beyond,John Galsworthy,../data/gutenberg/John Galsworthy___Beyond.txt,0.0,0.0,0.001765,0.001531,0.002514,0.007873,0.003384,...,0.0,0.0,0.0,0.002998,0.0,0.0,0.0,0.0,0.0,0.0


### 4. WORD2VEC
comes in here

In [None]:
### 5. 