In [14]:
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

In [15]:
# set random state
RSEED =0

In [16]:
# get all the filenames into a list
folder_path = "../data/sample_500/"
file_list = glob.glob(folder_path + '*.txt')
len(file_list)

66

In [17]:
# check if files are UTF-8
for filename in file_list:
    try:
        f = codecs.open(filename, encoding='utf-8', errors='strict')
        for line in f:
            pass
        #print ("Valid utf-8")
    except UnicodeDecodeError:
        print (filename,"invalid utf-8")

After checking the codecs on all the files only one was invalid utf-8
../data/gutenberg/Henry David Thoreau___A Week on the Concord and Merrimack Rivers.txt
I have deleted this book.

In [18]:
# code to read file_list.txt and make a data frame with title, author
df = pd.DataFrame (file_list)
df['file_extension'] = df[0]
df[0] = df[0].str.replace('.txt','')
df[0] = df[0].str.replace(folder_path,'')
df['title']=df[0].str.rsplit('_',expand=True,n=1)[1]
df['author']=df[0].str.split('_',expand=True,n=1)[0]
df = df.drop(columns = 0)
df.head()

Unnamed: 0,file_extension,title,author
0,../data/sample_500/Aldous Huxley___Crome Yello...,Crome Yellow,Aldous Huxley
1,../data/sample_500/Andrew Lang___Angling Sketc...,Angling Sketches,Andrew Lang
2,../data/sample_500/Alfred Russel Wallace___Dar...,Darwinism,Alfred Russel Wallace
3,../data/sample_500/Ambrose Bierce___A Son of t...,"A Son of the Gods, and A Horseman in the Sky",Ambrose Bierce
4,../data/sample_500/Baronness Orczy___The Laugh...,The Laughing Cavalier,Baronness Orczy


In [19]:
# split file list into train and test into train and test
train_files, test_files = train_test_split(df, test_size=0.2, random_state=RSEED)
train_files = train_files.reset_index()
test_files = test_files.reset_index()

# save file_lists
train_files.to_csv('../data/train_file_list.csv')
train_files.to_csv('../data/test_file_list.csv')

corpus_train = list(train_files['file_extension'])
corpus_test = list(test_files['file_extension'])

### 1. HELPER FUNCTIONS

Note on stemmimg and lemmatization - The snowball stemmer and porter stemmer seem to be good and very similar. lemmatization doesnt seem to reduce/break down the word as much. lancaster results in unreadable words very often.
Options to try  - snowball and wordnet

In [20]:
### possible tweek change the stemmer and token pattern
# function to custom stem 
from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# set up key parameters for vectorizer - ngrams - snowball or lemmatizer
n_gram = 1
stem_type = 'lemma' #change this to 'snow' or 'lemma' or 'porter'

snow = SnowballStemmer('english') # change stemmer here
wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer(mode='NLTK_EXTENSIONS')

def my_tokenizer(doc, tkpat=re.compile('\\b[a-z][a-z][a-z]+\\b')):
    if (stem_type == 'snow'):       
        return [snow.stem(token) for token in tkpat.findall(doc)]
    if (stem_type == 'lemma'):
        return [wordnet_lemmatizer.lemmatize(token) for token in tkpat.findall(doc)]
    if (stem_type == 'porter'):       
        return [porter.stem(token) for token in tkpat.findall(doc)]

In [46]:
def basic_report(v_train,v_test):
    print('-----TRAIN VECTOR INFO ----')
    print(v_train.shape)
    print(type(v_train))
    print('-----TEST VECTOR INFO ----')
    print(v_test.shape)
    print(type(v_test))

In [47]:
def save_to_file(filename,transformed,vectorizer):
    make_df = pd.DataFrame(transformed.toarray(), columns=vectorizer.get_feature_names())
    
    # are we working with train or test vector
    len_vector = make_df.shape[0]
    len_train = train_files.shape[0]
    len_test =test_files.shape[0]
    
    # training vector was passed
    if(len_vector == len_train):
        #print('training')
        make_df.insert(loc=0, column='book_location', value=train_files['file_extension'])
        make_df.insert(loc=0, column='author_name', value=train_files['author'])
        make_df.insert(loc=0, column='book_title', value=train_files['title'])
    
    # testing vector was passed
    if(len_vector == len_test):
        #print('testing')
        make_df.insert(loc=0, column='book_location', value=test_files['file_extension'])
        make_df.insert(loc=0, column='author_name', value=test_files['author'])
        make_df.insert(loc=0, column='book_title', value=test_files['title'])
        
    #print(make_df.head())
    make_df.to_csv('../data/vectors/'+ filename + '.csv',compression='gzip')
    return make_df
    

### 2. BAG OF WORDS -  TERM FREQUENCY
CountVectorizer

In [48]:
# basic Count Vectorizer
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=0.05,ngram_range=(1,n_gram),\
                                analyzer ='word', input = 'filename',\
                                stop_words='english', tokenizer = my_tokenizer)

tf_vectorizer.fit(corpus_train)
tf_vector_train = tf_vectorizer.transform(corpus_train)
tf_vector_test = tf_vectorizer.transform(corpus_test)

In [50]:
import pickle
pickle.dump(tf_vectorizer, open('../data/vectors/tf_vectorizer_'+ stem_type + '_' + str(n_gram) +'gm.pkl','wb'))

In [51]:
basic_report(tf_vector_train,tf_vector_test)

-----TRAIN VECTOR INFO ----
(52, 20252)
<class 'scipy.sparse.csr.csr_matrix'>
-----TEST VECTOR INFO ----
(14, 20252)
<class 'scipy.sparse.csr.csr_matrix'>


In [52]:
# save train output to file and place in panda
tf_train = save_to_file('cv_'+str(n_gram)+'gm_'+stem_type+'_train',tf_vector_train,tf_vectorizer)
tf_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abased,...,zigzag,zone,zool,zoological,zoologist,zoology,zoroaster,zounds,zulu,zur
0,Adventures among Books,Andrew Lang,../data/sample_500/Andrew Lang___Adventures am...,0,0,0,6,0,0,0,...,1,0,0,0,0,0,0,0,2,0
1,Darwinism,Alfred Russel Wallace,../data/sample_500/Alfred Russel Wallace___Dar...,0,0,1,3,0,0,0,...,0,17,12,1,1,1,0,0,1,0
2,The Devil's Dictionary,Ambrose Bierce,../data/sample_500/Ambrose Bierce___The Devil'...,1,0,0,2,1,0,0,...,1,0,0,0,1,3,1,0,0,0
3,A Monk of Fife,Andrew Lang,../data/sample_500/Andrew Lang___A Monk of Fif...,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,"The Collected Works of Ambrose Bierce, Volume 1",Ambrose Bierce,../data/sample_500/Ambrose Bierce___The Collec...,0,0,1,8,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# save test output to file and place in panda
tf_test = save_to_file ('cv_'+str(n_gram)+'gm_'+stem_type+'_test',tf_vector_test,tf_vectorizer)
tf_test.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abased,...,zigzag,zone,zool,zoological,zoologist,zoology,zoroaster,zounds,zulu,zur
0,Shapes of Clay,Ambrose Bierce,../data/sample_500/Ambrose Bierce___Shapes of ...,1,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
1,"The Malay Archipelago, Volume 2",Alfred Russel Wallace,../data/sample_500/Alfred Russel Wallace___The...,0,2,0,1,0,0,0,...,0,0,0,6,1,3,0,0,0,0
2,The Mysterious Affair at Styles,Agatha Christie,../data/sample_500/Agatha Christie___The Myste...,0,2,2,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Tailor of Gloucester,Beatrix Potter,../data/sample_500/Beatrix Potter___The Tailor...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Collected Works of Ambrose Bierce, Volume 8",Ambrose Bierce,../data/sample_500/Ambrose Bierce___The Collec...,1,0,1,4,1,1,0,...,0,0,0,0,0,0,1,0,0,0


### 3. BAG OF WORDS - INVERSE TERM FREQUENCY
TfidCountVectorizer

In [60]:
# Inverse Count Vectorizer

in_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.05,ngram_range=(1,1),\
                                analyzer ='word', input = 'filename',\
                                stop_words='english', tokenizer = my_tokenizer)

in_vectorizer.fit(corpus_train)
in_vector_train = in_vectorizer.transform(corpus_train)
in_vector_test = in_vectorizer.transform(corpus_test)

In [55]:
import pickle
pickle.dump(in_vectorizer, open('../data/vectors/tfid_vectorizer_'+ stem_type + '_' + str(n_gram) +'gm.pkl','wb'))

In [56]:
basic_report(in_vector_train,in_vector_test)

-----TRAIN VECTOR INFO ----
(52, 20252)
<class 'scipy.sparse.csr.csr_matrix'>
-----TEST VECTOR INFO ----
(14, 20252)
<class 'scipy.sparse.csr.csr_matrix'>


In [57]:
# save train output to file and place in panda
in_train = save_to_file ('tfid_'+str(n_gram)+'gm_'+stem_type+'_train',in_vector_train,in_vectorizer)
in_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abased,...,zigzag,zone,zool,zoological,zoologist,zoology,zoroaster,zounds,zulu,zur
0,Adventures among Books,Andrew Lang,../data/sample_500/Andrew Lang___Adventures am...,0.0,0.0,0.0,0.008873,0.0,0.0,0.0,...,0.00261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005739,0.0
1,Darwinism,Alfred Russel Wallace,../data/sample_500/Alfred Russel Wallace___Dar...,0.0,0.0,0.000385,0.001055,0.0,0.0,0.0,...,0.0,0.009385,0.008656,0.000649,0.000721,0.000769,0.0,0.0,0.000682,0.0
2,The Devil's Dictionary,Ambrose Bierce,../data/sample_500/Ambrose Bierce___The Devil'...,0.003427,0.0,0.0,0.004049,0.003297,0.0,0.0,...,0.003573,0.0,0.0,0.0,0.004153,0.013287,0.004429,0.0,0.0,0.0
3,A Monk of Fife,Andrew Lang,../data/sample_500/Andrew Lang___A Monk of Fif...,0.0,0.0,0.0,0.000839,0.0,0.0,0.001837,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"The Collected Works of Ambrose Bierce, Volume 1",Ambrose Bierce,../data/sample_500/Ambrose Bierce___The Collec...,0.0,0.0,0.001812,0.013248,0.002697,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
# save test output to file and place in panda
in_test = save_to_file ('tfid_'+str(n_gram)+'gm_'+stem_type+'_test',in_vector_test,in_vectorizer)
in_test.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abased,...,zigzag,zone,zool,zoological,zoologist,zoology,zoroaster,zounds,zulu,zur
0,Shapes of Clay,Ambrose Bierce,../data/sample_500/Ambrose Bierce___Shapes of ...,0.004006,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.007433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"The Malay Archipelago, Volume 2",Alfred Russel Wallace,../data/sample_500/Alfred Russel Wallace___The...,0.0,0.002995,0.0,0.000772,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008548,0.001583,0.005065,0.0,0.0,0.0,0.0
2,The Mysterious Affair at Styles,Agatha Christie,../data/sample_500/Agatha Christie___The Myste...,0.0,0.005366,0.003026,0.005531,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Tailor of Gloucester,Beatrix Potter,../data/sample_500/Beatrix Potter___The Tailor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"The Collected Works of Ambrose Bierce, Volume 8",Ambrose Bierce,../data/sample_500/Ambrose Bierce___The Collec...,0.003143,0.0,0.002032,0.007427,0.003024,0.00381,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004063,0.0,0.0,0.0


### 4. WORD2VEC
comes in here

In [None]:
### 5. 