In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re

import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg, stopwords
from collections import Counter

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\bretw\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [8]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [15]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
bible = gutenberg.raw('bible-kjv.txt')
    
bible = text_cleaner(bible)

In [119]:
# Pull the titles of all the books off of wikipedia
bible_wikipedia_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_books_of_the_King_James_Version', header=0)

bible_titles_df = bible_wikipedia_df[0].append(bible_wikipedia_df[2])

print(len(bible_titles_df))
bible_titles_df.reset_index(inplace=True)
bible_titles_df.head()

66


Unnamed: 0,index,King James Bible,Vulgate,Douay Rheims,Full title in the Authorised Version
0,0,Genesis,Genesis,Genesis,"The First Book of Moses, called Genesis"
1,1,Exodus,Exodus,Exodus,"The Second Book of Moses, called Exodus"
2,2,Leviticus,Leviticus,Leviticus,"The Third Book of Moses, called Leviticus"
3,3,Numbers,Numeri,Numbers,"The Fourth Book of Moses, called Numbers"
4,4,Deuteronomy,Deuteronomium,Deuteronomy,"The Fifth Book of Moses, called Deuteronomy"


In [143]:
# Split each book at the beginning of the text
bible_texts = bible.split(' 1:1 ')

# Get rid of the title
bible_texts = bible_texts[1:]

# This will cut out the titles of each book from the end of the text. Mostly accurate.
for i, title in enumerate(bible_titles_df.iloc[1:, -1]):
    to_cut = len(title)
    bible_texts[i] = bible_texts[i][:-to_cut]

In [146]:
# Now we need to cut out all of the passage numbers.
for i, book in enumerate(bible_texts):
    bible_texts[i] = re.sub(r'\d+\:\d+', '', book)


In [148]:
len(bible_texts)

66

In [150]:
nlp = spacy.load('en')

bible_doc = []
for book in bible_texts:
    bible_doc.append(nlp(book))


In [209]:
bible_sents = []
for i, title in enumerate(bible_titles_df.iloc[:, -1]):
    bible_sents.extend([[sent, title] for sent in bible_doc[i].sents])

# Create our dataframe
sentences_all = pd.DataFrame(bible_sents)
sentences_all.head()

Unnamed: 0,0,1
0,"(In, the, beginning, God, created, the, heaven...","The First Book of Moses, called Genesis"
1,"(And, the, earth, was, without, form, ,, and, ...","The First Book of Moses, called Genesis"
2,"(And, the, Spirit, of, God, moved, upon, the, ...","The First Book of Moses, called Genesis"
3,"(And, God, said, ,, Let, there, be, light, :, ...","The First Book of Moses, called Genesis"
4,"(And, God, saw, the, light, ,, that, it, was, ...","The First Book of Moses, called Genesis"


In [305]:
sentences_all.iloc[:, 1].value_counts()

The Book of Psalms                                                            2730
The First Book of Moses, called Genesis                                       1780
The Book of the Prophet Jeremiah                                              1559
The Book of the Prophet Isaiah                                                1535
The Book of the Prophet Ezekiel                                               1356
The Gospel According to St. Luke                                              1315
The Second Book of Moses, called Exodus                                       1301
The Gospel According to St. Matthew                                           1216
The Fourth Book of Moses, called Numbers                                      1211
The Book of Job                                                               1133
The First Book of Samuel, otherwise called the First Book of the Kings        1112
The Gospel According to St. John                                              1100
The 

In [247]:
sentences = pd.DataFrame()

# This loop reduces the size of our dataset by randomly picking 100 sentences from each book
# of the bible. If there are less than 100 books, then all of the books are kept in the dataset
for title in bible_titles_df.iloc[:, -1]:
    subset_by_title = sentences_all[sentences_all.iloc[:, 1] == title]
    subset_idx = subset_by_title.index
    if len(subset_idx) <= 100:
        sentences = sentences.append(subset_by_title)
    else:
        random_indices = np.random.choice(subset_idx, 100, replace=False).tolist()
        sentences = sentences.append(subset_by_title.loc[random_indices, :])
        
sentences.reset_index(inplace=True, drop=True)
print(len(sentences))
sentences.head()

5703


Unnamed: 0,0,1
0,"(Terah, an, hundred, and, nineteen, years, ,, ...","The First Book of Moses, called Genesis"
1,"(therefore, ,, behold, ,, also, his, blood, is...","The First Book of Moses, called Genesis"
2,"(Now, therefore, take, ,, I, pray, thee, ,, th...","The First Book of Moses, called Genesis"
3,"(And, it, came, to, pass, on, the, third, day,...","The First Book of Moses, called Genesis"
4,"(And, the, LORD, said, unto, Noah, ,, Come, th...","The First Book of Moses, called Genesis"


In [250]:
from IPython.display import clear_output

def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(1000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    n = len(df)
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            clear_output()
            print("Processing row {}/{}".format(i, n))
            
    return df

# Set up bag the bags.
bible_words = []
for book in bible_doc:
    bible_words.extend(bag_of_words(book))
    
common_words = set(bible_words)

There are a ton of sentences in this dataset, about as many as the Alice and Persuasion datasets. However, we have 7000 features instead of however many those ones had. So, I'm going to take a subset of our data because this is taking far too long to run. I'm not a very patient person unfortunately.

In [251]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 5700/5703


Unnamed: 0,oblation,oil,proper,dasheth,sink,gaba,zelophehad,slanderer,saving,copper,...,ulai,drop,uncorruptness,lantern,uz,partly,bridechamber,assuredly,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Terah, an, hundred, and, nineteen, years, ,, ...","The First Book of Moses, called Genesis"
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(therefore, ,, behold, ,, also, his, blood, is...","The First Book of Moses, called Genesis"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Now, therefore, take, ,, I, pray, thee, ,, th...","The First Book of Moses, called Genesis"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, it, came, to, pass, on, the, third, day,...","The First Book of Moses, called Genesis"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, the, LORD, said, unto, Noah, ,, Come, th...","The First Book of Moses, called Genesis"


In [263]:
type(word_counts.iloc[:, -2][0])

spacy.tokens.span.Span

In [260]:
import pickle
pickle.dump(word_counts.iloc[:, :-2], open('word_counts.pkl', 'wb'))
pickle.dump(word_counts.iloc[:, -2], open('word_counts_sentences.pkl', 'wb'))
pickle.dump(word_counts.ilov[:, -1], open('word_counts_source.pkl', 'wb'))

TypeError: no default __reduce__ due to non-trivial __cinit__

In [165]:
len(bible_sents)

34451

In [276]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9856767027185034

Test set score: 0.1691498685363716


In [273]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

mnb = MultinomialNB().fit(X_train, y_train)

# score = f1_score(y_test, bnb.predict(X_test), pos_label='Carroll')
print(mnb.score(X_test, y_test))


0.26292725679228746


# TFIDF and LSA

In [365]:
# This time, download as paragraphs and append lsa to all variables
bible_lsa = gutenberg.paras('bible-kjv.txt')
bible_paras = []
for paragraph in bible_lsa:
    para = paragraph[0]
    bible_paras.append(' '.join(para))

bible_paras = bible_paras[2:]
bible_paras[22]

'1 : 24 And God said , Let the earth bring forth the living creature after his kind , cattle , and creeping thing , and beast of the earth after his kind : and it was so .'

I don't want to get rid of all of the preprocessing steps I did earlier so I'm going to try a different way than the curriculum.

In [368]:
bible_paras = []
for line in sentences_all.iloc[:, 0]:
    word = []
    sentence = []
    for char in line:
        
        if not char.is_punct:
            word.append(str(char))
        else:
            sentence.extend(word)
            sentence.append(str(char))
            word = []
    sentence = ' '.join((sentence))
    bible_paras.append(sentence)
bible_paras[22]

'And God said , Let the waters bring forth abundantly the moving creature that hath life , and fowl that may fly above the earth in the open firmament of heaven .'

In [369]:
len(bible_paras)

34451

In [382]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(bible_paras, sentences_all.iloc[:, 1], test_size=0.8, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
bible_paras_tfidf = vectorizer.fit_transform(bible_paras)
print("Number of features: %d" % bible_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(bible_paras_tfidf, sentences_all.iloc[:, 1], test_size=0.8, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Number of features: 8242
Original sentence: The fear of the LORD is the instruction of wisdom ; and before honour is humility .
Tf_idf vector: {'humility': 0.5723592077667204, 'instruction': 0.4857792434425227, 'wisdom': 0.3698689200583855, 'honour': 0.4001605824061108, 'fear': 0.33679858562859644, 'lord': 0.16142697081617147}


In [383]:
(X_train_lsa.shape)

(6890, 230)

In [384]:
len(X_train)

6890

In [385]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(230)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)
variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 39.35157927550453
Component 0:
And they shall hearken to thy voice : and thou shalt come , thou and the elders of Israel , unto the king of Egypt , and ye shall say unto him , The LORD God of the Hebrews hath met with us : and now let us go , we beseech thee , three days ' journey into the wilderness , that we may sacrifice to the LORD our God .                                                                                                                      0.652141
But thou , O LORD , shall endure for ever ; and thy remembrance unto all generations .                                                                                                                                                                                                                                                                                                                                                            0.648153
And it shall be when the LORD shall br

# Random Forest

In [387]:
rfc_lsa = ensemble.RandomForestClassifier().fit(X_train_lsa, y_train_tfidf)
print('Training score: ', rfc_lsa.score(X_train_lsa, y_train_tfidf))
print('Testing score: ', rfc_lsa.score(X_test_lsa, y_test_tfidf))

Training score:  0.9692307692307692
Testing score:  0.15844853234643155


In [388]:
rfc_tfidf = ensemble.RandomForestClassifier().fit(X_train_tfidf, y_train_tfidf)
print('Training score: ', rfc_tfidf.score(X_train_tfidf, y_train_tfidf))
print('Testing score: ', rfc_tfidf.score(X_test_tfidf, y_test_tfidf))

Training score:  0.9693759071117561
Testing score:  0.263197997169914


# Linear SVM

In [390]:
from sklearn.svm import LinearSVC
lsvm_lsa = LinearSVC().fit(X_train_lsa, y_train_tfidf)
print('Training score: ', lsvm_lsa.score(X_train_lsa, y_train_tfidf))
print('Testing score: ', lsvm_lsa.score(X_test_lsa, y_test_tfidf))

Training score:  0.5031930333817126
Testing score:  0.30742716156888356


In [392]:
lsvm_tfidf = ensemble.RandomForestClassifier().fit(X_train_tfidf, y_train_tfidf)
print('Training score: ', lsvm_tfidf.score(X_train_tfidf, y_train_tfidf))
print('Testing score: ', lsvm_tfidf.score(X_test_tfidf, y_test_tfidf))

Training score:  0.9686502177068215
Testing score:  0.2536192445847393


# Nearest Neighbors

In [393]:
from sklearn.neighbors import KNeighborsClassifier
knn_lsa = KNeighborsClassifier().fit(X_train_lsa, y_train_tfidf)
print('Training score: ', knn_lsa.score(X_train_lsa, y_train_tfidf))
print('Testing score: ', knn_lsa.score(X_test_lsa, y_test_tfidf))

Training score:  0.43149492017416546
Testing score:  0.18794673633032183


In [395]:
from sklearn.neighbors import KNeighborsClassifier
knn_tfidf = KNeighborsClassifier().fit(X_train_tfidf, y_train_tfidf)
print('Training score: ', knn_tfidf.score(X_train_tfidf, y_train_tfidf))
print('Testing score: ', knn_tfidf.score(X_test_tfidf, y_test_tfidf))

Training score:  0.3478955007256894
Testing score:  0.08185479481876565


# Support Vector Classifier

In [399]:
from sklearn.svm import SVC
svc_lsa = SVC().fit(X_train_lsa, y_train_tfidf)
print('Training score: ', svc_lsa.score(X_train_lsa, y_train_tfidf))
print('Testing score: ', svc_lsa.score(X_test_lsa, y_test_tfidf))

Training score:  0.08171262699564587
Testing score:  0.07862559413664236


In [401]:
svc_tfidf = SVC().fit(X_train_lsa, y_train_tfidf)
print('Training score: ', svc_tfidf.score(X_train_lsa, y_train_tfidf))
print('Testing score: ', svc_tfidf.score(X_test_lsa, y_test_tfidf))

Training score:  0.08171262699564587
Testing score:  0.07862559413664236


# Logistic Regression

In [396]:
from sklearn.linear_model import LogisticRegression
lr_lsa = LogisticRegression().fit(X_train_lsa, y_train_tfidf)
print('Training score: ', lr_lsa.score(X_train_lsa, y_train_tfidf))
print('Testing score: ', lr_lsa.score(X_test_lsa, y_test_tfidf))

Training score:  0.4011611030478955
Testing score:  0.29770327636878197


In [397]:
lr_tfidf = LogisticRegression().fit(X_train_tfidf, y_train_tfidf)
print('Training score: ', lr_tfidf.score(X_train_tfidf, y_train_tfidf))
print('Testing score: ', lr_tfidf.score(X_test_tfidf, y_test_tfidf))

Training score:  0.5904208998548621
Testing score:  0.32999528319001487


# Model Selection
The best model was Logistic Regression, with an initial score of 0.32 on the test set. The last part of this assignment is to increase this score by 5%. I will be using RandomizedSearchCV to do so.

In [404]:
# This function reports the top n_top results for a random search
def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [412]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'C':np.random.uniform(3.0, 5.0, 1000),
             'penalty':['l1', 'l2']}
n_iter = 50

log_reg_rand = LogisticRegression(class_weight='balanced')
lr_search = RandomizedSearchCV(log_reg_rand,
                               param_distributions=param_dist,
                               n_iter=n_iter, n_jobs=-1)

lr_search.fit(X_train_tfidf, y_train_tfidf)
report(lr_search.cv_results_)



Model with rank: 1
Mean validation score: 0.343 (std: 0.003)
Parameters: {'penalty': 'l2', 'C': 4.871363584092783}

Model with rank: 2
Mean validation score: 0.343 (std: 0.003)
Parameters: {'penalty': 'l2', 'C': 4.833487238433598}

Model with rank: 2
Mean validation score: 0.343 (std: 0.003)
Parameters: {'penalty': 'l2', 'C': 4.814000886572774}

Model with rank: 4
Mean validation score: 0.342 (std: 0.003)
Parameters: {'penalty': 'l2', 'C': 4.801430041487284}

Model with rank: 5
Mean validation score: 0.342 (std: 0.003)
Parameters: {'penalty': 'l2', 'C': 4.714816359662238}



I tried 

:|