In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

%matplotlib inline

## Supervised NLP
- Requires pre-labelled dataset for training/testing
- Mostly used for categorizing text
- Use any supervised model that allows for categorical outcomes
- **Bag of Words:** feature generation approach
    - Count how many times each word appears for each sentence
    - Use counts as features

In [7]:
#text cleaning function
def text_cleaner(text):
    """Normalize raw text before it is handed to spaCy.

    Replaces every '--' with a single space, deletes bracketed spans
    ('[...]', non-greedy; '.' does not match newlines, so a span must sit on
    one line to be removed), and collapses every run of whitespace into a
    single space.

    Args:
        text: the raw text as a str.

    Returns:
        The cleaned text as a str with no newlines or repeated spaces.
    """
    # spaCy does not recognize '--'; replace it with a space so the words on
    # either side stay separate
    text = re.sub(r'--', ' ', text)
    # raw string: '\[' inside a plain literal is an invalid escape sequence
    text = re.sub(r'\[.*?\]', '', text)
    text = ' '.join(text.split())
    return text

#load each novel, strip its chapter headings, then run the shared cleaner
#the heading format is idiosyncratic per book: 'Chapter <digits>' is removed
#from Persuasion, everything from 'CHAPTER' to the end of the line from Alice
persuasion = text_cleaner(
    re.sub(r'Chapter \d+', '', gutenberg.raw('austen-persuasion.txt')))
alice = text_cleaner(
    re.sub(r'CHAPTER.*', '', gutenberg.raw('carroll-alice.txt')))

In [8]:
#preview the first 500 characters of the cleaned Alice text
alice[:500]

"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of gettin"

In [9]:
#parse cleaned novels
#'en' was a spaCy 2.x shortcut link that spaCy 3 no longer supports, so load
#the model by its package name instead
#NOTE(review): assumes the old 'en' link pointed at en_core_web_sm — confirm
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [10]:
#pair every spaCy sentence with its author label
alice_sents = [[span, 'Carroll'] for span in alice_doc.sents]
persuasion_sents = [[span, 'Austen'] for span in persuasion_doc.sents]

#stack both novels into one df: column 0 = sentence, column 1 = author
sentences = pd.DataFrame([*alice_sents, *persuasion_sents])
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [11]:
#total number of labelled sentences across both novels
len(sentences)

5318

In [5]:
#BoW, exclude stopwords & punctuation, use lemmas, keep the most common words
def bag_of_words(text, n_words=2000):
    """Return the most frequent lemmas of a parsed text.

    Args:
        text: iterable of tokens exposing `lemma_`, `is_punct` and `is_stop`
            (the notebook passes a parsed spaCy document).
        n_words: how many of the most common lemmas to keep. Defaults to
            2000, the value that used to be hard-coded.

    Returns:
        A list of lemmas ordered from most to least frequent, at most
        `n_words` long.
    """
    #count lemmas, skipping punctuation and stopwords
    lemma_counts = Counter(
        token.lemma_ for token in text
        if not (token.is_punct or token.is_stop))

    #keep only the lemma from each (lemma, count) pair
    return [lemma for lemma, _ in lemma_counts.most_common(n_words)]

#creates a dataframe with features for each word in common word set
#values are count of times word appears in each sentence
def bow_features(sentences, common_words):
    """Build a bag-of-words count table with one row per sentence.

    Args:
        sentences: DataFrame whose column 0 holds tokenized sentences
            (iterables of tokens exposing `lemma_`, `is_punct`, `is_stop`)
            and whose column 1 holds the source label.
        common_words: iterable of lemmas (set or list) to use as features.

    Returns:
        DataFrame indexed like `sentences`, with one integer count column
        per word followed by 'text_sentence' and 'text_source'.
    """
    #fix an explicit column order up front: pandas 2.x does not accept a set
    #as column labels or as a .loc indexer
    vocab = list(dict.fromkeys(common_words))
    column_of = {word: col for col, word in enumerate(vocab)}

    #accumulate counts in a plain array; updating the frame one cell at a
    #time with df.loc is far slower and only works with a 0..n-1 index
    counts = np.zeros((len(sentences), len(vocab)), dtype=int)

    #process each row, counting word occurrences
    for i, sentence in enumerate(sentences[0]):

        #count lemmas, skipping punct, stops, & uncommon words
        for token in sentence:
            if token.is_punct or token.is_stop:
                continue
            col = column_of.get(token.lemma_)
            if col is not None:
                counts[i, col] += 1

        #progress output to make sure the kernel isn't hanging
        if i % 100 == 0:
            print('processing row {}'.format(i))

    df = pd.DataFrame(counts, columns=vocab, index=sentences.index)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]

    return df

#build each novel's bag of most frequent lemmas
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

#the feature vocabulary is the set of unique words across both bags
common_words = {*alicewords, *persuasionwords}

In [6]:
#create df with features
#bow_features is slow on the full corpus, so reuse the precomputed table when
#it is available and only rebuild it when the file is missing
#NOTE(review): the precomputed file was uploaded by someone else — confirm its
#word columns match the common_words built in this notebook
try:
    word_counts = pd.read_csv('bow_features_alice_persuasion')
except FileNotFoundError:
    word_counts = bow_features(sentences, common_words)

word_counts.head()

Unnamed: 0,mortification,violently,class,pause,apartment,domestic,legged,spread,correct,had,...,conqueror,upper,capital,roar,gather,calm,tunnel,tooth,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Alice was beginning to get very tired of sitti...,Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,So she was considering in her own mind (as wel...,Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,There was nothing so VERY remarkable in that; ...,Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Oh dear!,Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,I shall be late!',Carroll


In [7]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
import time

rfc = ensemble.RandomForestClassifier()
y = word_counts['text_source']
#features = every column except the raw sentence and its label
#(axis must be passed by keyword in pandas 2.x)
X = np.array(word_counts.drop(['text_sentence','text_source'], axis=1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
train = rfc.fit(X_train, y_train)
print('train score: {}'.format(rfc.score(X_train, y_train)))
print('test score: {}'.format(rfc.score(X_test, y_test)))
print('{} seconds'.format(time.perf_counter() - start_time))

train score: 0.9855799373040752
test score: 0.8825187969924813
0.6346969999999956 seconds


**Result:** the random forest overfits (train score 0.99 vs. test score 0.88), a common problem when combining BoW features with RF. Try logistic regression next.

In [8]:
from sklearn.linear_model import LogisticRegression

#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
lr = LogisticRegression().fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('train score: {}'.format(lr.score(X_train, y_train)))
print('test score: {}'.format(lr.score(X_test, y_test)))
print('{} seconds'.format(time.perf_counter() - start_time))

(3190, 3062) (3190,)
train score: 0.9579937304075236
test score: 0.9158834586466166
0.19104199999999594 seconds


In [9]:
#fit and score sklearn's gradient boosting on the same train/test split
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
gbc = ensemble.GradientBoostingClassifier().fit(X_train, y_train)
print('train score: {}'.format(gbc.score(X_train, y_train)))
print('test score: {}'.format(gbc.score(X_test, y_test)))
print('{} seconds'.format(time.perf_counter() - start_time))

train score: 0.8846394984326019
test score: 0.8735902255639098
34.904721 seconds


## Same model, new inputs

See whether the model can still distinguish the authors when given a different sample of Austen's work, *Emma*. First, load and process *Emma* the same way as the earlier novels.

In [None]:
#clean: load Emma, drop its volume and chapter headings, run the shared cleaner
emma = gutenberg.raw('austen-emma.txt')
for heading in (r'VOLUME \w+', r'CHAPTER \w+'):
    emma = re.sub(heading, '', emma)
emma = text_cleaner(emma)
print(emma[:100])

In [None]:
#parse
#run the cleaned Emma text through the same spaCy pipeline object (nlp)
emma_doc = nlp(emma)

In [None]:
#group into sentences; both novels get the Austen label
persuasion_sents = [[span, "Austen"] for span in persuasion_doc.sents]

#keep only as many Emma sentences as Alice has (emma is longer)
emma_sents = [[span, "Austen"] for span in emma_doc.sents][:len(alice_sents)]

In [None]:
#wrap the labelled Emma sentences in a df and count the shared vocabulary
#(bow_features reads column 0 as the sentence and column 1 as the label)
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

In [None]:
#model with LR

#pick the Carroll rows of the training split with a positional boolean mask:
#X_train is a plain numpy array in shuffled order, while y_train still carries
#the original word_counts index labels, so those labels must not be used as
#row positions
carroll_mask = (y_train == 'Carroll').values

#line Emma's word counts up with the feature columns the model was trained
#on (same columns, same order; words Emma's table lacks count as 0)
feature_cols = word_counts.drop(['text_sentence','text_source'], axis=1).columns
X_emma = emma_bow.reindex(columns=feature_cols, fill_value=0).values.astype(int)

#combine emma and alice
X_Emma_test = np.concatenate((X_train[carroll_mask], X_emma), axis=0)

#ignore_index gives the combined labels a clean 0..n-1 index with no duplicates
y_Emma_test = pd.concat(
    [y_train[carroll_mask],
     pd.Series(['Austen'] * emma_bow.shape[0])],
    ignore_index=True)

print('test score: {}'.format(lr.score(X_Emma_test, y_Emma_test)))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)

## Challenge 0
Improve performance: try other models, adjust spacy params, features, etc

In [11]:
#try XGBoost, a gradient boosting implementation I've had better results with
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

#NOTE(review): some xgboost releases require integer-encoded class labels —
#confirm the installed version accepts the string labels in y_train
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
xgb_clf = XGBClassifier(n_estimators=1000,
                        learning_rate=0.05,).fit(X_train, y_train)

#this takes too long
#params = [{'max_depth':[2, 3, 5, 10],
#           'learning_rate':[0.1, 0.05, 0.01],
#           'subsample':[0.25, 0.5, 0.75, 1]}]

#grid = GridSearchCV(estimator=xgb_clf, param_grid=params).fit(X_train, y_train)
#print('params:', grid.best_params_)

#xgb_clf = XGBClassifier(
#    max_depth=grid.best_params_.get('max_depth'),
#    learning_rate=grid.best_params_.get('learning_rate'),
#    subsample=grid.best_params_.get('subsample')
#).fit(X_train, y_train)

print('train score: {}'.format(xgb_clf.score(X_train, y_train)))
print('test score: {}'.format(xgb_clf.score(X_test, y_test)))
print('{} seconds'.format(time.perf_counter() - start_time))

train score: 0.9156739811912226
test score: 0.8947368421052632
130.015896 seconds


## Challenge 1
Pull a new book and see if model can identify it vs alice

In [27]:
#load Macbeth and apply the same heading removal and cleaning used for Emma
#NOTE(review): the VOLUME/CHAPTER patterns were carried over from the Emma
#cell; presumably they match nothing in a play — verify
mb = gutenberg.raw('shakespeare-macbeth.txt')
for heading in (r'VOLUME \w+', r'CHAPTER \w+'):
    mb = re.sub(heading, '', mb)
mb = text_cleaner(mb)
print(mb[:100])

#parse with the shared spaCy pipeline
mb_doc = nlp(mb)

Actus Primus. Scoena Prima. Thunder and Lightning. Enter three Witches. 1. When shall we three meet 


In [28]:
#label every Macbeth sentence with its author (the label was misspelled
#'Shakesspeare'), then cap the list at Alice's sentence count
mb_sents = [[sent, 'Shakespeare'] for sent in mb_doc.sents]
mb_sents = mb_sents[0:len(alice_sents)]

In [29]:
#wrap the labelled Macbeth sentences in a df and count the shared vocabulary
#(bow_features reads column 0 as the sentence and column 1 as the label)
mb_sentences = pd.DataFrame(mb_sents)
mb_bow = bow_features(mb_sentences, common_words)

processing row 0
processing row 100
processing row 200
processing row 300
processing row 400
processing row 500
processing row 600
processing row 700
processing row 800
processing row 900
processing row 1000
processing row 1100
processing row 1200
processing row 1300
processing row 1400
processing row 1500
processing row 1600


In [33]:
#split the Macbeth BoW table into features and labels
#(axis is passed by keyword: pandas 2.x rejects it as a positional argument)
X_mb = mb_bow.drop(['text_sentence', 'text_source'], axis=1)
y_mb = mb_bow.text_source

#take the Carroll rows from the Alice/Persuasion table
alice_wc = word_counts[word_counts.text_source == 'Carroll']
X_alice = alice_wc.drop(['text_sentence', 'text_source'], axis=1)
y_alice = alice_wc.text_source

#stack Macbeth on top of Alice; concat lines the word columns up by name
X = pd.concat([X_mb, X_alice], axis=0)
y = pd.concat([y_mb, y_alice], axis=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

#refit logistic regression on the new two-author problem and inspect errors
lr = LogisticRegression().fit(X_train, y_train)
print('train set: {}'.format(lr.score(X_train, y_train)))
print('test set: {}'.format(lr.score(X_test, y_test)))
lr_mb_predicted = lr.predict(X_test)
pd.crosstab(y_test, lr_mb_predicted)

train set: 0.9415584415584416
test set: 0.8779940119760479


col_0,Carroll,Shakesspeare
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,510,137
Shakesspeare,26,663
