In [1]:
import re
import scipy
import spacy
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import ensemble
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from nltk.corpus import gutenberg, stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)    
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
    text = ' '.join(text.split())
    return text

emma = gutenberg.raw('austen-emma.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')
sense = gutenberg.raw('austen-sense.txt')

emma = text_cleaner(emma)
persuasion = text_cleaner(persuasion)
sense = text_cleaner(sense)

nlp = spacy.load('en')
emma_doc = nlp(emma)
persuasion_doc = nlp(persuasion)
sense_doc = nlp(sense)

In [3]:
persuasion_sents = []
emma_sents = []
sense_sents = []
for sentence in persuasion_doc.sents:
    persuasion_sents.append(sentence)
for sentence in emma_doc.sents:
    emma_sents.append(sentence)
for sentence in sense_doc.sents:
    sense_sents.append(sentence)

In [4]:
persuasion_sents = persuasion_sents[:1000]
emma_sents = emma_sents[:1000]
sense_sents = sense_sents[:1000]

In [5]:
emma_df = [[sent, "emma"] for sent in emma_sents]
persuasion_df = [[sent, "persuasion"] for sent in persuasion_sents]
sense_df = [[sent, "sense"] for sent in sense_sents]
sentences = pd.DataFrame(emma_df + persuasion_df + sense_df)

In [7]:
def bag_of_words(text):    
    allwords = [token.lemma_ for token in text if not token.is_punct and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(750)]
def remove_missing_words(words, sentences):
    text = ''
    for token in sentences:
        text += token.text
    for word in words:
        if word not in text:
            words.remove(word)
    return words
def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0

    for i, sentence in enumerate(df['text_sentence']):
        if i % 250 == 0:
            print("Processing row {}".format(i))
        words = [token.lemma_
                 for token in sentence
                 if (not token.is_punct and not token.is_stop and token.lemma_ in common_words)]
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [8]:
emma_words = bag_of_words(emma_doc)
sense_words = bag_of_words(sense_doc)
persuasion_words = bag_of_words(persuasion_doc)

emma_words = remove_missing_words(emma_words, emma_sents)
persuasion_words = remove_missing_words(persuasion_words, persuasion_sents)
sense_words = remove_missing_words(sense_words, sense_sents)

common_words = set(emma_words + sense_words + persuasion_words)


In [9]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 250
Processing row 500
Processing row 750
Processing row 1000
Processing row 1250
Processing row 1500
Processing row 1750
Processing row 2000
Processing row 2250
Processing row 2500
Processing row 2750


Unnamed: 0,delight,effect,scarcely,word,bless,arrive,look,fortune,longer,servant,...,help,mean,light,wise,scheme,idea,thought,Dashwood,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(CHAPTER),emma
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, Emma, Woodhouse, ,, handsome, ,, clever, ,...",emma
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(She, was, the, youngest, of, the, two, daught...",emma
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Her, mother, had, died, too, long, ago, for, ...",emma
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Sixteen, years, had, Miss, Taylor, been, in, ...",emma


In [10]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
train = rfc.fit(X_train, y_train)
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.8738888888888889

Test set score: 0.6441666666666667


In [11]:
cross_val_score(rfc, X, Y, cv=5)

array([0.51833333, 0.555     , 0.60833333, 0.60333333, 0.55333333])

In [12]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(1800, 1028) (1800,)
Training set score: 0.8327777777777777

Test set score: 0.6441666666666667




In [13]:
cross_val_score(lr, X, Y, cv=5)



array([0.56333333, 0.61333333, 0.62333333, 0.6       , 0.575     ])

In [14]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.6994444444444444

Test set score: 0.6175


In [15]:
cross_val_score(clf, X, Y, cv=5)

array([0.545     , 0.59      , 0.61166667, 0.59666667, 0.565     ])

In [16]:
def document_freq(data, sentences, common_words, doc_names, doc_words):    
    df = pd.DataFrame(columns = common_words)
    df.iloc[:, 0] = [0, 0, 0, 0, 0, 0]
    df.loc[:, common_words] = 0
    df.rename(index={0:'df', 1:'cf', 2:'idf', 3:'emma', 4:'persuasion', 5:'sense'}, inplace=True)
    
    for word in common_words:
        df.loc['df', word] = data[data[word] > 0][word].count()
        df.loc['cf', word] = data.loc[:, word].sum()
        df.loc['idf', word] = np.log2(len(sentences)/df.loc['df', word])
    for word in df.columns:
        for i in range(len(doc_names)):
            if word in doc_words[i]:
                df.loc[doc_names[i], word] = df.loc['idf', word]
    return df

In [17]:
doc_names = ['emma', 'persuasion', 'sense']
doc_words = [emma_words, persuasion_words, sense_words]
tf_idf = document_freq(word_counts, sentences, common_words, doc_names, doc_words)
tf_idf

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,delight,effect,scarcely,word,bless,arrive,look,fortune,longer,servant,...,Kellynch,single,help,mean,light,wise,scheme,idea,thought,Dashwood
df,20.0,8.0,12.0,28.0,7.0,6.0,61.0,33.0,24.0,13.0,...,40.0,11.0,18.0,53.0,4.0,5.0,6.0,37.0,24.0,64.0
cf,20.0,8.0,12.0,28.0,7.0,6.0,62.0,35.0,24.0,14.0,...,42.0,11.0,18.0,54.0,4.0,5.0,6.0,37.0,24.0,65.0
idf,7.228819,8.550747,7.965784,6.743392,8.743392,8.965784,5.620009,6.506353,6.965784,7.850307,...,6.228819,8.091315,7.380822,5.822826,9.550747,9.228819,8.965784,6.341293,6.965784,5.550747
emma,7.228819,8.550747,7.965784,6.743392,0.0,0.0,5.620009,6.506353,6.965784,7.850307,...,0.0,8.091315,7.380822,5.822826,9.550747,0.0,8.965784,6.341293,6.965784,0.0
persuasion,7.228819,8.550747,7.965784,6.743392,8.743392,8.965784,5.620009,6.506353,6.965784,7.850307,...,6.228819,0.0,7.380822,5.822826,0.0,9.228819,8.965784,6.341293,6.965784,0.0
sense,7.228819,8.550747,7.965784,6.743392,0.0,8.965784,5.620009,6.506353,6.965784,7.850307,...,0.0,0.0,7.380822,5.822826,9.550747,0.0,0.0,6.341293,6.965784,5.550747


In [18]:
tf_idf = tf_idf.T
tf_idf.head()

Unnamed: 0,df,cf,idf,emma,persuasion,sense
delight,20.0,20.0,7.228819,7.228819,7.228819,7.228819
effect,8.0,8.0,8.550747,8.550747,8.550747,8.550747
scarcely,12.0,12.0,7.965784,7.965784,7.965784,7.965784
word,28.0,28.0,6.743392,6.743392,6.743392,6.743392
bless,7.0,7.0,8.743392,0.0,8.743392,0.0


In [19]:
threshold = 5
tf_idf['emma_threshold'] = 0
tf_idf['persuasion_threshold'] = 0
tf_idf['sense_threshold'] = 0

tf_idf['emma_threshold'] = np.where(tf_idf['emma'] > threshold, 1, 0)
tf_idf['persuasion_threshold'] = np.where(tf_idf['persuasion'] > threshold, 1, 0)
tf_idf['sense_threshold'] = np.where(tf_idf['sense'] > threshold, 1, 0)

tf_idf.head()

Unnamed: 0,df,cf,idf,emma,persuasion,sense,emma_threshold,persuasion_threshold,sense_threshold
delight,20.0,20.0,7.228819,7.228819,7.228819,7.228819,1,1,1
effect,8.0,8.0,8.550747,8.550747,8.550747,8.550747,1,1,1
scarcely,12.0,12.0,7.965784,7.965784,7.965784,7.965784,1,1,1
word,28.0,28.0,6.743392,6.743392,6.743392,6.743392,1,1,1
bless,7.0,7.0,8.743392,0.0,8.743392,0.0,0,1,0


In [20]:
tf_idf.drop(['df', 'cf', 'idf', 'emma', 'persuasion', 'sense'], axis=1, inplace=True)
tf_idf.head()

Unnamed: 0,emma_threshold,persuasion_threshold,sense_threshold
delight,1,1,1
effect,1,1,1
scarcely,1,1,1
word,1,1,1
bless,0,1,0


In [21]:
tf_idf['source'] = 'multiple'
tf_idf.head()

Unnamed: 0,emma_threshold,persuasion_threshold,sense_threshold,source
delight,1,1,1,multiple
effect,1,1,1,multiple
scarcely,1,1,1,multiple
word,1,1,1,multiple
bless,0,1,0,multiple


In [22]:
def set_doc(df):
    for i in range(len(df)):
        flag = 0
        set_source = 'multiple'
        if (df.iloc[i, 0] == 1):
            flag = 1
            set_source = 'emma'
        if (df.iloc[i, 1] == 1):
            if (flag == 1): 
                continue
            flag = 1
            set_source = 'persuasion'
        if df.iloc[i, 2] == 1:
            if flag == 1:
                continue
            flag = 1
            set_source = 'sense'
        df.iloc[i, 3] = set_source
        
    return df

In [23]:
tf_idf = set_doc(tf_idf)
tf_idf.head(10)

Unnamed: 0,emma_threshold,persuasion_threshold,sense_threshold,source
delight,1,1,1,multiple
effect,1,1,1,multiple
scarcely,1,1,1,multiple
word,1,1,1,multiple
bless,0,1,0,persuasion
arrive,0,1,1,multiple
look,1,1,1,multiple
fortune,1,1,1,multiple
longer,1,1,1,multiple
servant,1,1,1,multiple


In [24]:
rfc = ensemble.RandomForestClassifier()
Y = tf_idf['source']
X = tf_idf.drop(['source'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.5)
train = rfc.fit(X_train, y_train)
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 1.0

Test set score: 1.0




In [25]:
cross_val_score(rfc, X, Y, cv=5)

array([1., 1., 1., 1., 1.])

In [26]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(514, 3) (514,)
Training set score: 0.980544747081712

Test set score: 0.9883268482490273




In [27]:
cross_val_score(lr, X, Y, cv=5)



array([0.98076923, 0.98067633, 0.9804878 , 1.        , 0.98039216])

In [28]:
train = clf.fit(X_train, y_train)
print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 1.0

Test set score: 1.0


In [29]:
cross_val_score(lr, X, Y, cv=5)



array([0.98076923, 0.98067633, 0.9804878 , 1.        , 0.98039216])