In [50]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from sklearn import svm, datasets

from spacy.lang.en import English

In [51]:
poems = pd.read_csv('poems.csv')
english_poems = poems.loc[poems.country.isin(['American', 'English'])]
mask = (english_poems.groupby('author')['author'].transform(len) > 49)
english_poems = english_poems[mask]

In [56]:
nlp = English()

def lemmatize_poem(poem):
    doc = nlp(poem.lower())
    return " ".join(token.lemma_ for token in doc)

english_poems["lemmas"] = english_poems.body.apply(lemmatize_poem)

In [57]:
english_poems.head()

Unnamed: 0,title,body,author,country,lemmas
1,Villanelle,"Time can say nothing but I told you so,\nTime ...",W. H. Auden,English,"time can say nothing but i tell you so , \n ti..."
10,"Think No More, Lad","Think no more, lad; laugh, be jolly:\nWhy shou...",A. E. Housman,English,"think no much , lad ; laugh , be jolly : \n wh..."
13,Buffalo Dusk,THE BUFFALOES are gone.\nAnd those who saw the...,Carl Sandburg,American,the buffalo be go . \n and that who see the bu...
19,Ode For An Agricultural Celebration,"Far back in the ages,\nThe plough with wreaths...",William Cullen Bryant,American,"far back in the age , \n the plough with wreat..."
24,"Star Light, Star Bright--","Star, that gives a gracious dole,\nWhat am I t...",Dorothy Parker,American,"star , that give a gracious dole , \n what be ..."


In [83]:
train_idx, test_idx = train_test_split(english_poems.index, test_size=0.2, random_state=4, stratify=english_poems.author)

poems_train = english_poems.body.loc[train_idx]
lemma_poems_train = english_poems.lemmas.loc[train_idx]

poems_test = english_poems.body.loc[test_idx]
lemma_poems_test = english_poems.lemmas.loc[test_idx]

vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(poems_train.values)

lemma_vectorizer = TfidfVectorizer()
lemma_vectorizer = vectorizer.fit(lemma_poems_train.values)

features_train = vectorizer.transform(poems_train.values)
features_test = vectorizer.transform(poems_test.values)

lemma_features_train = lemma_vectorizer.transform(lemma_poems_train.values)
lemma_features_test = lemma_vectorizer.transform(lemma_poems_test.values)

features_train = np.concatenate((features_train.toarray(), lemma_features_train.toarray()), axis=1)
features_test = np.concatenate((features_test.toarray(), lemma_features_test.toarray()), axis=1)

author_train = english_poems.author.loc[train_idx]
author_test = english_poems.author.loc[test_idx]

In [85]:
print(lemma_features_train.shape)
features_train.shape

(1480, 17752)


(1480, 35504)

In [86]:
C = 1.0  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C, max_iter=10000),
          svm.SVC(kernel='rbf', gamma=0.85, C=C),
          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C))
models = [clf.fit(features_train, author_train) for clf in models]

print('SVC with linear kernel')
svc_predictions = models[0].predict(features_test)
print("Test set Accuracy: ", accuracy_score(author_test, svc_predictions))

print('LinearSVC (linear kernel)')
linear_svc_predictions = models[1].predict(features_test)
print("Test set Accuracy: ", accuracy_score(author_test, linear_svc_predictions))

print('SVC with RBF kernel')
rbf_kernel_predictions = models[2].predict(features_test)
print("Test set Accuracy: ", accuracy_score(author_test, rbf_kernel_predictions))

print('SVC with polynomial (degree 3) kernel')
poly_kernel_predictions = models[3].predict(features_test)
print("Test set Accuracy: ", accuracy_score(author_test, poly_kernel_predictions))

SVC with linear kernel
Test set Accuracy:  0.5621621621621622
LinearSVC (linear kernel)
Test set Accuracy:  0.6
SVC with RBF kernel
Test set Accuracy:  0.44324324324324327
SVC with polynomial (degree 3) kernel
Test set Accuracy:  0.1972972972972973


In [34]:
linear_svc_predictions

array(['Anne Sexton', 'William Blake', 'John Keats', 'Anne Sexton',
       'John Milton', 'William Ernest Henley', 'Charles Bukowski',
       'William Blake', 'Edward Lear', 'Anne Sexton', 'Edward Lear',
       'Robert Herrick', 'Edward Lear', 'Carl Sandburg',
       'Louisa May Alcott', 'W. S. Merwin', 'William Cowper',
       'Robert Frost', 'Walter Savage Landor', 'Robert Herrick',
       'William Shakespeare', 'Robert Herrick', 'William Cullen Bryant',
       'Christina Rossetti', 'Thomas Hardy', 'Robert Herrick',
       'Lord Byron', 'Robert Herrick', 'Charles Bukowski',
       'Dante Gabriel Rossetti', 'Lord Byron', 'Alfred Lord Tennyson',
       'Charles Bukowski', 'Edgar Allan Poe', 'W. H. Auden',
       'Dante Gabriel Rossetti', 'Carl Sandburg', 'William Shakespeare',
       'Louisa May Alcott', 'Anne Sexton', 'Wallace Stevens',
       'Sylvia Plath', 'W. H. Auden', 'Sylvia Plath', 'Conrad Aiken',
       'Ralph Waldo Emerson', 'John Donne', 'Edgar Allan Poe',
       'Dorothy P

In [35]:
author_test

2711                    John Keats
4374                 William Blake
1057                    John Keats
3755                   Anne Sexton
1907                   John Milton
2408         William Ernest Henley
4247              Charles Bukowski
1996                 William Blake
5460                   Edward Lear
1351                  Thomas Hardy
7229                   Anne Sexton
7292                Robert Herrick
6253                   Edward Lear
2560                 Carl Sandburg
5387          Walter Savage Landor
2693                  W. S. Merwin
144                 William Cowper
7001                Dorothy Parker
5424                 Sara Teasdale
5630         William Ernest Henley
3149           William Shakespeare
1061             Louisa May Alcott
1954         William Cullen Bryant
3120                 Sara Teasdale
7350                  Thomas Hardy
3481                    John Donne
858                     Lord Byron
5284                    John Keats
6220              Ch

In [41]:
from sklearn.metrics import confusion_matrix
classes = list(set(author_test))
cm = confusion_matrix(author_test, linear_svc_predictions, labels=classes)
df = pd.DataFrame(cm, columns=classes, index=classes)
df

Unnamed: 0,William Blake,A. E. Housman,William Wordsworth,Walter Savage Landor,William Ernest Henley,John Donne,William Cowper,W. S. Merwin,William Shakespeare,Edna St. Vincent Millay,...,Edward Lear,Alfred Lord Tennyson,John Milton,Sara Teasdale,Edgar Allan Poe,Thomas Hardy,Lord Byron,Anne Sexton,Henry Wadsworth Longfellow,Conrad Aiken
William Blake,7,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
A. E. Housman,0,5,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
William Wordsworth,0,0,5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,2,0,0,0
Walter Savage Landor,0,1,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
William Ernest Henley,1,0,0,0,4,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
John Donne,0,0,0,0,0,8,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
William Cowper,0,0,0,1,0,0,8,0,1,0,...,0,0,0,0,0,0,0,0,0,0
W. S. Merwin,0,0,0,0,0,0,0,9,0,0,...,0,0,0,0,0,0,0,0,0,0
William Shakespeare,0,0,0,0,0,1,0,0,8,0,...,0,0,0,0,0,0,0,0,0,0
Edna St. Vincent Millay,0,1,0,0,0,0,0,0,0,2,...,0,1,0,0,0,1,0,0,0,0


In [13]:
# viewing incorrect predictions
error_mask = linear_svc_predictions != author_test
error_idx = test_idx[error_mask]
error_poems = english_poems.loc[error_idx]

# actual, predicted
mistakes = list(zip(error_poems["author"], linear_svc_predictions[error_mask]))
len(mistakes)

370

In [42]:
df['Edna St. Vincent Millay']

William Blake                 0
A. E. Housman                 0
William Wordsworth            0
Walter Savage Landor          0
William Ernest Henley         1
John Donne                    0
William Cowper                0
W. S. Merwin                  0
William Shakespeare           0
Edna St. Vincent Millay       2
John Keats                    0
Sylvia Plath                  0
Emily Dickinson               0
Elizabeth Barrett Browning    0
Louisa May Alcott             0
Robert Frost                  0
Dante Gabriel Rossetti        0
Charles Bukowski              0
Christina Rossetti            0
W. H. Auden                   0
Dorothy Parker                0
Ralph Waldo Emerson           0
Wallace Stevens               0
Robert Herrick                0
William Cullen Bryant         0
Carl Sandburg                 0
E. E. Cummings                0
Edward Lear                   0
Alfred Lord Tennyson          1
John Milton                   0
Sara Teasdale                 1
Edgar Al

In [43]:
df.loc['Edna St. Vincent Millay']

William Blake                 0
A. E. Housman                 1
William Wordsworth            0
Walter Savage Landor          0
William Ernest Henley         0
John Donne                    0
William Cowper                0
W. S. Merwin                  0
William Shakespeare           0
Edna St. Vincent Millay       2
John Keats                    0
Sylvia Plath                  0
Emily Dickinson               0
Elizabeth Barrett Browning    0
Louisa May Alcott             0
Robert Frost                  0
Dante Gabriel Rossetti        0
Charles Bukowski              1
Christina Rossetti            1
W. H. Auden                   0
Dorothy Parker                1
Ralph Waldo Emerson           1
Wallace Stevens               0
Robert Herrick                0
William Cullen Bryant         0
Carl Sandburg                 1
E. E. Cummings                0
Edward Lear                   0
Alfred Lord Tennyson          1
John Milton                   0
Sara Teasdale                 0
Edgar Al

In [47]:
for poem in english_poems[english_poems.author == 'Edna St. Vincent Millay'].body:
    print(poem)
    print(" ")
    print(" ")
    print("---------")

All I could see from where I stood
Was three long mountains and a wood;
I turned and looked another way,
And saw three islands in a bay.
So with my eyes I traced the line
Of the horizon, thin and fine,
Straight around till I was come
Back to where I’d started from;
And all I saw from where I stood
Was three long mountains and a wood.
Over these things I could not see;
These were the things that bounded me;
And I could touch them with my hand,
Almost, I thought, from where I stand.
And all at once things seemed so small
My breath came short, and scarce at all.
But, sure, the sky is big, I said;
Miles and miles above my head;
So here upon my back I’ll lie
And look my fill into the sky.
And so I looked, and, after all,
The sky was not so very tall.
The sky, I said, must somewhere stop,
And—sure enough!—I see the top!
The sky, I thought, is not so grand;
I ‘most could touch it with my hand!
And reaching up my hand to try,
I screamed to feel it touch the sky.
I screamed, and—lo!—Infinity
Ca

In [14]:
len(author_test)

699

In [30]:
english_poems.head()

Unnamed: 0,title,body,author,country
1,Villanelle,"Time can say nothing but I told you so,\nTime ...",W. H. Auden,English
10,"Think No More, Lad","Think no more, lad; laugh, be jolly:\nWhy shou...",A. E. Housman,English
13,Buffalo Dusk,THE BUFFALOES are gone.\nAnd those who saw the...,Carl Sandburg,American
19,Ode For An Agricultural Celebration,"Far back in the ages,\nThe plough with wreaths...",William Cullen Bryant,American
24,"Star Light, Star Bright--","Star, that gives a gracious dole,\nWhat am I t...",Dorothy Parker,American


In [33]:
for p in english_poems[(english_poems.author == "Conrad Aiken")].body:
    print(p)
    print("------")
    print(" ")

Death himself in the rain . . . death himself . . .
Death in the savage sunlight . . . skeletal death . . .
I hear the clack of his feet,
Clearly on stones, softly in dust;
He hurries among the trees
Whirling the leaves, tossing he hands from waves.
Listen! the immortal footsteps beat.
Death himself in the grass, death himself,
Gyrating invisibly in the sun,
Scatters the grass-blades, whips the wind,
Tears at boughs with malignant laughter:
On the long echoing air I hear him run.
Death himself in the dusk, gathering lilacs,
Breaking a white-fleshed bough,
Strewing purple on a cobwebbed lawn,
Dancing, dancing,
The long red sun-rays glancing
On flailing arms, skipping with hideous knees
Cavorting grotesque ecstasies:
I do not see him, but I see the lilacs fall,
I hear the scrape of knuckles against the wall,
The leaves are tossed and tremble where he plunges among them,
And I hear the sound of his breath,
Sharp and whistling, the rythm of death.
It is evening: the lights on a long street