#### Using LSA to train a classifier for character gender recognition in Shakespeare plays.

In [1]:
import os
import os.path
import shutil
import numpy as np
import pandas as pd
import random
import pickle

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.externals import joblib
from sklearn.externals.joblib import Memory

import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

print('NTLK version', nltk.__version__)

import matplotlib.pyplot as plt
%matplotlib inline

NTLK version 3.2.3


In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Windows\ServiceProfiles\Lo
[nltk_data]     calService\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Windows\ServiceProfile
[nltk_data]     s\LocalService\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Prepare labels (male / female)

[Female characters](http://www.shakespeareswords.com/Special-Features-Female-Characters)

In [3]:
female_characters_df = pd.read_csv("female_characters.csv")
female_characters_df.character = female_characters_df.character.str.lower()

In [4]:
female_characters_df.sample(n= 10)

Unnamed: 0,character,play_name,gender,replics
69,dionyza,"Pericles, Prince of Tyre",Female,88
80,margaret,Much Ado About Nothing,Female,68
141,juliet,Measure for Measure,Female,10
53,duchess of york,Richard III,Female,141
50,miranda,The Tempest,Female,153
165,all the ladies,Timon of Athens,Female,1
34,katherina,Taming of the Shrew,Female,221
25,constance,King John,Female,264
157,hostess,Taming of the Shrew,Female,4
11,mistress page,TheMerry Wives of Windsor,Female,328


In [5]:
with open('shakespeare_plays.pickle', 'rb') as handle:
    speeches = pickle.load(handle)

In [6]:
speeches_df = pd.DataFrame(speeches)
speeches_df.speaker = speeches_df.speaker.str.lower()

In [7]:
speeches_df.sample(n = 10)

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text
20324,1,Tragedy,Hamlet,5,Another part of the platform.,ghost,64,[Beneath] Swear.
3136,5,Comedy,Cymbeline,5,Cymbeline's tent.,imogen,56,"Peace, my lord; hear, hear--"
4758,3,Comedy,Measure for Measure,2,The street before the prison.,lucio,24,"No, indeed, will I not, Pompey; it is not the ..."
4011,5,Comedy,Love's Labours Lost,2,The same.,boyet,151,"They will, they will, God knows,\nAnd leap for..."
9804,5,Comedy,Taming of the Shrew,2,Padua. LUCENTIO'S house.,hortensio,63,I know her answer.
15151,2,History,Henry VIII,3,An ante-chamber of the QUEEN'S apartments.,anne,7,"By my troth and maidenhead,\nI would not be a ..."
1368,3,Comedy,As You Like It,2,The forest.,orlando,115,Who stays it still withal?
22260,1,Tragedy,King Lear,4,A hall in the same.,goneril,104,Safer than trust too far:\nLet me still take a...
6836,1,Comedy,A Midsummer Night's Dream,1,Athens. The palace of THESEUS.,theseus,13,Either to die the death or to abjure\nFor ever...
8651,4,Comedy,"Pericles, Prince of Tyre",1,Tarsus. An open place near the sea-shore.,first pirate,29,"Hold, villain!\nLEONINE runs away"


In [8]:
our_names = set(speeches_df.play_name)
print(our_names)

{'Macbeth', 'Coriolanus', "Winter's Tale", 'Richard II', "All's Well That Ends Well", "A Midsummer Night's Dream", 'Pericles, Prince of Tyre', 'Taming of the Shrew', 'Much Ado About Nothing', 'Titus Andronicus', 'Measure for Measure', 'As You Like It', 'Othello', 'The Comedy of Errors', 'King John', 'The Merchant of Venice', 'Antony and Cleopatra', 'Hamlet', 'Richard III', 'Timon of Athens', 'Henry V', 'King Lear', 'Julius Caesar', 'Twelfth Night', 'TheMerry Wives of Windsor', 'Two Gentlemen of Verona', "Love's Labours Lost", 'Henry VIII', 'Troilus and Cressida', 'Cymbeline', 'Romeo and Juliet', 'The Tempest'}


In [9]:
their_names = set(female_characters_df.play_name.unique())

In [10]:
print(their_names)

{'Macbeth', 'Coriolanus', "Winter's Tale", 'Richard II', "All's Well That Ends Well", "A Midsummer Night's Dream", 'Henry VI', 'Taming of the Shrew', 'Pericles, Prince of Tyre', 'Much Ado About Nothing', 'Titus Andronicus', 'Measure for Measure', 'As You Like It', 'Othello', 'The Comedy of Errors', 'King John', 'The Merchant of Venice', 'Antony and Cleopatra', 'Hamlet', 'Richard III', 'The Two Noble Kinsmen', 'King Lear', 'Henry V', 'Julius Caesar', 'Timon of Athens', 'Twelfth Night', 'TheMerry Wives of Windsor', 'Henry IV', 'Two Gentlemen of Verona', "Love's Labours Lost", 'Henry VIII', 'King Edward III', 'Troilus and Cressida', 'Cymbeline', 'Romeo and Juliet', 'The Tempest'}


Check the differences

In [11]:
print(our_names - their_names)

set()


In [12]:
print(their_names - our_names)

{'King Edward III', 'Henry VI', 'The Two Noble Kinsmen', 'Henry IV'}


It looks like the differences are only plays Shakespeare co-authored so looks OK.

In [13]:
our_speakers = set(speeches_df.speaker)
print('All characters')
print(our_speakers)

All characters
{'hernia', 'bastard', 'cato', 'peaseblossom', 'fourth soldier', 'first merchant', 'capulet', 'alexander', 'macmorris', 'juliet', 'francisca', 'priam', 'hymen', 'imogen', 'of buckingham', 'fourth messenger', 'trebonius', 'macbeth', 'lady  capulet', 'katharine', 'dercetas', 'lords', 'leontes', 'garter', 'elbow', 'second watchman', 'capucius', 'sheriff', 'proculeius', 'jupiter', 'second conspirator', 'mamillius', 'bawd', 'popilius', 'duke', 'cobweb', 'third messenger', 'ford', 'lucio', 'dumain', 'prince', 'juno', 'all the lords', 'boatswain', 'hector', 'diomedes', 'thurio', 'gentlewoman', 'third bandit', 'grumio', 'simple', 'stephano', 'alexas', 'snug', 'first officer', 'senators', 'hermia', 'christopher', 'miranda', 'philostrate', 'soothsayer', 'oswald', 'balthasar', 'first brother', 'valentine', 'essex', 'prologue', 'lavinia', 'second fisherman', 'first knight', 'dolabella', 'tamora', 'second commoner', 'first watchman', 'officer', 'verges', 'danes', 'mariner', 'armado', 

In [14]:
their_speakers = set(female_characters_df.character)
print('Female characters')
print(their_speakers)

Female characters
{'jessica', 'bianca', 'katherine', 'lady northumberland', 'doll', 'juliet', 'francisca', 'girl', 'iris', 'ghost of anne', 'virgilia', 'katharine', 'queen elizabeth and duchess of york', 'helenus', 'dionyza', 'daughter', 'ceres', 'adriana', 'katherina', 'mistress ford', 'volumnia', 'bawd', 'constance', 'phrynia and timandra', 'charmian', 'calphurnia', 'all witches', 'woman', 'juno', 'thaisa', 'queen margaret', 'regan', 'duchess', 'lychorida', 'gentlewoman', 'helena', 'hermia', 'nurse', 'miranda', 'old lady', 'nell', 'isabella', 'viola', 'lavinia', 'alice', 'tamora', 'lady capulet', 'abbess', 'all the ladies', 'princess', 'paulina', 'octavia', 'celia', 'julia', 'mistress quickly', 'marina', "queen's lady", 'hecat', 'andromache', 'lady faulconbridge', 'lady', 'mistress page', 'hero', 'queen elizabeth', 'iras', 'countess', 'second queen', 'rosalind', 'innogen', 'mopsa', 'pucelle', 'phebe', 'mistress quickly as queen of fairies', 'ladies', 'bona', 'margaret', 'cassandra', 

Add 'female' column to the dataset

In [15]:
speeches_df['female'] = speeches_df.apply(lambda r : r['speaker'] in their_speakers, axis=1)

Some speeaches are by a group of people

In [16]:
speeches_df[speeches_df.speaker.str.startswith('all')]

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text,female
268,2,Comedy,All's Well That Ends Well,3,Paris. The KING's palace.,all,34,"We understand it, and thank heaven for you.",False
587,4,Comedy,All's Well That Ends Well,1,Without the Florentine camp.,all,26,"Cargo, cargo, cargo, villiando par corbo, cargo.",False
3062,5,Comedy,Cymbeline,4,A British prison.,all,19,"Thanks, Jupiter!",False
5692,3,Comedy,TheMerry Wives of Windsor,2,A street.,all,33,Have with you to see this monster.\nExeunt,False
6504,3,Comedy,The Merchant of Venice,2,Belmont. A room in PORTIA'S house.,all,10,"Ding, dong, bell.",False
6904,1,Comedy,A Midsummer Night's Dream,2,Athens. QUINCE'S house.,all,32,"That would hang us, every mother's son.",False
7030,3,Comedy,A Midsummer Night's Dream,1,The wood. TITANIA lying asleep.,all,55,Where shall we go?,False
8374,1,Comedy,"Pericles, Prince of Tyre",4,Tarsus. A room in the Governor's house.,all,19,The gods of Greece protect you!\nAnd we'll pra...,False
8508,2,Comedy,"Pericles, Prince of Tyre",4,Tyre. A room in the Governor's house.,all,14,"Live, noble Helicane!",False
8993,0,Comedy,Taming of the Shrew,2,A bedchamber in the Lord's house.,all,24,Amen.,False


We remove them from the data

In [17]:
speeches_df.drop(speeches_df[speeches_df.speaker.str.startswith('all')].index, inplace = True)

In [18]:
speeches_df.sample(n = 10)

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text,female
24981,1,Tragedy,Romeo and Juliet,1,Verona. A public place.,benvolio,65,But new struck nine.,False
26497,5,Tragedy,Timon of Athens,1,The woods. Before Timon's cave.,first senator,69,I like this well; he will return again.,False
17658,4,History,Richard III,4,Before the palace.,stanley,160,"No, my good lord, my friends are in the north.",False
17556,4,History,Richard III,4,Before the palace.,king richard iii,58,All unavoided is the doom of destiny.,False
2450,1,Comedy,Cymbeline,4,Rome. Philario's house.,philario,2,You speak of him when he was less furnished th...,False
385,2,Comedy,All's Well That Ends Well,5,Paris. The KING's palace.,bertram,18,Is there any unkindness between my lord and yo...,False
19477,3,Tragedy,Coriolanus,1,Rome. A street.,brutus,58,"Why, shall the people give\nOne that speaks th...",False
799,5,Comedy,All's Well That Ends Well,1,Marseilles. A street.,widow,11,"Lord, how we lose our pains!",True
16689,1,History,Richard III,1,London. A street.,gloucester,1,Now is the winter of our discontent\nMade glor...,False
21677,3,Tragedy,Julius Caesar,2,The Forum.,antony,92,"Belike they had some notice of the people,\nHo...",False


The data is imbalanced we will have to take it into account when fitting the model.

In [19]:
speeches_df.groupby(['female']).size()

female
False    22275
True      4728
dtype: int64

In [20]:
# 0 - male, 1 - female
labels = [ 1 if f else 0 for f in speeches_df.female.values ]

Text preporcessing (not used in the final version as it makes CountVectorizer very slow), built in functionality is used instead

In [21]:
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [22]:
t = speeches_df.loc[334, 'speech_text']
print(t)

Although before the solemn priest I have sworn,
I will not bed her.


In [23]:
print(tokenize(t))

['although', 'befor', 'the', 'solemn', 'priest', 'I', 'have', 'sworn', ',', 'I', 'will', 'not', 'bed', 'her', '.']


In [24]:
features = speeches_df['speech_text'].values

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify = labels, test_size = 0.10, random_state = 100
)

Search for best parameters

In [26]:
shutil.rmtree('pipeline', ignore_errors = True)
os.makedirs('pipeline')

pipe = Pipeline(
    memory = 'pipeline',
    steps=[
        # Create feature space
        ('vect', CountVectorizer(min_df=2, stop_words='english', lowercase=True)),
        ('tfidf', TfidfTransformer()),        
        # Perform LSA on the features
        ('svd', TruncatedSVD()),
        # faster than SVC, default loss is 'hinge'
        ('clf', SGDClassifier(class_weight= 'balanced', verbose = 0, n_jobs = -1, max_iter = 1000))
    ]
)

param_grid = {
    #'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
    'tfidf__norm': ['l1', 'l2'],
    'svd__n_components': [200, 250, 300],
    'clf__alpha': [0.00001, 0.000001],
    'clf__penalty': ('l2', 'elasticnet')
}

model = GridSearchCV(
    pipe,
    param_grid = param_grid,
    cv = StratifiedKFold(random_state = 100),
    scoring = 'f1',
    verbose = 1,
    n_jobs = -1)

In [27]:
model = model.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 48.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=100, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory='pipeline',
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__alpha': (1e-05, 1e-06), 'svd__n_components': [200, 250, 300], 'tfidf__norm': ('l1', 'l2'), 'clf__penalty': ('l2', 'elasticnet')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=1)

In [28]:
print("The best parameters are %s with a score of %0.2f" % (model.best_params_, model.best_score_))

The best parameters are {'tfidf__norm': 'l2', 'clf__penalty': 'l2', 'clf__alpha': 1e-05, 'svd__n_components': 250} with a score of 0.33


In [29]:
y_hat = model.predict(X_test)

In [30]:
print(classification_report(y_test, y_hat))

             precision    recall  f1-score   support

          0       0.86      0.58      0.69      2228
          1       0.22      0.55      0.31       473

avg / total       0.75      0.58      0.63      2701

