## Homework #5
### Using LSA to train a classifier for character gender recognition in Shakespeare plays.

In [59]:
import os
import os.path
import shutil
import numpy as np
import pandas as pd
import random
import pickle

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.externals import joblib
from sklearn.externals.joblib import Memory

import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from imblearn.over_sampling import SMOTE 

import matplotlib.pyplot as plt
%matplotlib inline

We first need to download punkt, our sentence tokenizer, and also a dictionary of stopwords that we'll use later to remove stopwords from the texts.

In [2]:
%%capture
nltk.download('punkt')
nltk.download('stopwords')

We'll also need to find and assign genders to all characters and their associated texts. We'll use [this list of female characters](http://www.shakespeareswords.com/Special-Features-Female-Characters).

In [3]:
female_characters_df = pd.read_csv("female_characters.csv")
female_characters_df.character = female_characters_df.character.str.lower()

In [4]:
female_characters_df.sample(n= 3)

Unnamed: 0,character,play_name,gender,replics
93,katharine,Love's Labours Lost,Female,46
164,daughter,"Pericles, Prince of Tyre",Female,2
12,daughter,The Two Noble Kinsmen,Female,324


Now we'll de-serialize the data streams from our pickle file, which contain all texts from all of Shakespeare's plays along with the associated play, speaker, act/scene, and play genre.

In [5]:
with open('shakespeare_plays.pickle', 'rb') as handle:
    speeches = pickle.load(handle)

In [6]:
speeches_df = pd.DataFrame(speeches)
speeches_df.speaker = speeches_df.speaker.str.lower()

In [7]:
speeches_df.sample(n = 3)

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text
9222,2,Comedy,Taming of the Shrew,1,Padua. A room in BAPTISTA'S house.,katharina,57,"No such jade as you, if me you mean."
16417,3,History,Richard II,2,The coast of Wales. A castle in view.,duke of aumerle,28,"My liege, one word."
16682,5,History,Richard II,6,Windsor castle.,henry bolingbroke,5,"Thy pains, Fitzwater, shall not be forgot;\nRi..."


We'll also double check that our list of plays are the same as the plays listed in the genders dataframe.

In [8]:
our_names = set(speeches_df.play_name)
print(our_names)

{"All's Well That Ends Well", "Love's Labours Lost", 'Two Gentlemen of Verona', 'Troilus and Cressida', 'Macbeth', 'Othello', 'Henry V', 'Coriolanus', 'Richard III', 'King John', 'The Merchant of Venice', 'TheMerry Wives of Windsor', 'Titus Andronicus', 'Timon of Athens', 'Julius Caesar', 'Twelfth Night', "Winter's Tale", 'The Comedy of Errors', 'Henry VIII', "A Midsummer Night's Dream", 'Much Ado About Nothing', 'The Tempest', 'Pericles, Prince of Tyre', 'As You Like It', 'Taming of the Shrew', 'Measure for Measure', 'Hamlet', 'Romeo and Juliet', 'Richard II', 'Antony and Cleopatra', 'King Lear', 'Cymbeline'}


In [9]:
their_names = set(female_characters_df.play_name.unique())

In [10]:
print(their_names)

{"All's Well That Ends Well", "Love's Labours Lost", 'Two Gentlemen of Verona', 'Troilus and Cressida', 'Othello', 'Macbeth', 'The Two Noble Kinsmen', 'Henry V', 'Coriolanus', 'Richard III', 'King John', 'The Merchant of Venice', 'TheMerry Wives of Windsor', 'Titus Andronicus', 'Timon of Athens', 'Julius Caesar', 'Twelfth Night', "Winter's Tale", 'The Comedy of Errors', 'Henry VIII', "A Midsummer Night's Dream", 'Much Ado About Nothing', 'Henry VI', 'The Tempest', 'Pericles, Prince of Tyre', 'As You Like It', 'Taming of the Shrew', 'Measure for Measure', 'Hamlet', 'Romeo and Juliet', 'Richard II', 'Antony and Cleopatra', 'King Edward III', 'Henry IV', 'King Lear', 'Cymbeline'}


Now we check to see if our texts include any plays that aren't included in the genders dataframe and note that there are none.

In [11]:
print(our_names - their_names)

set()


On the other hand, it looks like the additional plays in the genders dataframe are those that Shakespeare co-authored, so looks we're looking okay.

In [12]:
print(their_names - our_names)

{'Henry IV', 'King Edward III', 'Henry VI', 'The Two Noble Kinsmen'}


Now we create sets of our speakers and add a 'female' column to the dataset.

In [13]:
our_speakers = set(speeches_df.speaker)
their_speakers = set(female_characters_df.character)

speeches_df['female'] = speeches_df.apply(lambda r : r['speaker'] in their_speakers, axis=1)

We also note that some of the texts are spoken by multiple people:

In [14]:
speeches_df[speeches_df.speaker.str.startswith('all')][0:3]

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text,female
268,2,Comedy,All's Well That Ends Well,3,Paris. The KING's palace.,all,34,"We understand it, and thank heaven for you.",False
587,4,Comedy,All's Well That Ends Well,1,Without the Florentine camp.,all,26,"Cargo, cargo, cargo, villiando par corbo, cargo.",False
3062,5,Comedy,Cymbeline,4,A British prison.,all,19,"Thanks, Jupiter!",False


Because we are examining differences in male/female speech in this analysis, we will discard all text spoken by multiple people.

In [15]:
speeches_df.drop(speeches_df[speeches_df.speaker.str.startswith('all')].index, inplace = True)
speeches_df.sample(n = 3)

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text,female
15272,3,History,Henry VIII,2,Ante-chamber to KING HENRY VIII's apartment.,suffolk,31,'Tis so.\nThe cardinal!\nEnter CARDINAL WOLSEY...,False
9831,5,Comedy,Taming of the Shrew,2,Padua. LUCENTIO'S house.,lucentio,90,But a harsh hearing when women are froward.,False
17300,3,History,Richard III,4,The Tower of London.,hastings,13,I thank your grace.,False


Now that we have a dataframe with all the data that we need, we turn our attention to the imbalance in male/female texts. We will have to take this into account when fitting the model.

In [16]:
speeches_df.groupby(['female']).size()

female
False    22275
True      4728
dtype: int64

In [17]:
# 0 - male, 1 - female
labels = [ 1 if f else 0 for f in speeches_df.female.values ]

Now we can perform text preprocessing, steming and tokenizing our words.

This will not used in the final version as it makes CountVectorizer very slow. The built in functionality will be used instead.

In [18]:
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

Below we note that we've successfully tokenized and stemmed our words.'

In [19]:
t = speeches_df.loc[334, 'speech_text']
print(t)

Although before the solemn priest I have sworn,
I will not bed her.


In [20]:
print(tokenize(t))

['although', 'befor', 'the', 'solemn', 'priest', 'I', 'have', 'sworn', ',', 'I', 'will', 'not', 'bed', 'her', '.']


Finally, we can begin the analysis, splitting our features into test and train sets.

In [21]:
features = speeches_df['speech_text'].values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    labels, 
                                                    stratify = labels, 
                                                    test_size = 0.10, 
                                                    random_state = 100)

Now we can create a feature space, perform dimension reduction, and search for the best preforming parameters. We'll try three models: an SGDClassifier, RandomForestClassifier, and KNeighborsClassifier.

In [63]:
shutil.rmtree('pipeline_sgd', ignore_errors = True)
os.makedirs('pipeline_sgd')

pipe_sgd = Pipeline(
    memory = 'pipeline',
    steps=[
        # Create feature space
        ('vect', CountVectorizer(min_df=2, 
                                 stop_words='english', 
                                 lowercase=True,
                                 strip_accents='unicode')),
                                 #ngram_range=(2,3))
        ('tfidf', TfidfTransformer()),        
        # Perform LSA on the features
        ('svd', TruncatedSVD()),
        # faster than SVC, default loss is 'hinge'
        ('clf', SGDClassifier(class_weight= 'balanced', 
                              verbose = 0, 
                              n_jobs = -1, 
                              max_iter = 1000))
    ]
)

param_grid_sgd = {
    'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
    'tfidf__norm': ['l1', 'l2'],
    'svd__n_components': [250, 300, 350],
    'clf__alpha': [0.00001, 0.000001],
    'clf__penalty': ('l2', 'elasticnet')
}

model_sgd = GridSearchCV(
    pipe_sgd,
    param_grid = param_grid_sgd,
    cv = StratifiedKFold(random_state = 100),
    scoring = 'f1',
    verbose = 1,
    n_jobs = -1)

In [65]:
shutil.rmtree('pipeline_rf', ignore_errors = True)
os.makedirs('pipeline_rf')

pipe_rf = Pipeline(
    memory = 'pipeline',
    steps=[
        # Create feature space
        ('vect', CountVectorizer(min_df=2, 
                                 stop_words='english', 
                                 lowercase=True,
                                 strip_accents='unicode')),
                                 #ngram_range=(2,3))
        ('tfidf', TfidfTransformer()),        
        # Perform LSA on the features
        ('svd', TruncatedSVD()),
        ('rf', RandomForestClassifier(criterion='gini',
                                       min_samples_split=2, 
                                       min_samples_leaf=1, 
                                       min_weight_fraction_leaf=0.0, 
                                       max_features='auto', 
                                       max_leaf_nodes=None, 
                                       min_impurity_decrease=0.0, 
                                       min_impurity_split=None, 
                                       bootstrap=True, 
                                       oob_score=False, 
                                       n_jobs=1, 
                                       random_state=None, 
                                       verbose=0, 
                                       warm_start=False, 
                                       class_weight=None))
    ]
)

param_grid_rf = {
    'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
    'tfidf__norm': ['l1', 'l2'],
    'svd__n_components': [250, 300, 350],
    'rf__n_estimators':[5,10,15]
}

model_rf = GridSearchCV(
    pipe_rf,
    param_grid = param_grid_rf,
    cv = StratifiedKFold(random_state = 100),
    scoring = 'f1',
    verbose = 1,
    n_jobs = -1)

In [64]:
shutil.rmtree('pipeline_knn', ignore_errors = True)
os.makedirs('pipeline_knn')

pipe_knn = Pipeline(
    memory = 'pipeline',
    steps=[
        # Create feature space
        ('vect', CountVectorizer(min_df=2, 
                                 stop_words='english', 
                                 lowercase=True,
                                 strip_accents='unicode')),
                                 #ngram_range=(2,3))
        ('tfidf', TfidfTransformer()),        
        # Perform LSA on the features
        ('svd', TruncatedSVD()),
        ('knn',KNeighborsClassifier(weights='uniform', 
                                    algorithm='auto', 
                                    p=2, 
                                    metric='minkowski', 
                                    metric_params=None, 
                                    n_jobs=1))
    ]
)

param_grid_knn = {
    'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
    'tfidf__norm': ['l1', 'l2'],
    'svd__n_components': [250, 300, 350],
    'knn__n_neighbors':[3,4,5,6,7,8]
}

model_knn = GridSearchCV(
    pipe_knn,
    param_grid = param_grid_knn,
    cv = StratifiedKFold(random_state = 100),
    scoring = 'f1',
    verbose = 1,
    n_jobs = -1)

In [66]:
model_sgd = model_sgd.fit(X_train, y_train)
model_rf = model_rf.fit(X_train, y_train)
model_knn = model_knn.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 23.7min finished


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  4.5min finished


Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 50.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 246.2min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 280.4min finished


We can take a look at how the model's performance varied with different parameters:

In [81]:
results_sgd = pd.DataFrame({'mean_test_score': np.array(model_sgd.cv_results_['mean_test_score']),
                        'tfidf__norm': np.array(model_sgd.cv_results_['param_tfidf__norm']),
                        'svd__n_components': np.array(model_sgd.cv_results_['param_svd__n_components']),
                        'clf__alpha': np.array(model_sgd.cv_results_['param_clf__alpha']),
                        'clf__penalty': np.array(model_sgd.cv_results_['param_clf__penalty'])})

results_rf = pd.DataFrame({'mean_test_score': np.array(model_rf.cv_results_['mean_test_score']),
                        'tfidf__norm': np.array(model_rf.cv_results_['param_tfidf__norm']),
                        'svd__n_components': np.array(model_rf.cv_results_['param_svd__n_components']),
                        'rf__n_estimators': np.array(model_rf.cv_results_['param_rf__n_estimators'])})

results_knn = pd.DataFrame({'mean_test_score': np.array(model_knn.cv_results_['mean_test_score']),
                        'tfidf__norm': np.array(model_knn.cv_results_['param_tfidf__norm']),
                        'svd__n_components': np.array(model_knn.cv_results_['param_svd__n_components']),
                        'knn__n_neighbors': np.array(model_knn.cv_results_['param_knn__n_neighbors'])})

print(results_sgd.head())
print(results_rf.head())
print(results_knn.head())

  clf__alpha clf__penalty  mean_test_score svd__n_components tfidf__norm
0      1e-05           l2         0.283419               250          l1
1      1e-05           l2         0.251819               250          l1
2      1e-05           l2         0.321952               250          l2
3      1e-05           l2         0.325283               250          l2
4      1e-05           l2         0.289454               300          l1
   mean_test_score rf__n_estimators svd__n_components tfidf__norm
0         0.131355                5               250          l1
1         0.145271                5               250          l1
2         0.141113                5               250          l2
3         0.138767                5               250          l2
4         0.131378                5               300          l1
  knn__n_neighbors  mean_test_score svd__n_components tfidf__norm
0                3         0.152070               250          l1
1                3         0.15225

And we can see the best set of parameters for each of the models:

In [87]:
print("The best parameters are %s with a score of %0.2f" % (model_sgd.best_params_, model_sgd.best_score_))
print()
print("The best parameters are %s with a score of %0.2f" % (model_rf.best_params_, model_rf.best_score_))
print()
print("The best parameters are %s with a score of %0.2f" % (model_knn.best_params_, model_knn.best_score_))

The best parameters are {'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'svd__n_components': 350, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1)} with a score of 0.34

The best parameters are {'rf__n_estimators': 5, 'svd__n_components': 250, 'tfidf__norm': 'l1', 'vect__ngram_range': (1, 2)} with a score of 0.15

The best parameters are {'knn__n_neighbors': 3, 'svd__n_components': 350, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 2)} with a score of 0.16


And finally we can test on our test data and compare the results of the best set of parameters for each of our models.

In [83]:
y_hat_sgd = model_sgd.predict(X_test)
y_hat_rf = model_rf.predict(X_test)
y_hat_knn = model_knn.predict(X_test)

In [84]:
print(classification_report(y_test, y_hat_sgd))

             precision    recall  f1-score   support

          0       0.86      0.65      0.74      2228
          1       0.23      0.49      0.31       473

avg / total       0.75      0.62      0.66      2701



In [85]:
print(classification_report(y_test, y_hat_rf))

             precision    recall  f1-score   support

          0       0.83      0.93      0.88      2228
          1       0.22      0.09      0.13       473

avg / total       0.72      0.78      0.75      2701



In [86]:
print(classification_report(y_test, y_hat_knn))

             precision    recall  f1-score   support

          0       0.83      0.93      0.87      2228
          1       0.19      0.08      0.12       473

avg / total       0.72      0.78      0.74      2701

