In [1]:
from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.megam import config_megam, call_megam
from nltk.classify.weka import WekaClassifier, config_weka
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.rte_classify import rte_classifier, rte_features, RTEFeatureExtractor
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
                                  TypedMaxentFeatureEncoding,
                                  ConditionalExponentialClassifier)

### Using NLTK Name corpus to train a Gender Identification classifier

In [17]:
import nltk.corpus
from nltk.corpus import names

In [18]:
import random
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
print('Total names:',len(names))
print(names[0:10])

Total names: 7944
[('Hewitt', 'male'), ('Ewart', 'male'), ('Carita', 'female'), ('Rockwell', 'male'), ('Sanderson', 'male'), ('Marris', 'female'), ('Horst', 'male'), ('Ioana', 'female'), ('Zulema', 'female'), ('Ambrose', 'male')]


In [20]:
#The most important thing for a text classifier is feature, which can be very flexible, and defined by human engineer. Here, we just use the final letter of a given name as the feature, and build a dictionary containing relevant information about a given name:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Gary')

{'last_letter': 'y'}

In [23]:
featuresets = [(gender_features(n), g) for (n, g) in names]
print('Length of feature sets is',len(featuresets))
print(featuresets[0:10])
train_set, test_set = featuresets[500:], featuresets[:500]
print('Length of training set is',len(train_set))
print('Length of testing set is ',len(test_set))


Length of feature sets is 7944
[({'last_letter': 't'}, 'male'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 's'}, 'female'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'male')]
Length of training set is 7444
Length of testing set is  500


###  Training a Naive Bayes classifier for Gender Identification:

In [26]:
#A learning algorithm is very useful for a classifier, here we will show you how to use the Naive Bayes.
from nltk import NaiveBayesClassifier
nb_classifier = NaiveBayesClassifier.train(train_set)
nb_classifier.classify(gender_features('Gary'))
nb_classifier.classify(gender_features('Grace'))
from nltk import classify
print('Accuracy using Naive Bayes Classifier is',classify.accuracy(nb_classifier, test_set))
nb_classifier.show_most_informative_features(5)




Accuracy using Naive Bayes Classifier is 0.744
Most Informative Features
             last_letter = 'a'            female : male   =     38.0 : 1.0
             last_letter = 'k'              male : female =     32.6 : 1.0
             last_letter = 'p'              male : female =     21.2 : 1.0
             last_letter = 'f'              male : female =     16.1 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0


In [None]:
Using Gutenberg and Web_text data. Finding what are the top 5 words that Shakespeare used but we do not use in currently. -Take top 50 words from Shakespeare (all 3 books) and top 50 from Web_text (all the records). 

In [4]:
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import webtext

In [2]:
File = []
File.extend(gutenberg.words('shakespeare-caesar.txt'))
File.extend(gutenberg.words('shakespeare-hamlet.txt'))
File.extend(gutenberg.words('shakespeare-macbeth.txt'))
from collections import Counter
shakespeare = [F.lower() for F in File]
from nltk.corpus import stopwords
filter_file = [word for word in shakespeare if word not in stopwords.words('english')]
sh = []
for h in filter_file:
    if h.isalpha():
        sh.append(h)
#print(sh)
from collections import Counter
shakespeare = Counter(sh)
top_50_shakespeare = shakespeare.most_common(50)
print(" Top 50 words from Shakespeare :\n",top_50_shakespeare)

 Top 50 words from Shakespeare :
 [('haue', 448), ('ham', 337), ('thou', 312), ('shall', 300), ('lord', 293), ('come', 232), ('king', 231), ('enter', 230), ('good', 218), ('let', 217), ('thy', 202), ('caesar', 193), ('vs', 184), ('know', 176), ('thee', 174), ('would', 170), ('vpon', 162), ('brutus', 162), ('like', 162), ('bru', 153), ('well', 152), ('hath', 144), ('selfe', 143), ('man', 139), ('may', 138), ('macb', 137), ('yet', 136), ('heere', 135), ('must', 130), ('say', 130), ('tis', 129), ('th', 125), ('make', 119), ('speake', 119), ('loue', 119), ('giue', 118), ('see', 116), ('time', 115), ('sir', 114), ('night', 114), ('one', 112), ('st', 110), ('cassi', 107), ('ile', 106), ('doe', 103), ('hamlet', 100), ('go', 100), ('men', 96), ('hor', 95), ('vp', 94)]
