In [20]:
import nltk
from nltk.corpus import names
def gender_stats(filename:str):
    dataset_lenght = len(names.words(filename))
    first_10_names = names.words(filename)[:10]
    last_letters = [name[-1] for name in names.words(filename)]
    last_letter_freq = nltk.FreqDist(last_letters)

    print(f'Dataset lenght is: {dataset_lenght} \nThe first names of the dataset are: {first_10_names} \nLast letters of the names of this dataset are: {last_letter_freq.most_common(5)}')

In [21]:
#By using this text classifier we will demonstrate that male and female names have some distinctive characteristics. Names ending in a, e, and i are likely to be female names
gender_stats("female.txt")

Dataset lenght is: 5001 
The first names of the dataset are: ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale'] 
Last letters of the names of this dataset are: [('a', 1773), ('e', 1432), ('y', 461), ('n', 386), ('i', 317)]


In [3]:
gender_stats("male.txt")

Dataset lenght is: 2943 
The first names of the dataset are: ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim'] 
Last letters of the names of this dataset are: [('n', 478), ('e', 468), ('y', 332), ('s', 230), ('d', 228)]


In [25]:
#In the above example, we defined a feature extractor function that returns the last letter of a given name.
#We used this function to extract the features from the names corpus and trained a Naive Bayes classifier.
from nltk.corpus import names
import random

def gender_features(word):
    return {'last_letter': word[-1]}

# preparing a list of examples and corresponding class labels.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

# we use the feature extractor to process the names data.
featuresets = [(gender_features(n), g) for (n,g) in names]

# Divide the resulting list of feature
# sets into a training set and a test set.
train_set, test_set = featuresets[500:], featuresets[:500]

# The training set is used to
# train a new "naive Bayes" classifier.
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
print(classifier.show_most_informative_features(26))

0.77
Most Informative Features
             last_letter = 'k'              male : female =     43.0 : 1.0
             last_letter = 'a'            female : male   =     38.8 : 1.0
             last_letter = 'f'              male : female =     16.6 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'd'              male : female =      9.5 : 1.0
             last_letter = 'm'              male : female =      8.6 : 1.0
             last_letter = 'o'              male : female =      8.1 : 1.0
             last_letter = 'r'              male : female =      6.8 : 1.0
             last_letter = 'z'              male : female =      5.6 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
             last_letter = 'g'              male : female =      5.2 : 1.0
             last_letter = 't'              male : female =      4.1 

In [33]:
print(classifier.classify(gender_features('mahavir')))

female


In [30]:
classifier.classify(gender_features('Trinity'))

'female'

In [35]:
#However, there are usually limits to the number of features that you should use with a given learning algorithm — if you provide too many features, then the algorithm will have a higher chance of relying on idiosyncrasies of your training data that don't generalize well to new examples. This problem is known as overfitting
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features
gender_features2('John')

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [43]:
#Observer 1%lower accuracy than above due to overfitting
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.786


'female'

In [9]:
#Once an initial set of features has been chosen, a very productive method for refining the feature set is error analysis. First, we select a development set, containing the corpus data for creating the model.
#This development set is then subdivided into the training set and the dev-test set.
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [10]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.753


In [11]:
#Using the dev-test set, we can generate a list of the errors that the classifier makes when predicting name genders:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Aeriell                       
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Alyson                        
correct=female   guess=male     name=Anais                         
correct=female   guess=male     name=Ardis                         
correct=female   guess=male     name=Averyl                        
correct=female   guess=male     name=Babs                          
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Brandais                      
correct=female   guess=male     name=Brear                         
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Carilyn                       
correct=female   guess=male     name=Carmen                        
correct=female   guess=male     name=Carolin    

In [12]:
#errors makes it clear that some suffixes that are more than one letter can be indicative of name genders


In [13]:
#adjust our feature extractor to include features for two-letter suffixes:
def gender_features(word):
    return {'suffix1': word[-1:],'suffix2': word[-2:]}
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.772


In [14]:
#Document classification - positive or negative
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\maswa\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [15]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'conta

In [16]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.72


In [17]:
classifier.show_most_informative_features(5)

Most Informative Features
        contains(seagal) = True              neg : pos    =     11.6 : 1.0
   contains(outstanding) = True              pos : neg    =     11.2 : 1.0
         contains(mulan) = True              pos : neg    =      8.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
        contains(wasted) = True              neg : pos    =      6.4 : 1.0


In [18]:
import nltk
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [47]:
from nltk.corpus import treebank
len(treebank.tagged_sents())

3914

In [50]:
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]
train_data[0]
test_data[0]

[('At', 'IN'),
 ('Tokyo', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'NNP'),
 ('index', 'NN'),
 ('of', 'IN'),
 ('225', 'CD'),
 ('selected', 'VBN'),
 ('issues', 'NNS'),
 (',', ','),
 ('which', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('gained', 'VBD'),
 ('132', 'CD'),
 ('points', 'NNS'),
 ('Tuesday', 'NNP'),
 (',', ','),
 ('added', 'VBD'),
 ('14.99', 'CD'),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35564.43', 'CD'),
 ('.', '.')]

In [60]:
from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.accuracy(test_data)

0.875545003237643

In [56]:
import pickle
f = open('tnt_treebank_pos_tagger.pickle','w')
pickle.dump(tnt_pos_tagger,f)
f.close()

TypeError: write() argument must be str, not bytes

In [59]:
tnt_pos_tagger.tag(nltk.word_tokenize("this is tnt treebank tagger"))

[('this', 'DT'),
 ('is', 'VBZ'),
 ('tnt', 'Unk'),
 ('treebank', 'Unk'),
 ('tagger', 'Unk')]

In [66]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

KeyboardInterrupt: 

In [64]:
classifier.classify(pos_features('cats'))

'female'

In [65]:
print(classifier.pseudocode(depth=4))

AttributeError: 'NaiveBayesClassifier' object has no attribute 'pseudocode'

In [None]:
import nltk
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
       features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [None]:
from nltk.corpus import brown
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [None]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [None]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [None]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
                for i in range(1, len(tokens)-1)
                if tokens[i] in '.?!']

In [None]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

In [None]:
text1 = '''
This is a sentence.
We are happy!
This is another sentence
Are you happy?
This is Mr. John.
He lives in U.K.
He earned $8.34.
We're happy again.
'''

In [None]:
for sentence in sent_tokenize(text1):
    print(sentence)


This is a sentence.
We are happy!
This is another sentence
Are you happy?
This is Mr. John.
He lives in U.K.
He earned $8.34.
We're happy again.


In [None]:
#Test set - train and test set are simlar here from news
import random
from nltk.corpus import brown
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [None]:
#Better than above as train and test are taken from different documents
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [None]:
#If we want to perform a more stringent evaluation, we can draw the test set from documents that are less closely related to those in the training set
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

In [None]:
#accuracy, measures the percentage of inputs in the test set that the classifier correctly labeled.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

ValueError: too many values to unpack (expected 2)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Trainng data
train = [('I love this sandwich.', 'pos'),
         ('This is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('This is my best work.', 'pos'),
         ("What an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'),
         ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg') ]

# Test data
test = [('The beer was good.', 'pos'),
        ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'),
        ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg') ]


# Tokenize Training words
Training_words = set(word.lower() for passage in train for word in word_tokenize(passage[0]))

# Training feature sets
training_set = [({word: (word in word_tokenize(x[0])) for word in Training_words}, x[1]) for x in train]

# Tokenize Test words
Test_words = set(word.lower() for passage in test for word in word_tokenize(passage[0]))

# Test feature sets
test_set= [({word: (word in word_tokenize(x[0])) for word in Test_words}, x[1]) for x in test]

# Naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Informative Features
classifier.show_most_informative_features()

# print the accuracy
print("accuracy %",(nltk.classify.accuracy(classifier, test_set))*100)

Most Informative Features
                    this = True              neg : pos    =      2.3 : 1.0
                    this = False             pos : neg    =      1.8 : 1.0
                      an = False             neg : pos    =      1.6 : 1.0
                       . = False             neg : pos    =      1.4 : 1.0
                       . = True              pos : neg    =      1.4 : 1.0
                   about = False             neg : pos    =      1.2 : 1.0
                      am = False             pos : neg    =      1.2 : 1.0
                 amazing = False             neg : pos    =      1.2 : 1.0
                 awesome = False             neg : pos    =      1.2 : 1.0
                   beers = False             neg : pos    =      1.2 : 1.0
accuracy % 83.33333333333334
