In [1]:
# 监督学习：特征提取，语言模型的构建
# 决策树，朴素贝叶斯分类器，最大熵分类

In [2]:
def gender_features(word):
    '''
    Extract the feature set used for gender classification of a name.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with the single key 'last_letter' mapped to the final
        character of ``word``, or to '' when ``word`` is empty.
    '''
    # Slice instead of index so that an empty string yields '' rather than
    # raising IndexError; for non-empty strings the result is unchanged.
    return {'last_letter': word[-1:]}

In [3]:
# Sanity-check the feature extraction function on a sample name.
gender_features('Shrek')
# The dict returned by this function is called a feature set: it maps feature names to their values. Feature names are case-sensitive strings; feature values are simple types such as booleans, numbers, and strings.

{'last_letter': 'k'}

In [4]:
from nltk.corpus import names
import nltk
import random

In [5]:
# Build one labeled list of (name, gender) pairs from the NLTK names corpus.
# The corpus is reached through `nltk.corpus.names` instead of the bare imported
# `names`, because this assignment rebinds `names` to a list: with the bare name,
# re-running this cell would fail with AttributeError (a list has no `.words`).
names = ([(name, 'male') for name in nltk.corpus.names.words('male.txt')] +
         [(name, 'female') for name in nltk.corpus.names.words('female.txt')])

In [6]:
# Seed the global RNG first so the shuffle, and therefore every train/test
# split and accuracy figure derived from it, is the same on each
# Restart & Run All. The seed value itself is arbitrary.
random.seed(42)
random.shuffle(names)  # reorders the labeled names in place

In [7]:
# Turn every (name, gender) pair into a (feature set, gender) pair, then hold
# out the first 500 items as the test set and train on the remainder.
featuresets = [(gender_features(name), gender) for name, gender in names]
test_set = featuresets[:500]
train_set = featuresets[500:]

In [8]:
# Train a Naive Bayes classifier on the training feature sets.
# NOTE(review): `classifer` is a misspelling of `classifier`; the name is kept as-is because later cells reference it.
classifer = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
# Try the trained classifier on a sample name.
classifer.classify(gender_features('Neo'))

'male'

In [10]:
# Try the trained classifier on a second sample name.
classifer.classify(gender_features('Trinity'))

'female'

In [11]:
# Measure the classifier's accuracy on the held-out test set.
print(nltk.classify.accuracy(classifer, test_set))

0.776


In [12]:
# Show the 5 features with the highest likelihood ratios, i.e. the features that are most effective at distinguishing the gender of a name.
classifer.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.0 : 1.0
             last_letter = 'k'              male : female =     31.8 : 1.0
             last_letter = 'f'              male : female =     17.3 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0


In [13]:
# ===============================结束====================

In [14]:
# Improvement:
# To keep the list of feature sets for a large corpus from exhausting memory, use nltk.classify.apply_features, which returns an object that behaves like a generator instead of a fully built list.
from nltk.classify import apply_features
# apply_features() takes two arguments: the feature extraction function and the dataset.
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

In [15]:
# 特征选取，特征编码

In [16]:
# Example: a feature extractor that overfits the gender features.
# The feature sets it returns contain a large number of very specific features,
# which leads to overfitting on the relatively small names corpus.
def gender_features2(name):
    '''
    Extract a deliberately large feature set from a name.

    Args:
        name: The name to featurize (a string).

    Returns:
        A dict with:
          - 'firstletter' / 'lastletter': the lower-cased first and last
            character of ``name`` ('' when ``name`` is empty);
          - 'count(x)' for every letter a-z: occurrences of that letter in
            the lower-cased name;
          - 'has(x)' for every letter a-z: whether that letter occurs in the
            lower-cased name.
    '''
    # Lower-case once up front; the original recomputed it twice per letter.
    lowered = name.lower()
    features = {}
    # Slices instead of indexes so an empty name yields '' rather than IndexError.
    features['firstletter'] = name[:1].lower()
    features['lastletter'] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)'%letter] = lowered.count(letter)
        features['has(%s)'%letter] = (letter in lowered)
    return features

In [17]:
# Inspect the feature extractor's output for a sample name.
gender_features2('John')

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'firstletter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'lastletter': 'n'}

In [18]:
# 给定的学习算法的特征的数目是有限的--如果你提供太多的特征，那么该算法将高度依赖你的训练数据的特征，在一般化的例子的性能效果不会太好。
# 产生 过拟合 的问题。

# 使用gender_features2(name)的特征提取器的结果作为input X，将会过拟合这个相对较小的训练集，
# 造成这个系统的精度比只考虑每个名字的最后一个字母的分类器的精度低约1%


In [19]:
# Rebuild the feature sets with the larger gender_features2 extractor, keep the
# same 500-item hold-out split, then train and report test-set accuracy.
featuresets = [(gender_features2(name), gender) for name, gender in names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier2 = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier2, test_set))

0.784


In [20]:
# 结果还是0.78,说明此处的 朴素贝叶斯分类器的特征过程的设置比书中的例子发生了变化

In [21]:
# 错误分析 -- 完善特征集的一个非常有成效的方法。

In [22]:
# Three-way split of the shuffled (name, gender) list.
test_names = names[:500]         # test set: final evaluation of the system
devtest_names = names[500:1500]  # dev-test set: used for error analysis
train_names = names[1500:]       # training set: used to train the model

In [23]:
# Featurize each split with gender_features, train on the training split, and
# report accuracy on the dev-test split.
test_set = [(gender_features(name), gender) for name, gender in test_names]
devtest_set = [(gender_features(name), gender) for name, gender in devtest_names]
train_set = [(gender_features(name), gender) for name, gender in train_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.748


In [24]:
# Collect every dev-test name the classifier gets wrong as a
# (true label, predicted label, name) tuple.
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correctly classified; nothing to record
    errors.append((tag, guess, name))

In [25]:
# Print the list of misclassified names, sorted by (true label, guess, name).
for tag, guess, name in sorted(errors):
    print('correct = {} guess = {} name ={}'.format(tag, guess, name))

correct = female guess = male name =Ag
correct = female guess = male name =Allis
correct = female guess = male name =Arden
correct = female guess = male name =Ardys
correct = female guess = male name =Astrid
correct = female guess = male name =Berget
correct = female guess = male name =Bess
correct = female guess = male name =Bren
correct = female guess = male name =Brooks
correct = female guess = male name =Carmon
correct = female guess = male name =Carolin
correct = female guess = male name =Caron
correct = female guess = male name =Cathrin
correct = female guess = male name =Celestyn
correct = female guess = male name =Charlott
correct = female guess = male name =Charmion
correct = female guess = male name =Cherilyn
correct = female guess = male name =Chloris
correct = female guess = male name =Christan
correct = female guess = male name =Clair
correct = female guess = male name =Clio
correct = female guess = male name =Cristin
correct = female guess = male name =Danit
correct = fem

In [26]:
# 查看上述的错误列表，发现yn结尾的名字显示以女性为主，尽管事实上，n结尾的名字往往是男性；
# 以ch结尾的名字通常是男性，尽管以h结尾的名字倾向于是女性。
# 所以调整我们的特征提取器包括两个字母后缀的特征。

In [27]:
def gender_features(word):
    '''
    Feature extractor using the one- and two-character suffixes of a name.

    Note: this redefines the single-letter gender_features from the earlier
    cell; cells run after this one use the suffix-based version.

    Args:
        word: The name to featurize (a string).

    Returns:
        A dict with 'suffix1' (the last character of ``word``) and 'suffix2'
        (the last two characters). Slicing is used, so names shorter than the
        suffix length simply yield the characters that are available.
    '''
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

In [28]:
# Re-featurize the training and dev-test splits with the suffix-based
# gender_features, retrain, and report dev-test accuracy. In the recorded run
# the accuracy rose from 0.748 (last letter only) to 0.77.
devtest_set = [(gender_features(name), gender) for name, gender in devtest_names]
train_set = [(gender_features(name), gender) for name, gender in train_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.77
