In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 영문 이름으로 성별 예측
- NLTK(national language toolkit) 패키지 이용
- NLTK : 교육용으로 개발된 언어 처리 및 문서 분석용 패키지
- 한국어 모듈도 있음

In [2]:
from nltk.corpus import names
import nltk
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\618-01\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [3]:
type(names)

nltk.corpus.util.LazyCorpusLoader

In [4]:
names.words()

['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale',
 'Abra',
 'Acacia',
 'Ada',
 'Adah',
 'Adaline',
 'Adara',
 'Addie',
 'Addis',
 'Adel',
 'Adela',
 'Adelaide',
 'Adele',
 'Adelice',
 'Adelina',
 'Adelind',
 'Adeline',
 'Adella',
 'Adelle',
 'Adena',
 'Adey',
 'Adi',
 'Adiana',
 'Adina',
 'Adora',
 'Adore',
 'Adoree',
 'Adorne',
 'Adrea',
 'Adria',
 'Adriaens',
 'Adrian',
 'Adriana',
 'Adriane',
 'Adrianna',
 'Adrianne',
 'Adrien',
 'Adriena',
 'Adrienne',
 'Aeriel',
 'Aeriela',
 'Aeriell',
 'Ag',
 'Agace',
 'Agata',
 'Agatha',
 'Agathe',
 'Aggi',
 'Aggie',
 'Aggy',
 'Agna',
 'Agnella',
 'Agnes',
 'Agnese',
 'Agnesse',
 'Agneta',
 'Agnola',
 'Agretha',
 'Aida',
 'Aidan',
 'Aigneis',
 'Aila',
 'Aile',
 'Ailee',
 'Aileen',
 'Ailene',
 'Ailey',
 'Aili',
 'Ailina',
 'Ailyn',
 'Aime',
 'Aimee',
 'Aimil',
 'Aina',
 'Aindrea',
 'Ainslee',
 'Ainsley',
 'Ainslie',
 'Ajay',
 'Alaine',
 'Alameda',
 'Alana',
 'Alanah',
 'Alane',
 'Alanna',
 

In [5]:
len(names.words())

7944

In [6]:
names.words('male.txt')[:10] # male names

['Aamir',
 'Aaron',
 'Abbey',
 'Abbie',
 'Abbot',
 'Abbott',
 'Abby',
 'Abdel',
 'Abdul',
 'Abdulkarim']

In [7]:
names.words('female.txt')[:5]

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi']

In [8]:
labeled_names = [(name,'남자') for name in names.words('male.txt')] + \
                [(name,'여자') for name in names.words('female.txt')]
labeled_names[:10]

[('Aamir', '남자'),
 ('Aaron', '남자'),
 ('Abbey', '남자'),
 ('Abbie', '남자'),
 ('Abbot', '남자'),
 ('Abbott', '남자'),
 ('Abby', '남자'),
 ('Abdel', '남자'),
 ('Abdul', '남자'),
 ('Abdulkarim', '남자')]

In [9]:
import random
random.shuffle(labeled_names)

In [10]:
labeled_names[:10] # shuffled name

[('Yolanda', '여자'),
 ('Kristel', '여자'),
 ('Moira', '여자'),
 ('Guthrie', '남자'),
 ('Woodie', '남자'),
 ('Edita', '여자'),
 ('Shadow', '남자'),
 ('Malissia', '여자'),
 ('Belicia', '여자'),
 ('Bernete', '여자')]

In [11]:
# 이름의 마지막 알파벳을 특성으로 사용 (이게 성별과 가장 연관이 깊다고 알려짐)
def gender_feature(word):
    return  { 'last_letter' : word[-1]}
gender_feature('sopi')

{'last_letter': 'i'}

In [12]:
featuresets = [(gender_feature(n),gender) for (n,gender) in labeled_names]
featuresets[:5]

[({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'l'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'e'}, '남자'),
 ({'last_letter': 'e'}, '남자')]

In [13]:
len(featuresets)

7944

In [14]:
train_set, test_set = featuresets[2000:], featuresets[:2000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [15]:
classifier.classify(gender_feature('Sophia'))

'여자'

In [16]:
classifier.classify(gender_feature('Yunji'))

'여자'

In [17]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'                여자 : 남자     =     35.6 : 1.0
             last_letter = 'k'                남자 : 여자     =     35.4 : 1.0
             last_letter = 'f'                남자 : 여자     =     14.5 : 1.0
             last_letter = 'd'                남자 : 여자     =     10.9 : 1.0
             last_letter = 'p'                남자 : 여자     =      8.4 : 1.0


In [18]:
nltk.classify.accuracy(classifier,test_set)

0.748

## 다양한 특성 사용

In [19]:
def gender_feature2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    features["length"] = len(name)
    
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        
    return features
gender_feature('Josssshua')

{'last_letter': 'a'}

In [20]:
featuresets

[({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'l'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'e'}, '남자'),
 ({'last_letter': 'e'}, '남자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'w'}, '남자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'z'}, '남자'),
 ({'last_letter': 'y'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'd'}, '남자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'y'}, '남자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'l'}, '남자'),
 ({'last_letter': 'n'}, '남자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'o'}, '남자'),
 ({'last_letter': 'm'}, '남자'),
 ({'last_letter': 'h'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'a'}, '여자'),
 ({'last_letter': 'e'}, '여자'),
 ({'last_letter': 'l'}, '여자'),
 ({'last_letter': 'i'}, '여자'),
 ({'last_letter': 'r'}, '남자'),
 ({'last

In [21]:
labeled_names[:10]

[('Yolanda', '여자'),
 ('Kristel', '여자'),
 ('Moira', '여자'),
 ('Guthrie', '남자'),
 ('Woodie', '남자'),
 ('Edita', '여자'),
 ('Shadow', '남자'),
 ('Malissia', '여자'),
 ('Belicia', '여자'),
 ('Bernete', '여자')]

In [22]:
len(featuresets)

7944

In [23]:
featuresets = [(gender_feature2(n),gender) for (n,gender) in labeled_names]
featuresets[:3]

[({'first_letter': 'y',
   'last_letter': 'a',
   'length': 7,
   'count(a)': 2,
   'count(b)': 0,
   'count(c)': 0,
   'count(d)': 1,
   'count(e)': 0,
   'count(f)': 0,
   'count(g)': 0,
   'count(h)': 0,
   'count(i)': 0,
   'count(j)': 0,
   'count(k)': 0,
   'count(l)': 1,
   'count(m)': 0,
   'count(n)': 1,
   'count(o)': 1,
   'count(p)': 0,
   'count(q)': 0,
   'count(r)': 0,
   'count(s)': 0,
   'count(t)': 0,
   'count(u)': 0,
   'count(v)': 0,
   'count(w)': 0,
   'count(x)': 0,
   'count(y)': 1,
   'count(z)': 0},
  '여자'),
 ({'first_letter': 'k',
   'last_letter': 'l',
   'length': 7,
   'count(a)': 0,
   'count(b)': 0,
   'count(c)': 0,
   'count(d)': 0,
   'count(e)': 1,
   'count(f)': 0,
   'count(g)': 0,
   'count(h)': 0,
   'count(i)': 1,
   'count(j)': 0,
   'count(k)': 1,
   'count(l)': 1,
   'count(m)': 0,
   'count(n)': 0,
   'count(o)': 0,
   'count(p)': 0,
   'count(q)': 0,
   'count(r)': 1,
   'count(s)': 1,
   'count(t)': 1,
   'count(u)': 0,
   'count(v)': 0,


In [24]:
train_set, test_set = featuresets[-2000:], featuresets[-2000:]
classifer = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier,test_set)

0.7655

In [25]:
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'                여자 : 남자     =     35.6 : 1.0
             last_letter = 'k'                남자 : 여자     =     35.4 : 1.0
             last_letter = 'f'                남자 : 여자     =     14.5 : 1.0
             last_letter = 'd'                남자 : 여자     =     10.9 : 1.0
             last_letter = 'p'                남자 : 여자     =      8.4 : 1.0
             last_letter = 'o'                남자 : 여자     =      8.0 : 1.0
             last_letter = 'r'                남자 : 여자     =      7.9 : 1.0
             last_letter = 'm'                남자 : 여자     =      7.3 : 1.0
             last_letter = 'v'                남자 : 여자     =      7.1 : 1.0
             last_letter = 'z'                남자 : 여자     =      5.7 : 1.0
             last_letter = 'w'                남자 : 여자     =      5.1 : 1.0
             last_letter = 'g'                남자 : 여자     =      4.7 : 1.0
             last_letter = 's'                남자 : 여자     =      4.5 : 1.0

# 마지막 두 레터로 구분

In [26]:
def gender_feature3(word):
    return{'suffix1':word[-1:10],
          'suffix2':word[-2:10],
          }

In [27]:
train_names = labeled_names[2000:] # 뒤 2000개
test_names = labeled_names[:2000] # 앞 2000개
train_set = [(gender_feature3(n),gender) for (n,gender) in train_names]
test_set = [(gender_feature3(n),gender) for (n,gender) in test_names]

In [28]:
train_set[:5]

[({'suffix1': 'a', 'suffix2': 'ra'}, '여자'),
 ({'suffix1': 'a', 'suffix2': 'na'}, '여자'),
 ({'suffix1': 'a', 'suffix2': 'ta'}, '여자'),
 ({'suffix1': 'e', 'suffix2': 'te'}, '여자'),
 ({'suffix1': 'e', 'suffix2': 'ie'}, '여자')]

In [29]:
classifer = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier,test_set)

0.634

In [42]:
train_set[:5]

[({'suffix1': 'a', 'suffix2': 'ra'}, '여자'),
 ({'suffix1': 'a', 'suffix2': 'na'}, '여자'),
 ({'suffix1': 'a', 'suffix2': 'ta'}, '여자'),
 ({'suffix1': 'e', 'suffix2': 'te'}, '여자'),
 ({'suffix1': 'e', 'suffix2': 'ie'}, '여자')]

In [40]:
classifier.classify(gender_feature('yunji'))

'여자'