In [1]:
from rnn import Rnn
import numpy as np
import random

# note: use python2, not 3

In [2]:
class NameRnn(Rnn):
    '''
    RNN which learns from a list of names.

    Extends the project's ``Rnn`` base class with name-specific sampling
    (``get`` / ``print_names``), validation of generated names, and a
    training ``step`` that advances the data pointer one name at a time.
    '''

    def __init__(self):
        '''Initialise the base Rnn with names2.txt and set the name-length limits.'''
        # note default is names.txt, I made names2.txt
        super(NameRnn, self).__init__('names2.txt')
        # Inclusive bounds on the length of a generated name accepted by validate().
        self.minimum_name_length = 3
        self.maximum_name_length = 12
        # Presumably consumed by the base class's training loop -- confirm in rnn.py.
        self.iterations_per_log = 1000

    def validate(self, name):
        '''Validate that the first char and length are appropriate.

        Returns True when ``name`` is non-empty, starts with an uppercase
        ASCII letter and has a length within
        [minimum_name_length, maximum_name_length]; False otherwise.
        '''
        if not name:
            # Guard the name[0] lookup below against empty input.
            return False
        return self.is_letter(name[0]) and self.is_acceptable_length(name)

    def is_letter(self, character):
        '''Return True if ``character`` is an uppercase ASCII letter (A-Z).'''
        # The previous range(65, 90) check stopped at 'Y'; 'Z' (90) must be included.
        return ord('A') <= ord(character) <= ord('Z')

    def is_acceptable_length(self, name):
        '''Return True if ``name`` satisfies both the minimum and maximum length.'''
        return self.meets_maximum_length(name) and self.meets_minimum_length(name)

    def meets_minimum_length(self, name):
        '''Return True if ``name`` has at least minimum_name_length characters.'''
        return len(name) >= self.minimum_name_length

    def meets_maximum_length(self, name):
        '''Return True if ``name`` has at most maximum_name_length characters.'''
        return len(name) <= self.maximum_name_length

    def get(self, num):
        '''Gets a list of generated names.

        Repeatedly samples 30 characters from the model, starting from a
        randomly chosen vocabulary character and a random hidden state, and
        keeps the whitespace-separated tokens that pass validate().

        num -- number of names wanted.

        Returns a list of exactly ``num`` capitalized names (empty when
        ``num`` is not positive). Calls saveParameters() before returning.
        '''
        names = []
        while len(names) < num:
            start_char_id = random.randint(0, len(self.char_to_ix)-1)
            start = self.ix_to_char[start_char_id]

            # Start every sample from a fresh random hidden state.
            self.hprev = np.random.randn(len(self.hprev), 1)
            sample_ix = self.sample(self.char_to_ix[start], 30, training=False)
            txt = ''.join(self.ix_to_char[ix] for ix in sample_ix)

            # Clean up: keep only the tokens that look like plausible names.
            names.extend(name.capitalize() for name in txt.split()
                         if self.validate(name))
        self.saveParameters()
        # One sample can yield several valid names, so trim any overshoot to
        # honour the requested count.
        return names[:num]

    def print_names(self, rows, columns):
        '''Print ``rows`` lines, each holding ``columns`` generated names.

        Every name is left-justified in a 15-character wide column.
        '''
        # Generate every row first, then print, as the original code did.
        table = [self.get(columns) for _ in range(rows)]
        for row in table:
            print(''.join(name.ljust(15) for name in row))

    def step(self, p):
        '''Does the heavy lifting: one training update starting at data offset ``p``.

        Runs lossFun over the next seq_length characters, applies an Adagrad
        update to the weights and biases in place, and returns the offset at
        which the next step should start.

        p -- index into self.data of the first input character.

        Returns the index just past the first space in the current window, so
        that consecutive steps begin on successive names (names2.txt is
        space-separated). If the window holds no space, the whole window is
        skipped instead.
        '''
        # NOTE(review): this running loss is re-initialised on every call and is
        # neither stored nor returned, so it has no effect outside this method.
        smooth_loss = -np.log(1.0/self.vocab_size)*self.seq_length # loss at iteration 0
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+self.seq_length+1 >= len(self.data):
            self.hprev = np.zeros((self.hidden_size,1)) # reset RNN memory
            p = 0 # go from start of data

        window = self.data[p:p+self.seq_length]
        inputs = [self.char_to_ix[ch] for ch in window]
        targets = [self.char_to_ix[ch] for ch in self.data[p+1:p+self.seq_length+1]]

        # forward seq_length characters through the net and fetch gradient
        loss, dWxh, dWhh, dWhy, dbh, dby = self.lossFun(inputs, targets)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # perform parameter update with Adagrad
        # (the += operators update the arrays held on self in place)
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                [dWxh, dWhh, dWhy, dbh, dby],
                [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

        # move data pointer to the character after the first separator
        try:
            advance = window.index(' ') + 1
        except ValueError:
            # No separator inside this window (e.g. a name longer than
            # seq_length): skip the window instead of crashing the training loop.
            advance = self.seq_length
        return p + advance

In [3]:
# Instantiate the name model; NameRnn.__init__ passes 'names2.txt' to the base
# Rnn constructor.
x = NameRnn()

data has 8193 characters, 28 of which are unique.
Loaded previous results


In [4]:
# Train for 100,000 iterations. The saved output interleaves batches of sampled
# names with "% of process completed" progress lines.
x.train(num_iterations=100000)

----
 UARLUS
MAHNAY
JENARTON
ELRICK
KIFETON
SHESJEHNTH
LANLOY
HAURIBEN
DOMY
ALDOND
RONDY
PERRIS
ISEY
BERTON
DANCAS
CKRARO
K
EVIN
SHETHAN
ALLANDSEL
JAY
JOHDORA
VEN
CHARTANSEOD
WILVAR
CEAN
TILED
HELTON
OLIUS 
----
0.0% of process completed
----
 CORARDREY
FROREY
WILDOS
ARUDLY
EDY
AURE
WILTREY
ARINCLENDEON
VINNY
VIN
RIGLARL
KROBHAEL
FREGALD
ARERBURT
JUDLANED
JANNENTO
SHAS
BRUDE
BREMBIAY
KILMILEO
WARNEL
TOBEFREM
DALSIACERON
QURED
AKIERUICLY
TO 
----
1.0% of process completed
----
 WILFCEDRICADY
LEM
STUSWARD
ROUAL
LICURSAN
SEAN
LOGE
DOND
KARLIOTON
CORRO
OSVIL
JETY
LAMILESIA
GLAQUESTER
WAMON
DIROR
CLICKE
VAEREL
EWMIE
NASTEVAN
HENMOD
EREE
NINSO
ALBERT
DAUREY
ETSEXAN
DARDILLO
FUREL 
----
2.0% of process completed
----
 ITT
PERREY
SHES
ALEXWALLOTHATIAN
RECOL
DANTO
FRALEDEN
GONNITO
DAHCHIOD
TYE
DAWMONY
BUNE
KEVICO
ALLIE
EMARG
WILPH
LEXIELE
REDLUIS
ERIMO
ALVHON
CLUFRELOLEW
ISE
AGEDENEY
BERWEL
JODARSAN
MELSA
EDY
ED
LAND 
----
3.0% of process completed
----
 IN
WILD
TALIN
MITCARY
JOLAN
RELIAK
LIMIT

----
 E
JOSE
RADY
QUINCE
FEDAL
SHAUG
LINCBEREY
MAN
EROWALE
DARTON
DELEZO
MORRWARDRIO
JAMEL
CARONN
RIS
GURDREO
AEDRY
CORUS
DURLDELL
JARNUOL
ORCERDEN
LOREN
DIS
HADLY
GAYNLORRICK
REBFLE
REDOYN
RITORAN
BRANTELI 
----
35.0% of process completed
----
 STOD
MIDEN
HELBERTE
ROBBIAR
JOYTT
LAGILACEN
FREUTOBTO
THAMY
CERMY
BRENKRY
EVEZ
RANG
BRICKIE
NALDO
SANBY
KEL
JAMAOHE
DONELL
JERRON
DELISTON
EDRICAEL
WIEN
TON
ZACY
SON
RAMIAS
KENTONY
COLLBERED
LOUSTIN 
----
36.0% of process completed
----
 ERON
HUMONED
JULAMELIE
ALEN
FRORANTON
FEREY
CHELBOS
BAUG
FERIKA
ISSE
ORANDYROD
KEN
ADFIERCAMINE
HARVING
KISAOLL
ELMILINCERAMETHEL
LAND
RAYDE
DYLE
O
DOMMANDO
ERFRED
JEYT
DARGUAY
BURTO
MAOMER
EDMARY
BRY 
----
37.0% of process completed
----
 CORREN
RIDWITCHERNO
OLEN
GERRISTON
DEVINISER
MALVER
RACKIS
JEGRIN
WARCALEPOTRO
HOSTON
SHATH
JAVER
ELIVIL
JEROB
DERY
COLE
GARIAL
HANDRINK
ELLUS
BRANTAL
LYOSSETT
AYLANT
ZACY
DUNDIO
INTER
CLEY
WILBERE
RE 
----
38.0% of process completed
----
 ON
PERRTI
JOREY
FOEDRY
KIA
BRODOLL
J

----
 EL
ESTAN
MARDRELDO
SENSES
ESONCIAN
STOS
JAY
GORE
RECUREL
RAWIL
RICH
HARVIER
MENN
GILE
WILDORDETRACBY
DING
ALIOL
LAVIE
WOUS
NUSTE
DOWIEN
TIMIL
JARVIA
CLAUD
JERRICK
MASVIN
DOUDG
MANED
EMVINITC
DAND
HAUS 
----
70.0% of process completed
----
 UM
CLINAS
GRADY
GRONUE
BONSEL
MARTO
GLIE
CLAUR
BEDETLAND
MERSON
DARRRONQXISCLO
GUYL
PHINM
RAYDIGALL
LEWGERROY
NICK
LEOS
ADOLPH
LUALL
ODEERT
DAMEBIO
HOSANG
TRAPT
DOND
SAED
BROSUGOLTADFRESTEVIO
LADESSAN 
----
71.0% of process completed
----
 O
ARDDIRIO
NEYTHE
DOMAEL
MAHLABIAS
ERNIDO
QUINTON
GAOUL
SHEN
FLANT
ROS
MASSE
LEE
ASES
INARL
BRONE
GAANICK
KEYT
DOSRE
NICKEL
RABY
RONEFOES
BISTE
NUDERY
KATHAUCES
MAHXEPHEL
ROMIL
VEROT
DOMINE
ARRYELL
RU 
----
72.0% of process completed
----
 ER
FRAMIDRIE
JEFFREBENICASO
STEVER
PORD
ARRED
JEMUNTIS
ALIN
OMEN
ALWECOLLO
ALFRRENTO
WALPHERSTAN
OGILINO
BEERRICK
LUFEL
JORDSEERY
PHILB
TRESLEM
TUINE
LEGEG
DUMIL
JUMEFER
JOSNEY
WILSODO
IPE
EDRUGL
ORTO 
----
73.0% of process completed
----
 GORALAYMOS
KEVIN
BRUNCO
VALEINO
DE

In [11]:
# Draw 100 samples from the trained model (randomSample is called with 30 each
# time) and print them one after another.
for _ in range(100):
    sample_text = x.randomSample(30)
    print(sample_text)

UART
JOWCELUKLIE
CTON
JAD
EMUR
NDE
ALFY
RACK
SAND
DONK
REGLY
SVARRETIN
JESTON
RAMODIS
CHATT
ELARD
DAUG
NARMON
BRIODIVON
CA
ELFREY
FREGLON
GRAND
DAUSRAN
J
ASE
WASSAN
TANTED
MATHAN
ALDE
IKEO
GORDAR
JERTY
LAYMAN
HUDED
EY
KENTON
SIYDON
DARON
DANFORD
ASHAN
ARWALBY
RUTMISAN
BREVYL
ACHIL
BRAMICK
LEN
HANSE
LASAN
MAR
JOY
TADERAN
FREYVE
ORMAS
E
LINGOY
HEOS
MOLLER
SEE
FILEEFR
UD
CORNY
SAN
STEZ
KAREY
LACURW
EERATH
MRIAMORS
FERRY
TYSAUL
B
RORDEN
EMY
JOMIS
MICHALEJOAL
A
CHOMANBYRMAN
ANER
BREGFORD
RU
ALITWINIO
MARVIN
KURO
S
LALDLA
GEX
AS
TYAN
JORQUES
JONAHAUNA
LICHANDO
BUDGO
WILDAM
JOO
ANCE
ARRA
BARRRY
ELIN
TRECURLAN
WIL
LVILVEW
CHULLIO
TUELO
STAM
KER
ABLALIO
MARSANNIS
LIM
SON
ANCE
LET
TO
TRISTIS
MAXAN
RAUMY
SAN
LLMAWIT
RICKE
ROLE
FRRAS
ANMAN
ON
ESCHELLIELO
SOMATHAN
BRES
B
UIO
FRESTEVON
KANDY
QUISTER
GE
RCLESSERTO
HENCALEO
KRALLIS
LO
LEED
DEM
FER
DIEN
SHAS
BRYAN
D
LLIAN
DELFRE
GALER
JASSY
REARL
IPE
ANDON
S
BUDUSCHSTUEL
ERIPH
AYN
WALVIN
TONNAY
ELMUCK
ZEQUI
HANG
GURAN
OYLY
BRAEL
RAUDIL
W
AMAN
NEY
CORNE

In [5]:
import pandas as pd

In [19]:
# names.txt parses as a single space-separated row; transpose it so each name
# occupies its own row of a 2-D object array, then preview the first ten.
aa = np.array(
    np.transpose(
        pd.read_csv("names.txt", sep=" ", header=None)
    )
)
aa[:10]

array([['Aache'],
       ['Aanwas'],
       ['Aaron'],
       ['Abaet'],
       ['Abarden'],
       ['Abbadon'],
       ['Abbe'],
       ['Abbo'],
       ['Abe'],
       ['Aberbysion']], dtype=object)

In [21]:
# Check whether the name "Sarg" occurs anywhere in the reference name array.
"Sarg" in  aa

False

In [45]:
# Read the male first-name table (whitespace-delimited, no header row) so that
# its names can be written out in the same single-row format as names.txt.
bb = pd.read_table("dist.male.first.txt", header = None, delimiter=r"\s+")

In [46]:
# Preview the parsed table: column 0 holds the name, columns 1-3 are numeric
# (presumably frequency, cumulative frequency and rank -- confirm against the
# data source).
bb.head()

Unnamed: 0,0,1,2,3
0,JAMES,3.318,3.318,1
1,JOHN,3.271,6.589,2
2,ROBERT,3.143,9.732,3
3,MICHAEL,2.629,12.361,4
4,WILLIAM,2.451,14.812,5


In [44]:
# Inspect the row labelled 1.
# NOTE(review): this cell's execution count (44) precedes the read of bb (45),
# and the saved output shows the whole line in a single column, so the output
# appears to predate the whitespace-delimited read -- re-run to refresh.
bb.loc[1,:]

0    JOHN           3.271  6.589      2
Name: 1, dtype: object

In [47]:
# Display names.txt as parsed: a single row with one name per column. This is
# the layout names2.txt has to reproduce.
pd.read_csv("names.txt", sep = " ", header = None, )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111
0,Aache,Aanwas,Aaron,Abaet,Abarden,Abbadon,Abbe,Abbo,Abe,Aberbysion,...,Woon,Worf,Wotan,Wrall,Wrathran,Wraythe,Wrothag,Wulf,Wulfgrim,Wuthmon


In [50]:
# Keep only column 0 of the table, i.e. the names themselves.
cc = bb.loc[:, 0]

In [70]:
# Reshape the names into a single row (1 x len(bb)) so the frame mirrors the
# one-row layout of names.txt.
dd = pd.DataFrame(cc.values.reshape([1, len(bb)]))

In [72]:
# Write the single row space-separated, with no header and no index, to
# names2.txt -- the file that NameRnn.__init__ loads.
dd.to_csv("names2.txt", sep = " ", header = None, index = False)