In [1]:
from rnn import Rnn
import numpy as np
import random

# note: use python2, not 3

In [28]:
class NameRnn(Rnn):
    '''
    RNN which learns from a list of names
    '''

    def __init__(self):
        # note default is names.txt, I made names2.txt
        super(NameRnn, self).__init__('names2.txt')
        self.minimum_name_length = 3
        self.maximum_name_length = 12
        self.iterations_per_log = 1000

    def validate(self,name):
        '''Validate that the first char and length are appropriate'''
        return (self.is_letter(name[0]) and self.is_acceptable_length(name))

    def is_letter(self, character):
        return ord(character) in range(65,90)

    def is_acceptable_length(self, name):
        return self.meets_maximum_length(name) and self.meets_minimum_length(name)

    def meets_minimum_length(self, name):
        return len(name) >= self.minimum_name_length

    def meets_maximum_length(self, name):
        return len(name) <= self.maximum_name_length

    def get(self,num):
        '''Gets a list of generated names'''
        names = []
        while len(names) < num:
            start_char_id = random.randint(0, len(self.char_to_ix)-1)
            start = self.ix_to_char[start_char_id]

            self.hprev = np.random.randn(len(self.hprev), 1)
            sample_ix = self.sample(self.char_to_ix[start], 30, training=False)
            txt = ''.join(self.ix_to_char[ix] for ix in sample_ix)

            # Clean up
            for name in txt.split():
                if self.validate(name):
                    names.append(name.capitalize())
        self.saveParameters()
        return names

    def print_names(self, rows, columns):
        for name in [self.get(columns) for i in range(rows)]:
            out = ''
            for x in name:
                out += x.ljust(15)
            print(out)

    def step(self,p):
        '''Does the heavy lifting'''
        smooth_loss = -np.log(1.0/self.vocab_size)*self.seq_length # loss at iteration 0
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+self.seq_length+1 >= len(self.data):
            self.hprev = np.zeros((self.hidden_size,1)) # reset RNN memory
            p = 0 # go from start of data

        inputs = [self.char_to_ix[ch] for ch in self.data[p:p+self.seq_length]]
        targets = [self.char_to_ix[ch] for ch in self.data[p+1:p+self.seq_length+1]]

        # forward seq_length characters through the net and fetch gradient
        loss, dWxh, dWhh, dWhy, dbh, dby = self.lossFun(inputs, targets)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # perform parameter update with Adagrad
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                [dWxh, dWhh, dWhy, dbh, dby],
                [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

        next_ind = p + self.data[p:p+self.seq_length].index(' ')+1 # move data pointer
        #print(self.data[p:next_ind])
        return next_ind

In [29]:
x = NameRnn()

data has 8193 characters, 28 of which are unique.
expected bytes-like object, not str


In [30]:
x.train(num_iterations=100000)

----
 CEWBFJQQDVOTC
I
NAYFMVWGSSP
PMEAMZKKPEWPTHWD
IY
ZBLBHTYDWHAEGFJL
SGGJEJIZTOT
CQZSOZ
OUNKVHBKQJRFFMSCSLLHQGJSAAOILNGGQJAYYBDYZSPNKIJT
KQTQWROPGPEHA
CSORQIBSTJZCAJTBZOFBTNN
LSZFFZXOBUGZAVOJYSVBIAMXW
GB 
----
0.0% of process completed
----
 PEDTORO
FIRLUG
DOLEL
SODEEL
JAPEY
T
RACRERLOCEL
RUNTARFY
NCEE
MUCOL
TONTE
RUSTOY
TRAQTROT
SONDIN
BENTONE
CULANIRO
DISTELLOND
BUOD
BANMONNY
ERRI
LOLECLIND
TEL
TRMITHIE
FOSDREI
FARY
LEAFRED
ELLOK
ADWAL 
----
1.0% of process completed
----
 EAD
LANLAN
CHACLEY
WONMAN
CUBAN
GRISGANF
MEVAN
WEYMEN
CYMANE
LINKEWARNELIHR
DSANNEVINS
GALYMARED
DAUCHAF
JAGELRICKY
BKANBENTREN
DUED
BILBULTEFHERN
BRENZOSTYTORIN
DAVI
LEUSEFFHES
BURTNAD
EMDAUCN
HALARO 
----
2.0% of process completed
----
 ILCY
BYDEUERAR
WIGHAN
HANIALLEN
ODLANTOLBBED
JACLINN
CORBY
TYSTON
ARLEX
ANANTO
JARSOMELANPIT
LOMIEN
ENENCER
BRICALDO
BAECILLY
JARTY
GERCIRIT
REICBLEN
TOMXT
HARBELARER
SLIUN
BAREG
FROLTON
ILIATS
AKIO
P 
----
3.0% of process completed
----
 AR
MIQTURT
LONIE
ROBAN
KERNAN
RARNAES
NA

----
 ACHYLLAEL
DONSO
DEET
DOTE
DENTO
DEMAKEW
MALQUEL
AVORDO
SWAYRON
NUINA
AND
RUGANDOLLY
CLISLAY
TIRMELTON
UST
WAY
TENEL
NOLSPHON
FERTON
DILF
PARUSTE
MALEY
TOBILIJAMIEO
CYS
ADDERATHUC
MALPO
GAUMISOS
VANDY 
----
34.0% of process completed
----
 NECILANFERITR
RYANTOLY
KIWEY
JEFFREY
FLANCOLUS
DAMORT
WILTODSIE
HEENWORN
FROZUL
JOSE
NEYSTOR
ABEL
RAVEL
JORE
DONCEINY
STERMERFO
JERE
TWED
LOEMAUGUCK
HANQUIN
LIY
NENDO
GASH
VICHEL
EMMAUD
ANGON
BULAN
H 
----
35.0% of process completed
----
 UILEY
NOLAALOT
RAALIS
NEALY
DONE
LAEL
OTRITE
ROCK
ROSIORESSIN
GURASHARL
TEMAROL
HELTON
WELIGHAAREL
SODO
PERABLERTOY
NEOLDACMARONI
BIGE
GUANTON
PORON
DACISENDAN
BRICK
MISORTY
REDRE
CASTO
DARRIG
STAMONU 
----
36.0% of process completed
----
 OR
DUONLEO
ABRON
MALTON
DELICHUSTIEF
CIMIO
DOMAR
ALLIN
DADRRE
BRANNERTINNE
JEPPIEL
BRIT
ZECH
LANHEL
DOSELO
ALLIBRODERTER
JICK
KEMO
CLUCENCELL
JAIM
VINNETTOXCORDERENT
WOOLL
DARCILGING
ELIF
TYFO
FREDERI 
----
37.0% of process completed
----
 LUNZ
LARERE
TOPE
RAMMY
JILVIAL
JOSCE

----
 UNTO
EXWALDON
KARROLOS
FHANMICO
ZASAMET
PERWERD
NUCHY
ROBEIOBI
SODBIN
PHENDO
ELIOMO
CRUIE
ALIS
ARL
BOND
DACHARUTTHORL
TREDIEY
RUSTYRO
PORHIL
DOLLOB
LAVER
DENAL
BRANDEFRET
STASY
CURTY
EMMYRITHED
RUPEL 
----
68.0% of process completed
----
 ON
STORYRADNE
JAMOHESO
CALIMANIAR
SHAUD
SONNO
GURWINIS
KEMY
ELQUINK
BASRON
YSAUN
SAMGRACE
AUIAWE
IOMENCOR
KEYRO
DOJESE
DARON
CARRY
NOLE
DUNETH
EMICHOBINIO
DIUL
ALDO
LUSQUANBEY
ANVIDRIMELL
GISORBICAS
D 
----
69.0% of process completed
----
 DERY
DOXIS
QUBURIALBY
EYMETTINCOB
REYSHARBIRY
CLIUS
KARRED
ELBIR
SABIL
TEDNIEL
DWIDMIT
RORADLINEYNO
ADTIS
BORLOT
BLAIS
BARDO
GAUD
STAMEN
MAKISCERTON
DARWILL
EVARKINE
NIOUKINRICK
WINILEN
WISAY
ADRIN
W 
----
70.0% of process completed
----
 ETH
JORIA
LIODO
FERDHASON
ALD
ZOCMARUENT
CLEND
JARALD
BRARJOSE
RYDAL
MORESMILARDIE
STIE
SELOY
KENGELLAX
STON
ELLANNEL
DAMMIR
FERGEL
MARIMUC
LANNEL
GERVINK
NETRET
WITH
CILLILE
EFBERTOLQUER
BRAD
EYMO
W 
----
71.0% of process completed
----
 RO
KIL
DALLY
CLUSED
VAUL
EDWEY
TON
RO

TypeError: Object of type 'bytes' is not JSON serializable

In [31]:
for ii in range(100):
    print(x.randomSample(30))

VROHDIE
ALRYD
KINNAN
CLANGE
AR
RONCY
QUORD
RYMREY
TEDEINERCE
VERNCILLAPHAN
KEY
REWAN
ALARD
NERO
JEWILBERTAN
BALOY
DANNIO
IF
ODWERRRI
DOED
SEGPHELL
PEDE
ORTIAD
MITKELO
WILBEST
CLIFFER
RRY
JERMIN
WELTON
DAMIO
ELLEAR
ILMO
CHRON
TRAY
LEPIOPHY
PHRAD
USTIRON
RICQYOEK
STASHONY
RODS
AY
GER
CILUED
LONCE
KLIAN
CHAR
ALUAL
ORERLAN
DARMAL
ELVIN
MAT
UER
DALFORD
JAAL
JERR
DARWINT
IN
VARIS
WALLY
MARZIE
HERLEY
J
LAXYONA
DALLAROLL
DONMACK
WADO
AEL
JORDIET
MAYOPER
MALVIE
RAS
ERRELBERL
JENEL
JESTERIAND
EDM
ERN
CARLON
LUBENTE
LUSTIA
AURI
LAIN
BEMTLY
MULUNT
CHRICHALL
H
SANE
CLAUNENCYOBER
COLE
BERD
N
WALDO
SSIFT
RHIMERES
KO
DENGO
REY
HERRIJARERUCR
FERLIN
CLEIN
HESTIN
DAXTODLERIEMONTID
VERNE
IRLEDRIC
BRIDGNIE
ALFO
KEREDAR
UDNET
OUGRY
THANNEL
ORDWAYRE
S
OLWIN
ARMER
BRANY
ABORDY
SIVE
LLOYD
LACHRE
CRISTUSTEN
MOREST
UIN
ELICK
AND
EMER
GRAN
JAHOTT
USTIRWY
DON
RICK
BRONT
DENTON
UINN
DUREY
ERWILLYNCE
BAND
NET
IANETH
TIDENCALLYN
WERD
JEGO
G
RADERO
OBMANUS
SEFFORNE
NECIO
ERRADLIALK
LAIN
ANNIE
CHRET
BR
OTH
ELEY
GARY
ME

In [32]:
nms = [x.randomSample(12) for f in range(500000)]
nms

['EDARY\nEMMIT',
 'NGARNAY\nRAGM',
 'EWARACONCEDE',
 'AROLD\nJONA\nR',
 'ACHERISELLIN',
 'RRIA\nNINNZHU',
 'AFF\nCHENCCO',
 'END\nRAYO\nJEE',
 'CK\nBMAW\nTHY',
 'ANHIEL\nSHAN',
 'N\nGEONE\nWELB',
 'HADFRE\nGIVAN',
 'ERGERZAREKNA',
 'RDUCK\nDISTON',
 'LLE\nTENRIL\nB',
 'ILITH\nSEMMIK',
 'ARUT\nDARES\nJ',
 'ERNET\nWILPEH',
 'LBRY\nGILBERT',
 'AUD\nOLMIN\nDU',
 'WARD\nREGAR\nW',
 'YO\nANNY\nTUSI',
 'LAEL\nANDY\nHE',
 'ARBER\nMERLY',
 'ENELLEY\nMAYD',
 'IFGED\nLUSTIG',
 'URLCORDAY\nJE',
 'AMINAR\nWARKO',
 'UPS\nARD\nRUCE',
 'OAN\nBRISTON',
 'IN\nJUNEBERT',
 'CERTY\nJUMACH',
 'ACT\nHARRIC\nG',
 'AYAME\nCLARTT',
 'S\nVINCELL\nRE',
 'AND\nCORC\nDAN',
 'ONGE\nCHILIJO',
 'TERME\nHULTON',
 'UISOS\nLAIGNE',
 'AUDO\nTREY\nLO',
 'USSUSTHO\nCEL',
 'BIENIMAL\nGAN',
 'IFF\nRODGRIE',
 'SHAINE\nCOSSH',
 'AND\nCLAUDEAM',
 'EONG\nSRAD\nJO',
 'UY\nGRANEY\nJO',
 'ARO\nCLAUDEC',
 'INZARY\nPERY',
 'RANKON\nROY\nN',
 'ODAVIN\nLORIS',
 'TEDERIEL\nWAL',
 'ARIMALEDERNA',
 'Y\nRUFILFRILL',
 'UDUANN\nDON\nJ',
 'OND

In [37]:
ll = [ii.splitlines() for ii in nms if "B" in ii.upper()]
flattened_list = [y for x in ll for y in x]
flattened_list2 = [f for f in flattened_list if "BUC" in f.upper()]
np.unique(flattened_list2)

array(['AHIUBUCO', 'ARRELLIBUCK', 'BUC', 'BUCALLUG', 'BUCALPAHE', 'BUCARD',
       'BUCDON', 'BUCEN', 'BUCERT', 'BUCH', 'BUCHEN', 'BUCK', 'BUCKIL',
       'BUCKRI', 'BUCKY', 'BUCLAHS', 'BUCO', 'BUCS', 'CHIBUC', 'ELBUCK',
       'EMABUCE', 'EMESOBUCK', 'HUBUCE', 'IBEDUBUCL', 'IBUCHA', 'IMBUCE',
       'INCYREMOBUCH', 'ITHESIBUCH', 'KUBUCKER', 'LLIBBUCE', 'OABUCDES',
       'OBUC', 'OBUCAN', 'OBUCLAN', 'ODBUCAR', 'ROBUCT', 'RUBUCH',
       'SOLIBUCR', 'TREBUCE', 'UBUC', 'UBUCARLEN', 'UBUCARLIVIS', 'UBUCE',
       'UBUCHARER', 'UBUCT', 'UNCLIBUC'],
      dtype='<U12')

In [38]:
ll = [ii.splitlines() for ii in nms if "F" in ii.upper()]
flattened_list = [y for x in ll for y in x]
flattened_list2 = [f for f in flattened_list if "FART" in f.upper()]
np.unique(flattened_list2)

array(['AFFARTHE', 'ALFARTE', 'ANWOLFARTT', 'ARFART', 'EFART', 'EFARTO',
       'EFARTY', 'EFFARTINE', 'FART', 'FARTE', 'FARTEI', 'FARTHA', 'FARTI',
       'FARTIR', 'FARTIS', 'FARTU', 'FARTWALD', 'FARTY', 'GALFART',
       'GILFARTRE', 'GIOLFARTHIUS', 'IFFARTH', 'IFFARTI', 'IFFARTIBES',
       'ILFARTI', 'ILFARTOLL', 'JILFARTY', 'JOFFART', 'LDOFART', 'LFART',
       'LFARTER', 'LFARTH', 'LFARTHONENTO', 'LFARTIE', 'LFARTT', 'OFART',
       'OLFARTY', 'OTOLFARTH', 'OWFARTH', 'RALFEROLFART', 'REFARTOLL',
       'RIFFARTHY', 'SEFFART', 'WILFARTI'],
      dtype='<U12')

In [5]:
import pandas as pd

In [19]:
aa = np.array(np.transpose(pd.read_csv("names.txt", sep = " ", header = None, )))
aa[0:10, :]

array([['Aache'],
       ['Aanwas'],
       ['Aaron'],
       ['Abaet'],
       ['Abarden'],
       ['Abbadon'],
       ['Abbe'],
       ['Abbo'],
       ['Abe'],
       ['Aberbysion']], dtype=object)

In [21]:
"Sarg" in  aa

False

In [45]:
# read in names and output it in the same format as names.txt
bb = pd.read_table("dist.male.first.txt", header = None, delimiter=r"\s+")

In [46]:
bb.head()

Unnamed: 0,0,1,2,3
0,JAMES,3.318,3.318,1
1,JOHN,3.271,6.589,2
2,ROBERT,3.143,9.732,3
3,MICHAEL,2.629,12.361,4
4,WILLIAM,2.451,14.812,5


In [44]:
bb.loc[1,:]

0    JOHN           3.271  6.589      2
Name: 1, dtype: object

In [47]:
pd.read_csv("names.txt", sep = " ", header = None, )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111
0,Aache,Aanwas,Aaron,Abaet,Abarden,Abbadon,Abbe,Abbo,Abe,Aberbysion,...,Woon,Worf,Wotan,Wrall,Wrathran,Wraythe,Wrothag,Wulf,Wulfgrim,Wuthmon


In [50]:
cc = bb.loc[:, 0]

In [70]:
dd = pd.DataFrame(cc.values.reshape([1, len(bb)]))

In [72]:
dd.to_csv("names2.txt", sep = " ", header = None, index = False)