In [42]:
from rnn import Rnn
import numpy as np
import random

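# NOTE: run this notebook with Python 2, not Python 3. The bytes/str errors in the saved outputs below appear to come from running it under Python 3.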

In [43]:
class NameRnn(Rnn):
    '''
    RNN which learns from a list of names.

    Builds on the project-local ``Rnn`` base class, which is constructed here
    with the training file ``names2.txt``. This subclass adds filtering of
    sampled text into acceptable names and a training step that advances
    through the data one space-delimited name at a time.
    '''

    def __init__(self):
        # note default is names.txt, I made names2.txt
        super(NameRnn, self).__init__('names2.txt')
        # Inclusive length bounds applied to generated names by validate().
        self.minimum_name_length = 3
        self.maximum_name_length = 7
        # Not read in this class; presumably consumed by the base class's
        # training loop -- TODO confirm against rnn.py.
        self.iterations_per_log = 1000

    def validate(self, name):
        '''
        Validate that the first char and length are appropriate.

        Returns True when ``name`` starts with an uppercase ASCII letter and
        its length lies within the configured bounds. An empty string is
        rejected instead of raising IndexError.
        '''
        if not name:
            return False
        return self.is_letter(name[0]) and self.is_acceptable_length(name)

    def is_letter(self, character):
        '''Return True if ``character`` is an uppercase ASCII letter A-Z.'''
        # Both ends inclusive: the previous range(65, 90) silently dropped 'Z'.
        return ord('A') <= ord(character) <= ord('Z')

    def is_acceptable_length(self, name):
        '''Return True if ``name`` is within both length bounds (inclusive).'''
        return self.meets_maximum_length(name) and self.meets_minimum_length(name)

    def meets_minimum_length(self, name):
        '''Return True if ``name`` has at least ``minimum_name_length`` chars.'''
        return len(name) >= self.minimum_name_length

    def meets_maximum_length(self, name):
        '''Return True if ``name`` has at most ``maximum_name_length`` chars.'''
        return len(name) <= self.maximum_name_length

    def get(self, num):
        '''
        Gets a list of generated names.

        Repeatedly samples 30 characters from the network, starting from a
        random character and a random hidden state, splits the text on
        whitespace and keeps the pieces that pass validate(). Loops until at
        least ``num`` names have been collected, then returns exactly ``num``
        of them, capitalised. Calls saveParameters() once before returning.
        '''
        names = []
        while len(names) < num:
            start_char_id = random.randint(0, len(self.char_to_ix) - 1)
            start = self.ix_to_char[start_char_id]

            # Fresh random hidden state so successive samples differ.
            self.hprev = np.random.randn(len(self.hprev), 1)
            sample_ix = self.sample(self.char_to_ix[start], 30, training=False)
            txt = ''.join(self.ix_to_char[ix] for ix in sample_ix)

            # Clean up
            for name in txt.split():
                if self.validate(name):
                    names.append(name.capitalize())
        self.saveParameters()
        # One sample can yield several valid names, so the loop may overshoot;
        # trim so callers get exactly what they asked for.
        return names[:num]

    def print_names(self, rows, columns):
        '''Print ``rows`` lines, each holding ``columns`` names padded to 15 chars.'''
        generated_rows = [self.get(columns) for _ in range(rows)]
        for row in generated_rows:
            print(''.join(name.ljust(15) for name in row))

    def step(self, p):
        '''
        Does the heavy lifting: one training step starting at data offset ``p``.

        Runs lossFun on a window of ``seq_length`` characters, applies an
        Adagrad update to the parameters in place, and returns the offset at
        which the next step should start (just past the first space in the
        window, i.e. the start of the next name).
        '''
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p + self.seq_length + 1 >= len(self.data):
            self.hprev = np.zeros((self.hidden_size, 1))  # reset RNN memory
            p = 0  # go from start of data

        window = self.data[p:p + self.seq_length]
        inputs = [self.char_to_ix[ch] for ch in window]
        targets = [self.char_to_ix[ch] for ch in self.data[p + 1:p + self.seq_length + 1]]

        # forward seq_length characters through the net and fetch gradient
        # NOTE: the loss is not used here. The previous code folded it into a
        # local smooth_loss that was re-initialised on every call and thrown
        # away on return; that dead computation has been removed.
        loss, dWxh, dWhh, dWhy, dbh, dby = self.lossFun(inputs, targets)

        # perform parameter update with Adagrad (in-place on the arrays)
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                [dWxh, dWhh, dWhy, dbh, dby],
                [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

        # move data pointer to the character after the first space in the window
        try:
            advance = window.index(' ') + 1
        except ValueError:
            # No separator in this window (e.g. a name longer than seq_length):
            # skip the whole window instead of crashing the training loop.
            advance = self.seq_length
        return p + advance

In [44]:
# Build the model. NameRnn.__init__ hands 'names2.txt' to the Rnn base class; that file is
# written by the last cell of this notebook, so it must already exist on disk.
# NOTE(review): the "expected bytes-like object, not str" line in the saved output looks like a
# Python 3 bytes/str error reported during construction -- confirm against rnn.py.
x = NameRnn()

data has 8193 characters, 28 of which are unique.
expected bytes-like object, not str


In [102]:
# Train for 1000 iterations.
# NOTE(review): the saved output ends with "TypeError: Object of type 'bytes' is not JSON
# serializable" after "Training completed." -- presumably raised while saving state; confirm.
x.train(num_iterations=1000)

----
 UD
EZO
DERE
HAPHESTEMBREY
JEY
JAGELN
CLYME
HUPO
ERRID
ERGO
NARY
CYLOND
KAICIO
BUBTY
PADRNIARD
DID
BRICE
EDRUSSIN
RODY
ELEZO
GARMIQUARCALD
KRY
CHALMONCY
CALCER
LUZB
ING
ARANGE
NACHER
CURON
HENAD
JERMEL 
----
0.0% of process completed
Training completed.


TypeError: Object of type 'bytes' is not JSON serializable

In [112]:
# Draw 500,000 samples from the model. 28 is passed straight to randomSample
# (presumably the sample length in characters -- TODO confirm against rnn.py).
nms = [x.randomSample(28) for f in range(500000)]

In [130]:
# Keep only the samples containing "BUC" and break each one into its lines.
ll = [sample.splitlines() for sample in nms if "BUC" in sample]
# Flatten the per-sample line lists. The loop variables are deliberately not
# called `x`: that name is the model instance, and under Python 2 a list
# comprehension variable leaks and would overwrite it.
flattened_list = [line for sample_lines in ll for line in sample_lines]
# Narrow down to the lines that contain "BUCK" and show the distinct ones.
flattened_list2 = [line for line in flattened_list if "BUCK" in line]
np.unique(flattened_list2)

array(['BUCK', 'BUCKAN', 'BUCKD', 'BUCKE', 'BUCKEL', 'BUCKEO', 'BUCKER',
       'BUCKERWARL', 'BUCKES', 'BUCKEY', 'BUCKIE', 'BUCKIEN',
       'BUCKILINANK', 'BUCKIN', 'BUCKISAN', 'BUCKO', 'BUCKOLEN',
       'BUCKRANDEMER', 'BUCKS', 'BUCKY', 'COBBUCK', 'PEBUCK', 'SBUCK'],
      dtype='<U12')

In [129]:
# Distinct lines starting with "R", taken from the lines already gathered in flattened_list.
flattened_list2 = list(filter(lambda candidate: candidate.startswith("R"), flattened_list))
np.unique(flattened_list2)

array(['R', 'RADAN', 'RADDY', 'RADIE', 'RAIG', 'RAIGHER', 'RAIN',
       'RALEVEMYLER', 'RALPEL', 'RAM', 'RAMANAHENE', 'RANARDAN',
       'RANDEVIO', 'RANGROLT', 'RANN', 'RANRAND', 'RARWIL', 'RASSAN',
       'RDAVON', 'REAREL', 'REBBY', 'REBIEL', 'REBUCE', 'REDO', 'REFRI',
       'REFSED', 'REFUD', 'REGRYL', 'REGURE', 'REID', 'RELIO', 'RENAR',
       'RENEBE', 'RENEL', 'RENO', 'REONSEM', 'RESTID', 'REVIEL', 'REVIM',
       'REWINOLEY', 'REY', 'REYMAN', 'REYMIN', 'RI', 'RIBUC', 'RICE',
       'RICK', 'RICKY', 'RIEFE', 'RIK', 'RILIEBIULLIN', 'RIONBY', 'RKEL',
       'RO', 'ROBDO', 'ROBIN', 'ROBON', 'ROBROY', 'ROBUCCHEO', 'ROBUCE',
       'ROBUCERTHIABLON', 'ROBUCQUIN', 'RODE', 'RODRA', 'ROEN', 'ROHE',
       'ROM', 'ROMAND', 'ROMER', 'ROMEY', 'ROMICCO', 'RON', 'RONGY',
       'RONSHEOSINE', 'RORO', 'ROSCE', 'ROSCHER', 'ROSE', 'ROSSANDALDER',
       'ROSTON', 'ROY', 'RUBALD', 'RUBEG', 'RUBERUGLUK', 'RUBUCE',
       'RUBUCIERY', 'RUCKE', 'RUD', 'RUDAN', 'RUDOLI', 'RUFOR', 'RUSTIAL',
      

In [116]:
# Number of distinct lines in flattened_list.
np.unique(flattened_list).size

1225

In [105]:
# Accumulator for samples.
# NOTE(review): nothing is appended to it in this cell, so it stays empty.
nlst = []

# Print 100 samples from the model (18 is passed straight to randomSample).
# The saved output has more than 100 lines, so a single sample appears to span several lines.
for ii in np.arange(100): 
    print(x.randomSample(18))

TIT
MARKE
ARLEL
RA
NARTON
DIERRICL
J
HIMAN
FANAUGHAR
R
ER
TEMAR
OUSCODAN
E
ALMIRCHARS
GOUN
MARRON
ROLE
SANT
H
ISTY
SHENNEY
ANTY
AMMIN
DARAS
ENG
H
ANDIALDO
CARBY
DAR
AN
EMERNITON
JONAN
Y
TONRY
PONIO
DENN
TANDO
MATCHARF
RE
AIAL
JARIC
HORON
J
TRENN
CLLOS
ERMAA
TONTER
HENROLFIAC
AR
STON
CELVITHMIC
PAMIA
LEWISENDAN
TANDIN
SENE
DUIL
WINTRIO
DARID
ROD
LEWISTOELIH
DARRI
TORTE
ESWICAR
JON
O
BENCY
OSMENE
MAV
ON
RIMON
SULON
FOU
AND
SENTON
WOON
RE
ACMARIAK
PHARL
IGA
A
SONY
STOH
NOLLY
EWIS
BRUCO
PARRIOS
SON
EDWIST
MOCITR
E
DEGETHONE
ELEX
E
JOSMO
ES
LEWIS
AM
EFFORE
SSADE
MICRI
LIN
MALILON
BRACU
NOVELFRED
CORTAN
INCO
MORT
JERE
IST
US
SYNLYL
SERENTON
DEAN
ONAND
GIVIS
OBAN
ROGLACH
ESRON
IN
FREDOYMUNDO
MAG
MARCCORTIE
KREYNE
CARIO
WAMARNE
STIS
ALVIN
CELLON
AHIAS
ELL
JEAN
EFFERNEL
DALVITIA
DONNY
CE
ANT
HINANCER
JEDVA
ILBOIS
MOTHERARO
OHNOTJAIN
SSANCE
A
ARLON
GELL
EDAM
NA
A
KIALAN
VASEAY
JE
AH
ELIO
LOREL
TRY
LECHAN
HADANUSTOR
TAM
FORNELON
DOSAU
DAM
TITHRISTOA
ROC
ONNDO
MARAND
SILK
ASC
JANAH
HMINE
GA
EN
AUC

In [88]:
# Display nlst.
# NOTE(review): the saved output (a list of None) has an earlier execution count than the cell
# that sets nlst = [], so it looks stale -- re-run to confirm.
nlst

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [8]:
import pandas as pd

In [19]:
# Load the reference name list (one space-separated row) and flip it so each name is its own row.
aa = np.array(pd.read_csv("names.txt", sep=" ", header=None).transpose())
# Peek at the first ten rows.
aa[:10, :]

array([['Aache'],
       ['Aanwas'],
       ['Aaron'],
       ['Abaet'],
       ['Abarden'],
       ['Abbadon'],
       ['Abbe'],
       ['Abbo'],
       ['Abe'],
       ['Aberbysion']], dtype=object)

In [41]:
"Ab" in  aa

NameError: name 'aa' is not defined

In [45]:
# Read the male first-name table (whitespace-separated columns, no header row);
# later cells write its name column out in the same layout as names.txt.
bb = pd.read_csv("dist.male.first.txt", sep=r"\s+", header=None)

In [46]:
# Preview the first rows of the parsed table; column 0 holds the names.
bb.head()

Unnamed: 0,0,1,2,3
0,JAMES,3.318,3.318,1
1,JOHN,3.271,6.589,2
2,ROBERT,3.143,9.732,3
3,MICHAEL,2.629,12.361,4
4,WILLIAM,2.451,14.812,5


In [44]:
# Show the row labelled 1 across all columns.
# NOTE(review): the saved output shows a single unsplit string, which looks stale relative to
# the whitespace-delimited read of bb -- re-run to confirm.
bb.loc[1,:]

0    JOHN           3.271  6.589      2
Name: 1, dtype: object

In [47]:
# Display names.txt parsed with a single-space separator and no header row: the saved output
# is one row with one name per column, which is the layout names2.txt has to match.
pd.read_csv("names.txt", sep = " ", header = None, )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111
0,Aache,Aanwas,Aaron,Abaet,Abarden,Abbadon,Abbe,Abbo,Abe,Aberbysion,...,Woon,Worf,Wotan,Wrall,Wrathran,Wraythe,Wrothag,Wulf,Wulfgrim,Wuthmon


In [50]:
# Column 0 of the table: the names themselves.
cc = bb[0]

In [70]:
# Lay the names out as a single row (one name per column).
dd = pd.DataFrame(cc.values.reshape(1, -1))

In [72]:
# Write the single-row frame to names2.txt, space-separated, with no header row and no index
# column. This is the file NameRnn.__init__ passes to the Rnn base class.
dd.to_csv("names2.txt", sep = " ", header = None, index = False)