In [73]:
from rnn import Rnn
import numpy as np
import random

# NOTE: run this notebook with Python 2, not Python 3.

In [74]:
class NameRnn(Rnn):
    '''
    RNN which learns from a list of names.

    Builds on ``Rnn`` (imported from the ``rnn`` module) and adds:
      * validation of generated names (first character and length),
      * helpers to generate and print batches of names,
      * a training ``step`` that advances the data pointer past the next
        space in the current window instead of by a fixed stride.
    '''

    def __init__(self):
        # Per the original author's note, Rnn's default corpus is names.txt;
        # this notebook trains on names2.txt, which is written by the
        # to_csv cell further down in the notebook.
        super(NameRnn, self).__init__('names2.txt')
        # Inclusive bounds used by is_acceptable_length().
        self.minimum_name_length = 3
        self.maximum_name_length = 12
        # NOTE(review): not read in this class; presumably the logging
        # interval used by the inherited training loop -- confirm in rnn.py.
        self.iterations_per_log = 1000

    def validate(self, name):
        '''Return True if ``name`` starts with an uppercase ASCII letter
        and its length is within the configured bounds.

        An empty string is rejected rather than raising IndexError.
        '''
        return (bool(name)
                and self.is_letter(name[0])
                and self.is_acceptable_length(name))

    def is_letter(self, character):
        '''Return True if ``character`` is an uppercase ASCII letter A-Z.'''
        # Both ends are inclusive so that 'Z' (code point 90) is accepted;
        # a half-open range(65, 90) would silently reject it.
        return ord('A') <= ord(character) <= ord('Z')

    def is_acceptable_length(self, name):
        '''Return True if ``len(name)`` is within [minimum, maximum].'''
        return self.meets_maximum_length(name) and self.meets_minimum_length(name)

    def meets_minimum_length(self, name):
        '''Return True if ``name`` has at least ``minimum_name_length`` characters.'''
        return len(name) >= self.minimum_name_length

    def meets_maximum_length(self, name):
        '''Return True if ``name`` has at most ``maximum_name_length`` characters.'''
        return len(name) <= self.maximum_name_length

    def get(self, num):
        '''Generate names until ``num`` valid ones are collected.

        Each round picks a random start character, re-randomises the hidden
        state, samples 30 characters, splits the sample on whitespace and
        keeps every token that passes ``validate``.

        Returns a list of exactly ``num`` capitalised names (an empty list
        when ``num`` <= 0). Calls ``self.saveParameters()`` before returning.

        NOTE(review): the loop does not terminate if the model never
        produces a valid token.
        '''
        names = []
        while len(names) < num:
            start_char_id = random.randint(0, len(self.char_to_ix)-1)
            start = self.ix_to_char[start_char_id]

            # Fresh random hidden state so successive samples differ.
            self.hprev = np.random.randn(len(self.hprev), 1)
            sample_ix = self.sample(self.char_to_ix[start], 30, training=False)
            txt = ''.join(self.ix_to_char[ix] for ix in sample_ix)

            # Clean up: keep only tokens that look like names.
            for name in txt.split():
                if self.validate(name):
                    names.append(name.capitalize())
        self.saveParameters()
        # The last sample may contribute several valid tokens at once;
        # trim so callers get exactly the number they asked for.
        return names[:num]

    def print_names(self, rows, columns):
        '''Print ``rows`` lines, each holding the names returned by
        ``get(columns)`` left-justified in 15-character cells.'''
        # All rows are generated first, then printed (same order as before).
        generated_rows = [self.get(columns) for _ in range(rows)]
        for row in generated_rows:
            print(''.join(name.ljust(15) for name in row))

    def step(self, p):
        '''Run one training step on the window starting at data index ``p``.

        Performs a forward/backward pass through ``lossFun`` over
        ``seq_length`` characters, applies an Adagrad update to the
        parameters in place, and returns the next data pointer: the index
        just past the first space in the window, or ``p + seq_length`` if
        the window contains no space. ``p`` wraps to 0 (and the hidden
        state is zeroed) when the window would run past the end of the data.
        '''
        # NOTE(review): smooth_loss is a local that is re-initialised on
        # every call and never read afterwards, so the smoothing below has
        # no observable effect; kept as-is pending a decision on where the
        # running loss should live.
        smooth_loss = -np.log(1.0/self.vocab_size)*self.seq_length # loss at iteration 0
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+self.seq_length+1 >= len(self.data):
            self.hprev = np.zeros((self.hidden_size,1)) # reset RNN memory
            p = 0 # go from start of data

        window = self.data[p:p+self.seq_length]
        inputs = [self.char_to_ix[ch] for ch in window]
        targets = [self.char_to_ix[ch] for ch in self.data[p+1:p+self.seq_length+1]]

        # forward seq_length characters through the net and fetch gradient
        loss, dWxh, dWhh, dWhy, dbh, dby = self.lossFun(inputs, targets)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # perform parameter update with Adagrad (in-place on the arrays)
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                [dWxh, dWhh, dWhy, dbh, dby],
                [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

        # Move the data pointer to just past the first space in the window.
        # If the window holds no space, .index raises ValueError; fall back
        # to a full seq_length stride instead of aborting training.
        try:
            advance = window.index(' ') + 1
        except ValueError:
            advance = self.seq_length
        return p + advance

In [75]:
# Build the model; NameRnn.__init__ passes 'names2.txt' to the Rnn constructor.
x = NameRnn()

data has 8193 characters, 28 of which are unique.


In [76]:
# Train for 100000 iterations. `train` is not defined in this notebook
# (presumably inherited from Rnn); the recorded output below shows sampled
# text interleaved with percent-complete lines.
x.train(num_iterations=100000)

----
 FGWGFTNNKUOCP
EVHKTHAPT
JBVQJCHYKSO
PSDVJQREBNROLGSROCFRGC
CHHPLR
FNSKHHUBSISKAPRHXXV
CSNU
RW
IRVHH
ZAGMRIMMLAFSLPFZMBAHT
VEBSHB
IELA
SNLPWHGTZDZBALOXGVDVJKVGQFOBFB
NAMZXICCBHDWPTBUWOQHBLCAJYZFLR 
----
0.0% of process completed
----
 IL
WILFFIOB
HONHADBRYDE
MANEL
JOBMEON
WARFMOR
QAVIL
JORBARLMATTHALDLXIL
KUUP
MICOP
FICOBY
REFPON
TONY
RERDILO
KODEMOL
PELTEN
COR
CANITH
JUDUY
MIR
FIED
RICK
QUIFORDEN
MAGEEP
RICO
JOLE
JEFTILT
CHARUE
BU 
----
1.0% of process completed
----
 OLLE
GANTLERTON
KEARISTE
ERICOT
BARRY
YLLON
BERVIN
REISTO
PHENY
WANIUMEN
NOL
KOL
QLAENVIJALLARO
COMRIN
ELCO
JAUMUSTY
CORY
TTROAN
GROHON
MILIOY
MARDOREMIL
AMERY
STODABEREY
ANDENY
CKELMON
OLASON
CARNO 
----
2.0% of process completed
----
 FIMES
DAUSTINZOLAIL
CONVINCO
GEREN
TOULE
OSCON
AMBRY
ATHARY
EVIS
BAYDONAN
DYANDER
ERESODROD
JISENSACWISAULRFDONEL
ENVANCIEDONNIANDAT
LUMALL
MARLANLINLANETHONOR
ATERN
STALLON
RUDORS
GELDOND
KEWIOLCY
V 
----
3.0% of process completed
----
 EN
SODREY
ANDO
MARLUN
ARLAN
TERENN
KERGOLE
FRE

----
 E
JEHD
KON
MEY
JAMALY
LOTY
JERUST
DANDIET
SEBERT
FRADRYAN
GIRON
SEMMIE
HUNNERS
HELFITILLIT
LANNE
VAINAD
ROVARDONECERTON
LAMINEL
HILEN
DUVIL
JOSANDO
CIMARTON
LAVIRUNE
MALFORN
DELED
MYSVIN
DARRTEN
MURVI 
----
35.0% of process completed
----
 K
DON
ENZODIABLIX
BRADRICHAWSUM
MONGULE
CLIMIGS
IRIN
ROSIE
MARON
KRANDO
ALTOX
ZACKAUR
MUNKEY
JOSHEH
TODANY
KELLUY
DONNY
PUDAH
PETRTO
FRONBLIC
ALLIC
TRAYUEL
MALO
FROYROEL
JUMORT
ROOMANTON
QUIRSALL
SHAU 
----
36.0% of process completed
----
 SECK
PATTRISTEFRENCIELT
MATTED
ABRAN
JUDLEEL
JAKE
NETLEPH
BDOMINAHAS
NAMART
HANARL
BRLET
DUDRES
QUIE
BERMATHO
MASHOL
WILES
DELERMONE
GARAEL
BITRO
COLVIN
ADGURCON
KID
DEEGED
COUR
DOAN
TON
CCERROL
LINGE 
----
37.0% of process completed
----
 ELL
GIVY
JALE
DARDEN
SRON
PAMEN
ANCAS
ERCERBERMONCOND
TOMENO
ELDIR
GERIE
TALIACIAN
WIRVARDORO
COET
BIYMER
CEE
FREINTINTON
LERWEW
GAIRINRY
JALUSLAN
DELGURO
JAUSEN
GARWICK
LERLESHEN
BRENG
SHAMEN
BRENCIA 
----
38.0% of process completed
----
 ARRARWADRYAN
JONAME
TRANTIE
MUCK
H

----
 ERT
HARYMARRICONE
TRIWGATNAY
LATNEY
JUIS
WAYVER
GINO
FORAST
AULL
BORTO
DARIE
MAHUL
BURTO
IKALAND
SOD
LAUGIE
RICOE
DON
LIC
MIKE
DEVAL
WILUIG
RYDALAUS
EIFRAD
ARILLAR
JOWCOENTT
RICOLO
ALVASEY
HARIY
JAMY 
----
70.0% of process completed
----
 AILL
ELDARLE
WARVAN
FERNALL
FRASALLEVARRON
ROY
ARARTO
GRANCE
COR
CHLEO
MOLLINER
BRYLE
ZASTON
STON
DRANKEGISESPH
DALSPA
BELT
GANG
TWILEN
JOH
DARROBAR
JERVALE
DWAYNIE
STEFORDAN
TEYNOLLORDEN
JOAVIS
CYHRI 
----
71.0% of process completed
----
 ON
JORKIS
CLUDON
LEN
HEOT
RODEAS
SHUNVAN
SCE
NAIRDAH
HWILDUS
BERNEIE
BRENUM
K
IFSAN
BUDKE
WAMUR
BILL
WECOREY
FERWALDREY
GARRONIOLLIN
MYLTRABLENTEL
RICK
ROBRY
STANNALDO
MYSON
DONNAUE
DONEX
PHENCIST
GAR 
----
72.0% of process completed
----
 DASYLENTONCERTON
JAMEY
VALONEY
TET
JERUSTHERRO
TANDRICK
ESIBEDIT
RICKIE
LENACH
CHELD
JORT
JADLARTHERGE
LALDIL
JELLBUSTEL
MILL
ARUST
ERUST
VIDSEN
CHARED
TOMAUGIL
ROSTOND
ALDRUSAS
CYRRE
MYROUS
KURKEL
B 
----
73.0% of process completed
----
 N
RICOLY
RUICO
OBREDOYD
LIGEY
GARVER

In [8]:
import pandas as pd

In [19]:
# Read the space-separated names file (no header row) and transpose it so
# that each name sits on its own row of a 2-D array.
aa = pd.read_csv("names.txt", sep=" ", header=None).T.values
# Peek at the first ten rows.
aa[:10, :]

array([['Aache'],
       ['Aanwas'],
       ['Aaron'],
       ['Abaet'],
       ['Abarden'],
       ['Abbadon'],
       ['Abbe'],
       ['Abbo'],
       ['Abe'],
       ['Aberbysion']], dtype=object)

In [21]:
# Membership check: does the string "Sarg" appear anywhere in the names array?
"Sarg" in  aa

False

In [45]:
# Load the whitespace-delimited male first-name table (it has no header row).
# The aim is to re-emit its names in the same layout as names.txt.
bb = pd.read_csv("dist.male.first.txt", sep=r"\s+", header=None)

In [46]:
# Preview the first rows to check that the table was split into four columns.
bb.head()

Unnamed: 0,0,1,2,3
0,JAMES,3.318,3.318,1
1,JOHN,3.271,6.589,2
2,ROBERT,3.143,9.732,3
3,MICHAEL,2.629,12.361,4
4,WILLIAM,2.451,14.812,5


In [44]:
# Inspect a single row of bb.
# NOTE(review): the recorded output shows one unsplit column, and this cell's
# execution count (44) precedes the cell that loads bb with a whitespace
# delimiter (45), so the output looks stale -- re-run after loading bb.
bb.loc[1,:]

0    JOHN           3.271  6.589      2
Name: 1, dtype: object

In [47]:
# Show names.txt as parsed: a single row with one name per column
# (space-separated, no header) -- the layout that names2.txt has to mimic.
pd.read_csv("names.txt", sep = " ", header = None, )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111
0,Aache,Aanwas,Aaron,Abaet,Abarden,Abbadon,Abbe,Abbo,Abe,Aberbysion,...,Woon,Worf,Wotan,Wrall,Wrathran,Wraythe,Wrothag,Wulf,Wulfgrim,Wuthmon


In [50]:
# Keep only column 0 of bb, which holds the names themselves.
cc = bb[0]

In [70]:
# Lay the names out as a single row (1 x N), matching the one-row layout of
# names.txt. The width is inferred from cc itself (-1) so this cell does not
# depend on bb still being in sync with cc in the kernel.
dd = pd.DataFrame(cc.values.reshape(1, -1))

In [72]:
# Write the single row of names space-separated, with no header row and no
# index column. NameRnn.__init__ loads this file ('names2.txt').
# header=False is the documented way to suppress the header in to_csv
# (header=None is the read_csv idiom and only works here by being falsy).
dd.to_csv("names2.txt", sep=" ", header=False, index=False)