In [2]:
from rnn import Rnn
import numpy as np
import random

# NOTE: run this notebook with Python 2, not Python 3.

In [3]:
class NameRnn(Rnn):
    '''
    RNN which learns from a list of names.

    Builds on ``Rnn`` and adds name validation, helpers that generate and
    print names, and a training step that moves the data pointer forward to
    just past the next space in the training text.
    '''

    def __init__(self):
        '''Load the training text and set the name-length and logging settings.'''
        # note: the default corpus is names.txt (names2.txt is a locally made
        # alternative); this class passes femaleNames.txt explicitly.
        super(NameRnn, self).__init__('femaleNames.txt')
        self.minimum_name_length = 3
        self.maximum_name_length = 7
        self.iterations_per_log = 1000

    def validate(self, name):
        '''
        Validate that the first char and length are appropriate.

        Returns True when ``name`` is non-empty, starts with an uppercase
        ASCII letter and its length lies within the configured bounds.
        '''
        # Guard against '' so that name[0] cannot raise IndexError.
        if not name:
            return False
        return self.is_letter(name[0]) and self.is_acceptable_length(name)

    def is_letter(self, character):
        '''Return True if ``character`` is an uppercase ASCII letter, A through Z.'''
        # range(65, 90) excluded 'Z' (code 90) because the stop value is
        # exclusive; compare against inclusive bounds instead.
        return ord('A') <= ord(character) <= ord('Z')

    def is_acceptable_length(self, name):
        '''Return True if ``name`` satisfies both the minimum and maximum length.'''
        return self.meets_maximum_length(name) and self.meets_minimum_length(name)

    def meets_minimum_length(self, name):
        '''Return True if ``name`` has at least ``minimum_name_length`` characters.'''
        return len(name) >= self.minimum_name_length

    def meets_maximum_length(self, name):
        '''Return True if ``name`` has at most ``maximum_name_length`` characters.'''
        return len(name) <= self.maximum_name_length

    def get(self, num):
        '''
        Gets a list of generated names.

        Repeatedly samples 30 characters from the network, starting from a
        random character and a random hidden state, splits the text on
        whitespace and keeps the pieces that pass ``validate``. Parameters are
        saved via ``saveParameters`` before returning.

        Returns exactly ``num`` capitalized names (an empty list if ``num``
        is not positive).
        '''
        names = []
        while len(names) < num:
            start_char_id = random.randint(0, len(self.char_to_ix) - 1)
            start = self.ix_to_char[start_char_id]

            # Randomize the hidden state so each sample starts differently.
            self.hprev = np.random.randn(len(self.hprev), 1)
            sample_ix = self.sample(self.char_to_ix[start], 30, training=False)
            txt = ''.join(self.ix_to_char[ix] for ix in sample_ix)

            # Clean up: keep only the pieces that look like acceptable names.
            names.extend(name.capitalize() for name in txt.split()
                         if self.validate(name))
        self.saveParameters()
        # One sample can yield several names, so the loop may overshoot;
        # trim so callers receive exactly the number they asked for.
        return names[:num]

    def print_names(self, rows, columns):
        '''Print ``rows`` lines of generated names, ``columns`` names per line.'''
        generated_rows = [self.get(columns) for _ in range(rows)]
        for row in generated_rows:
            # Pad every name to 15 characters so the columns line up.
            print(''.join(name.ljust(15) for name in row))

    def step(self, p):
        '''
        Does the heavy lifting: one training update starting at data offset ``p``.

        Runs ``lossFun`` on a window of ``seq_length`` characters, applies an
        Adagrad update to the parameters and returns the next data offset:
        just past the first space in the window or, when the window contains
        no space, one full window further on.
        '''
        # NOTE(review): smooth_loss is re-initialized on every call and never
        # stored or returned, so the smoothing below has no lasting effect.
        smooth_loss = -np.log(1.0/self.vocab_size)*self.seq_length # loss at iteration 0
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p + self.seq_length + 1 >= len(self.data):
            self.hprev = np.zeros((self.hidden_size, 1)) # reset RNN memory
            p = 0 # go from start of data

        window = self.data[p:p + self.seq_length]
        inputs = [self.char_to_ix[ch] for ch in window]
        targets = [self.char_to_ix[ch] for ch in self.data[p + 1:p + self.seq_length + 1]]

        # forward seq_length characters through the net and fetch gradient
        loss, dWxh, dWhh, dWhy, dbh, dby = self.lossFun(inputs, targets)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # perform parameter update with Adagrad
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                [dWxh, dWhh, dWhy, dbh, dby],
                [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

        # move data pointer to just after the next space in the window
        try:
            separator_offset = window.index(' ')
        except ValueError:
            # No space inside this window: advance by a whole window instead
            # of letting the ValueError abort training.
            return p + self.seq_length
        return p + separator_offset + 1

In [4]:
# Build the name model; its constructor passes 'femaleNames.txt' to the Rnn base class.
x = NameRnn()

data has 30040 characters, 28 of which are unique.
expected bytes-like object, not str


In [5]:
# Train for 5000 iterations (train() is not defined in NameRnn, so it presumably
# comes from the Rnn base class - confirm in rnn.py).
x.train(num_iterations=5000)

----
 FVNZCBLHLXTAYZQSMOGUBMSXGCAYILBTNUMPMTQOGD
CJUDDKNQJAOZZJEOGXLHPY
TBPQEAEUGJUYESHMDDBY
ATLZNSFBSGPUVILCCHFELUFCDN
CTO
BTQJQJYQPTCXQDGLGVYVYPIVLMRHXUVCKC
AVSECJSHBTKDRUWTHVFESISZJZVMX
MUEPABPQTAHCYGPUM 
----
0.0% of process completed
----
 ESAN
JERET
MEITE
VIRIETENE
TEBRTA
ELYNA
WHRI
ASHARGA
MAVITEL
JOANE
LOTRAR
ERYELLAN
JOMELITA
GARGATTA
ALWELZA
MERRITLORIA
FLERA
MERLEL
COURANISTA
JEMA
CELLI
DEABENELOTMA
MARILI
A
MERTA
FFEDISTA
RYDE
C 
----
20.0% of process completed
----
 TE
MARELE
ELSNEL
CHINNA
CERRHOSHEL
HANNA
SSHRINGCARY
JADEVENI
RORORA
KONDE
MARIN
DARTHIANN
ALITTOLGA
VESHA
CHRLDATTA
LORE
ERNA
ERLY
ZANBEN
VESLIDRY
CAROOMDINDIANDA
TOLICA
MERACAN
SUNIDA
NABSIN
VARRINA 
----
40.0% of process completed
----
 KAY
VAINENA
JEMAT
NOMALVEX
ARTENO
CETRIE
LAMIVY
FAUDA
AUDA
ARVAZIENDE
MAHY
BOROSTA
MOINE
ANDICE
ORANILA
BANDE
LERWEKA
ILETTA
LIU
ROLARGIO
AMVENDI
SHETENY
VRICQUELIE
ERANIA
KAMILOSHDA
AONELEVELINA
ERWY 
----
60.0% of process completed
----
 ELLE
CEALDRINDA
VEFESTYEE
FLINA
UCEL

In [14]:
# Collect 5000 samples from the model. The meaning of the argument 28 is defined
# by randomSample, which is not shown in this notebook.
nms = [x.randomSample(28) for f in range(5000)]

In [20]:
# Keep only the samples that contain a "T", split each sample into lines,
# flatten the result into a single list of names, and show the distinct names
# that themselves contain a "T".
# The loop variables deliberately avoid the name `x`: under Python 2,
# list-comprehension variables leak into the enclosing scope, so using `x`
# here would overwrite the NameRnn instance `x` created earlier.
ll = [sample.splitlines() for sample in nms if "T" in sample]
flattened_list = [name for lines in ll for name in lines]
flattened_list2 = [name for name in flattened_list if "T" in name]
np.unique(flattened_list2)

array(['AACKRETH', 'ABBITTA', 'ABETTI', ..., 'ZUSTIBA', 'ZUTKACE',
       'ZUTRISTIEY'], dtype='<U22')

In [29]:
# Narrow the flattened names down to those beginning with "S" and show the
# distinct ones in sorted order.
flattened_list2 = [
    candidate
    for candidate in flattened_list
    if candidate.startswith("S")
]
np.unique(flattened_list2)

array(['S', 'SA', 'SABDY', 'SABGELBELET', 'SABHIE', 'SABORA', 'SABRIN',
       'SABRYN', 'SACATHERI', 'SACIBE', 'SACLIEE', 'SACRINN', 'SADETTIE',
       'SADIENN', 'SADRA', 'SADRACIRA', 'SADRIAN', 'SAGINA',
       'SAGNOBELINE', 'SAGSEBRISTA', 'SAGTISIE', 'SAHELETTI', 'SAISSYE',
       'SALBINIMANNA', 'SALELARIH', 'SALI', 'SALINE', 'SALRI', 'SAMA',
       'SAMAREET', 'SAMBRISTE', 'SAMSATHEL', 'SAN', 'SANDA', 'SANDIELA',
       'SANDRE', 'SANGALIA', 'SANGEE', 'SANGI', 'SANIALLE', 'SANISANDY',
       'SANISANNA', 'SANMAE', 'SANNENA', 'SANRETENA', 'SANVIRIADE',
       'SARCORUELE', 'SARGEBESTINA', 'SARRYE', 'SASRANE', 'SASTEVER',
       'SASTORAWGEINE', 'SATHA', 'SATIN', 'SATS', 'SAUBERA', 'SAUMYE',
       'SAUNITONITO', 'SAVYTTA', 'SAXIKORY', 'SBETTECHONSIE', 'SCHERIEL',
       'SCSARVIDE', 'SE', 'SEANCHE', 'SEBET', 'SEDIE', 'SEELA', 'SEFIN',
       'SEG', 'SEINDIE', 'SELDE', 'SELE', 'SELENE', 'SELENIE',
       'SELETIMATHELRA', 'SELGIE', 'SELI', 'SELIA', 'SELIE', 'SELINA',
       'SELIO

In [None]:
# Count how many distinct entries flattened_list contains.
len(np.unique(flattened_list))

In [None]:
# Draw 100 samples from the model, printing each one and also storing it in
# `nlst` so the collected samples can be inspected afterwards (previously the
# list was created but never filled).
nlst = []

for ii in range(100):
    sample = x.randomSample(18)
    nlst.append(sample)
    print(sample)

In [None]:
# Display the current contents of nlst.
nlst

In [None]:
import pandas as pd

In [None]:
# Read the space-separated names file (no header row), transpose the table,
# convert it to a NumPy array, and preview the first ten rows.
aa = np.array(
    np.transpose(
        pd.read_csv("names.txt", sep=" ", header=None)
    )
)
aa[:10, :]

In [None]:
# Check whether any element of the array aa equals the string "Ab".
"Ab" in  aa

In [None]:
# Load the whitespace-delimited male first-name table (no header row). The
# cells below write its first column out in the same layout as names.txt.
bb = pd.read_csv("dist.male.first.txt", sep=r"\s+", header=None)

In [None]:
# Preview the first rows of the loaded table.
bb.head()

In [None]:
# Show every column of the row labelled 1.
bb.loc[1,:]

In [None]:
# Display names.txt as parsed with a single-space separator and no header row.
pd.read_csv("names.txt", sep = " ", header = None, )

In [None]:
# Take column 0 of bb for all rows (presumably the name column - confirm with bb.head()).
cc = bb.loc[:, 0]

In [None]:
# Reshape the column's values into a single row of len(bb) entries and wrap it in a DataFrame.
dd = pd.DataFrame(cc.values.reshape([1, len(bb)]))

In [None]:
# Write the single-row frame to names2.txt, space-separated and without the index column.
# NOTE(review): header=None is presumably treated like header=False (no header row) - confirm.
dd.to_csv("names2.txt", sep = " ", header = None, index = False)