In [12]:
from rnn import Rnn
import numpy as np
import random

# note: use python2, not 3

In [13]:
import os
os.getcwd()

'C:\\Users\\callin\\Documents\\GitRepos\\GarbageCollector'

In [22]:
import pandas as pd
dd = pd.read_csv("femaleName.txt", header = None)
dd.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4265,4266,4267,4268,4269,4270,4271,4272,4273,4274
0,MARY,PATRICIA,LINDA,BARBARA,ELIZABETH,JENNIFER,MARIA,SUSAN,MARGARET,DOROTHY,...,BRITTENY,BEULA,BARI,AUDRIE,AUDRIA,ARDELIA,ANNELLE,ANGILA,ALONA,ALLYN


In [24]:
dd.transpose().to_csv("femaleNames.txt", sep = " ", index = None, header = None)

In [25]:
class NameRnn(Rnn):
    '''
    RNN which learns from a list of names
    '''

    def __init__(self):
        # note default is names.txt, I made names2.txt
        super(NameRnn, self).__init__('femaleNames.txt')
        self.minimum_name_length = 3
        self.maximum_name_length = 12
        self.iterations_per_log = 1000

    def validate(self,name):
        '''Validate that the first char and length are appropriate'''
        return (self.is_letter(name[0]) and self.is_acceptable_length(name))

    def is_letter(self, character):
        return ord(character) in range(65,90)

    def is_acceptable_length(self, name):
        return self.meets_maximum_length(name) and self.meets_minimum_length(name)

    def meets_minimum_length(self, name):
        return len(name) >= self.minimum_name_length

    def meets_maximum_length(self, name):
        return len(name) <= self.maximum_name_length

    def get(self,num):
        '''Gets a list of generated names'''
        names = []
        while len(names) < num:
            start_char_id = random.randint(0, len(self.char_to_ix)-1)
            start = self.ix_to_char[start_char_id]

            self.hprev = np.random.randn(len(self.hprev), 1)
            sample_ix = self.sample(self.char_to_ix[start], 30, training=False)
            txt = ''.join(self.ix_to_char[ix] for ix in sample_ix)

            # Clean up
            for name in txt.split():
                if self.validate(name):
                    names.append(name.capitalize())
        self.saveParameters()
        return names

    def print_names(self, rows, columns):
        for name in [self.get(columns) for i in range(rows)]:
            out = ''
            for x in name:
                out += x.ljust(15)
            print(out)

    def step(self,p):
        '''Does the heavy lifting'''
        smooth_loss = -np.log(1.0/self.vocab_size)*self.seq_length # loss at iteration 0
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+self.seq_length+1 >= len(self.data):
            self.hprev = np.zeros((self.hidden_size,1)) # reset RNN memory
            p = 0 # go from start of data

        inputs = [self.char_to_ix[ch] for ch in self.data[p:p+self.seq_length]]
        targets = [self.char_to_ix[ch] for ch in self.data[p+1:p+self.seq_length+1]]

        # forward seq_length characters through the net and fetch gradient
        loss, dWxh, dWhh, dWhy, dbh, dby = self.lossFun(inputs, targets)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # perform parameter update with Adagrad
        for param, dparam, mem in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                [dWxh, dWhh, dWhy, dbh, dby],
                [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]):
            mem += dparam * dparam
            param += -self.learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

        next_ind = p + self.data[p:p+self.seq_length].index(' ')+1 # move data pointer
        #print(self.data[p:next_ind])
        return next_ind

In [26]:
x = NameRnn()

data has 30040 characters, 28 of which are unique.


In [28]:
x.train(num_iterations=100000)

----
 A
CIRDISESTALLE
EDRENY
TADELENA
LETEBRA
CARRE
MORIPE
GERIDITHE
ICELLI
FYANA
RORIA
SHEPYNIKA
AJZOBELDINE
PENICIACELLA
WYLI
JAGELITTRE
SHETHKA
SULI
MARTIE
SHREN
DAPORIT
CORDE
LAORESHAMY
CERTH
ACIONATTA 
----
0.0% of process completed
----
 LOURRA
JRONAKRIIRELICE
DETGEBRA
TALIEI
LIA
NETTETRA
LISZE
MYCIA
MANMANTELIH
MERRA
LARICHALIE
TEBALEN
ELIANDE
MERIETA
FREUDATIA
ELEAXANE
ALIA
NA
FREISODINIRLA
TORY
LOLAQUEN
KELLIVE
GULAN
LAIVERITE
ELIE 
----
1.0% of process completed
----
 INS
ELIGIE
EZIE
RANGIE
ONXINE
CARRA
MARSHA
LOSHEN
CADANILEETTIE
FANE
YAN
MARTHEROSH
LOLIE
LONANELA
APHALLIA
DORAN
FULSHANIL
CHRINE
BEELY
LOHEL
JENISA
LANELANA
LORSTARICE
ALD
CERINA
ZURISY
NENDA
SHAVIE 
----
2.0% of process completed
----
 OROISA
SHOR
JUCHE
JACKY
CKITHA
BICKI
MAPSHUMA
SHAVALISA
ANTIA
CACHARA
DONA
ELEDIDA
KALI
ORANNA
LITISHELENY
ALTH
DAGLAVIL
KERMIDA
WIMBITE
FAWNA
EMYRINA
VNEMMELIA
NOGINA
JURITA
MILU
EDOSHETTE
ARYLYN
PAR 
----
3.0% of process completed
----
 E
LETOICA
ORNASHCEKETH
LORIN
CLOLELLERE

----
 LISTEE
OLGEY
CLAINA
MEINA
AUNET
SENNESTERTHELIE
KRISHALDINTHA
HILUZL
ERELA
ERDADIE
ELELYMIKA
ZARNEDWAN
TRACIE
HIVE
CHAN
CERRANNA
RIQUANNA
SHIRALETISA
PAMMICKIDA
DIESHA
MARCVELAIE
RANNAU
HOLYN
GEHRIAME 
----
34.0% of process completed
----
 SHE
ROXIA
JORILLYN
DALAHN
CORLES
MEURA
TORONELA
KATALY
BELL
JAPA
ANTATIA
FESSIA
ARYLA
JIFLE
CARBILLA
NITA
RUS
CATETHE
LYN
SCARSELIONEVA
WANNIE
SANGA
MARLIE
LIEFRUKATAYSANOROLBRIETH
KARI
RESHE
ASHARI
L 
----
35.0% of process completed
----
 LE
DETTIE
TASHARIA
EVIODAMSTERRNE
TOR
JATELYE
ALLYNI
MARLA
CHARIANNA
GOMAY
VEROLLE
DURMARA
CASNIT
ATILIEDRENIE
ALDICHARYSHA
ENDENIA
CHANNE
FRIENN
KATHY
CLARIE
DANA
ADRISA
THANDA
TOLICA
CHANER
JENCILE 
----
36.0% of process completed
----
 LORISA
TATHEONINE
RISSIE
MASSUNA
AVELLANDES
INELLEPIONKANESEN
TAM
VALEASALVON
AZSHARRENA
BERLE
DIOTHEE
LEEVEDRTSALIA
LANNEN
MELINILA
DELEY
JONELLA
LAUELINA
TATRY
AUE
ROEITA
RACHALMIRVETTA
SHANY
MANKER 
----
37.0% of process completed
----
 RITE
RECEN
MUDONDE
ANDRI
CHALLINE
M

----
 A
KATA
BALEY
LIKO
ELLLE
KAISO
KARITRE
ELELATROLONE
KRISSTA
TESSIESEARKIESHELLA
VARY
LONORRIE
ESA
VIRJACIKIE
PETA
LAELE
RRCLEA
CHILDA
YLLUZE
VANELLLA
STIVANNE
DEFETIN
KLAINTE
MALDIN
AZIQUERLA
BALL
ADED 
----
68.0% of process completed
----
 ERTA
RACHA
MENATIA
ROSEKASA
BELINE
MIY
GELAYELA
MARUILET
COLDELANYO
HEVINA
ILINEG
PORLIE
LOULIE
LINA
NIGREDA
AURELLA
PEMBIE
JUSA
WAJANNA
CHONIE
RIGEWNNI
GORCE
THEROTTA
GRRGENE
JUBOCARIE
SHERASSIE
ANNA 
----
69.0% of process completed
----
 LARUCPERRILLE
SALIN
LOLDETTRE
CHIRHA
JONNANNYON
FLENILA
KORAINN
LEPANDY
KIZEK
MERGEISANA
FRANNA
ANNOZERIKO
LONNA
LENGINNA
RESIA
DELENA
BROSTINGLA
MORQUIAD
CHERE
JACQULONELA
CRORGORBELMY
KARASTA
MIL
J 
----
70.0% of process completed
----
 KELFILA
MERESTIA
MELLEA
KAILEANN
DROLGEPHELINE
ROVA
KYLIN
DEISA
VESTICARY
ANDA
NOR
JULY
IVELLU
OLINA
SARGHEOMBINA
MELA
VALANNA
SHERLEANDRE
LAILOENN
LOSELCHALATAMA
MAYE
ERINA
LUSHIS
EVETH
DRESTIE
SORV 
----
71.0% of process completed
----
 VANDE
SHARIA
MARIHLARIT
MAVIANN
RACD

TypeError: Object of type 'bytes' is not JSON serializable

In [31]:
for ii in range(100):
    print(x.randomSample(30))

VROHDIE
ALRYD
KINNAN
CLANGE
AR
RONCY
QUORD
RYMREY
TEDEINERCE
VERNCILLAPHAN
KEY
REWAN
ALARD
NERO
JEWILBERTAN
BALOY
DANNIO
IF
ODWERRRI
DOED
SEGPHELL
PEDE
ORTIAD
MITKELO
WILBEST
CLIFFER
RRY
JERMIN
WELTON
DAMIO
ELLEAR
ILMO
CHRON
TRAY
LEPIOPHY
PHRAD
USTIRON
RICQYOEK
STASHONY
RODS
AY
GER
CILUED
LONCE
KLIAN
CHAR
ALUAL
ORERLAN
DARMAL
ELVIN
MAT
UER
DALFORD
JAAL
JERR
DARWINT
IN
VARIS
WALLY
MARZIE
HERLEY
J
LAXYONA
DALLAROLL
DONMACK
WADO
AEL
JORDIET
MAYOPER
MALVIE
RAS
ERRELBERL
JENEL
JESTERIAND
EDM
ERN
CARLON
LUBENTE
LUSTIA
AURI
LAIN
BEMTLY
MULUNT
CHRICHALL
H
SANE
CLAUNENCYOBER
COLE
BERD
N
WALDO
SSIFT
RHIMERES
KO
DENGO
REY
HERRIJARERUCR
FERLIN
CLEIN
HESTIN
DAXTODLERIEMONTID
VERNE
IRLEDRIC
BRIDGNIE
ALFO
KEREDAR
UDNET
OUGRY
THANNEL
ORDWAYRE
S
OLWIN
ARMER
BRANY
ABORDY
SIVE
LLOYD
LACHRE
CRISTUSTEN
MOREST
UIN
ELICK
AND
EMER
GRAN
JAHOTT
USTIRWY
DON
RICK
BRONT
DENTON
UINN
DUREY
ERWILLYNCE
BAND
NET
IANETH
TIDENCALLYN
WERD
JEGO
G
RADERO
OBMANUS
SEFFORNE
NECIO
ERRADLIALK
LAIN
ANNIE
CHRET
BR
OTH
ELEY
GARY
ME

In [29]:
nms = [x.randomSample(12) for f in range(500000)]
nms

['PIS\nBEATREA',
 'SULNA\nVERIV',
 'YARLINE\nELI',
 'AMGWILDIE\nMA',
 'IS\nPATTIE\nLA',
 'ORR\nELENY\nAV',
 'A\nTAVINA\nTEN',
 'UKATSIALBEB',
 'ENA\nWALI\nSTA',
 'A\nSYLETA\nGIT',
 'ETHA\nDIELA\nM',
 'AJULAY\nJONNY',
 'IA\nTHARILA\nE',
 'ARGETICHAY\nS',
 'UALLELA\nGEAN',
 'SHAULEH\nGIL',
 'OSTA\nORGA\nVA',
 'WINNA\nRESEB',
 'IAN\nJANDICA',
 'M\nCIFFISON\nS',
 'IE\nMELL\nNINN',
 'UIS\nMARQ\nEUR',
 'ISHA\nASHENE',
 'EANN\nJAHLEE',
 'A\nYONINE\nMAR',
 'NEUKE\nNIAMAD',
 'A\nOXINIE\nKAL',
 'ANCHAL\nCRIST',
 'AN\nDERYNI\nJO',
 'A\nLAUDIE\nLAI',
 'I\nLITAMYENDE',
 'ARY\nZUKING\nK',
 'MARHALMEY\nLA',
 'DALOTHE\nJESU',
 'GALONDE\nDAF',
 'E\nDOLELENI\nS',
 'A\nANNEE\nMANK',
 'US\nKORDESSI',
 'E\nSHYE\nTACYL',
 'BBENCIE\nJAM',
 'TEREN\nJAUNNA',
 'IA\nTINGENIDA',
 'OZIMEY\nKRAND',
 'ERA\nCHRINNA',
 'A\nPHAG\nMORON',
 'A\nCLEODA\nCEI',
 'OL\nVALISHAND',
 'KA\nMAROADRA',
 'NN\nMARSHINCI',
 'ULLA\nFLA\nFVE',
 'ARINA\nKARBEN',
 'ONA\nSANE\nJAN',
 'UENE\nSHORA\nE',
 'A\nGLEN\nLETIN',
 'CTELL\nICATHO',


In [30]:
ll = [ii.splitlines() for ii in nms if "F" in ii.upper()]
flattened_list = [y for x in ll for y in x]
flattened_list2 = [f for f in flattened_list if "FART" in f.upper()]
np.unique(flattened_list2)

array(['FART', 'FARTANNE', 'FARTARETTIA', 'FARTHIEMARY', 'FARTHIROLIT',
       'FARTI', 'FARTIE', 'FARTINE', 'FARTITHIRA', 'FARTOLA', 'FARTTA',
       'FARTY', 'KAIFFARTEU', 'MAIFARTC', 'RAFARTINE'],
      dtype='<U11')

In [42]:
ll = [ii.splitlines() for ii in nms if "C" in ii.upper()]
flattened_list = [y for x in ll for y in x]
flattened_list2 = [f for f in flattened_list if "CHARLES" in f.upper()]
np.unique(flattened_list2)

array(['CHARLES', 'CHARLESI', 'CHARLESROK', 'TACHARLEST'],
      dtype='<U10')

In [5]:
import pandas as pd

In [19]:
aa = np.array(np.transpose(pd.read_csv("names.txt", sep = " ", header = None, )))
aa[0:10, :]

array([['Aache'],
       ['Aanwas'],
       ['Aaron'],
       ['Abaet'],
       ['Abarden'],
       ['Abbadon'],
       ['Abbe'],
       ['Abbo'],
       ['Abe'],
       ['Aberbysion']], dtype=object)

In [21]:
"Sarg" in  aa

False

In [45]:
# read in names and output it in the same format as names.txt
bb = pd.read_table("dist.male.first.txt", header = None, delimiter=r"\s+")

In [46]:
bb.head()

Unnamed: 0,0,1,2,3
0,JAMES,3.318,3.318,1
1,JOHN,3.271,6.589,2
2,ROBERT,3.143,9.732,3
3,MICHAEL,2.629,12.361,4
4,WILLIAM,2.451,14.812,5


In [44]:
bb.loc[1,:]

0    JOHN           3.271  6.589      2
Name: 1, dtype: object

In [47]:
pd.read_csv("names.txt", sep = " ", header = None, )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111
0,Aache,Aanwas,Aaron,Abaet,Abarden,Abbadon,Abbe,Abbo,Abe,Aberbysion,...,Woon,Worf,Wotan,Wrall,Wrathran,Wraythe,Wrothag,Wulf,Wulfgrim,Wuthmon


In [50]:
cc = bb.loc[:, 0]

In [70]:
dd = pd.DataFrame(cc.values.reshape([1, len(bb)]))

In [72]:
dd.to_csv("names2.txt", sep = " ", header = None, index = False)