In [51]:
import random
import re
import string

## 1. Read in a file

In [19]:
def read_file(filename):
    """Parses a text file.
    """
    f = open(filename, 'r')
    f = f.readlines()
    #f.close() --?
    return f

Let's read in a file.

In [20]:
markov_input_text = read_file('/Users/Ben/Documents/tractatus.txt')

In [21]:
type(markov_input_text)

list

##2. From file, get initial chunk candidates of a given

### (a) Level
- What's our base unit? Could be words, characters, integers, etc.
- We're working with text data, and we don't want to scramble too much--so: words

### (b) Order
- *How many* members of a given level are to make up a chunk? 
- For us, order-n will progress by n-word chunks (these are the 'n-grams' Michael will be presenting on tomorrow! get psyched)

###2a. get candidates

In [99]:
def get_candidates(f):
    #f = read_file(f)
    
    candidates = ''
    while candidates == '' or candidates == '\n':
        for index, line in enumerate(f):
            
            #generate random numbers less than index + n
            uno = random.randrange(index + 2)  
            dos = random.randrange(index + 22)
            #probably not the only way to do this
            
            if uno == dos:
                candidates += line
    
    #candidates = candidates.split() #split text into chunks -- other ways to do this?
    
    print "initial candidate lines:\n", candidates
    return candidates

Re: line 7: a list is analagous to a Pandas series of indices + elements:

In [65]:
a = ['line 1','line 2','line 3']
print type(a)

for index, line in enumerate(a):
    print index, line

<type 'list'>
0 line 1
1 line 2
2 line 3


In [63]:
import pandas as pd
b = pd.Series(['line 1','line 2','line 3'])
print type(b)

print b

<class 'pandas.core.series.Series'>
0    line 1
1    line 2
2    line 3
dtype: object


Back to Markov:

Let's meet our candidates.

In [109]:
candies = get_candidates(markov_input_text)
candies

initial candidate lines:
expressed--the more the nail has been hit on the head--the greater will be
2.15 The fact that the elements of a picture are related to one another in
true, their truth could only be the result of a fortunate accident.



'expressed--the more the nail has been hit on the head--the greater will be\n2.15 The fact that the elements of a picture are related to one another in\ntrue, their truth could only be the result of a fortunate accident.\n'

###2b. split candidates str into order-1 chunks

Now let's split our candidate lines into basic, order-1 chunks, in order to arrive at a list of candidates.

In [118]:
def split_text_into_chunks(candidates):
    candidates = candidates.split()
    return candidates

In [120]:
candidates = split_text_into_chunks(candies)
candidates

['expressed--the',
 'more',
 'the',
 'nail',
 'has',
 'been',
 'hit',
 'on',
 'the',
 'head--the',
 'greater',
 'will',
 'be',
 '2.15',
 'The',
 'fact',
 'that',
 'the',
 'elements',
 'of',
 'a',
 'picture',
 'are',
 'related',
 'to',
 'one',
 'another',
 'in',
 'true,',
 'their',
 'truth',
 'could',
 'only',
 'be',
 'the',
 'result',
 'of',
 'a',
 'fortunate',
 'accident.']

- How could we resolve the double-dash issue?
- How might we split if our base unit were characters?

###2c. from candidates, select chunk at which to begin Markovized text

- Chunk is to be of appropriate order and level.

In [125]:
def select_chunk(candidates, order=1, level=False):

    text = [ ]

    start = random.randint(0, len(candidates) - order)
    chunk = candidates[start]
    
    if order == 1:
        pass
    else:
        for i in range(1, order):  
            chunk = chunk + ' ' + candidates[start + i]
    text.append(chunk) 
    print "initial chunk: ", text
    return text

In [264]:
beginning = select_chunk(candidates)
beginning

initial chunk:  ['the']


['the']

#3. Get more chunks

- From our initial chunk, `beginning`, we're going to generate a subsequent chunk of the same level and order. (3a-d)

- We'll iterate (loop) this process: new chunk generation "memorylessly" based solely on a single, directly preceding chunk. We'll stop when our chunked-together text attains a given length, returning `text` as a string. (3e)

###3a. check for a possible match

In [355]:
def check_for_possible_match(chunklist, line, order, n):
    candidate_str = ''
    if line[n] == chunklist[-1]:     
        candidate_str += line[n+1]  # keep track of chunk following latest text-chunk                  
        if order == 1:
            pass
        else:
            for i in range(2, order+1):
                candidate_str += ' ' + line[n+i]              
    return candidate_str

###3b. gather possible matches together

In [356]:
def gather_next_chunks(f, chunklist, order):
    candidates = []
    acceptable_characters = string.letters + string.digits + ' '

    for line in enumerate(f):  
        if chunklist[-1] in line[1]:  # check for a possible match 
            line = line[1].split()
            for n in range(0, len(line) - order):             
                candidate_str = check_for_possible_match(chunklist, line, order, n)
                candidate_str = filter(lambda x: x in acceptable_characters, candidate_str)                                                           
                candidates.append(candidate_str)
                    
    return candidates

###3c. filter out possible matches that are empty strings

In [357]:
def edit_chunks(candidates):
    candidates_edited = []
    for i in candidates:
        if i != '':       # empty strings had been a problem
            candidates_edited.append(i)
    return candidates_edited    

###3d. select a single next chunk from the list of possible matches

In [358]:
def pick_next_chunk(candidates, next_chunk): 
    if candidates != []:
        r = random.randint(0, len(candidates) - 1)
        next_chunk = candidates[r]
    return next_chunk

###3e. iterate through this process

In [None]:
def chunk_seq(f, text, order=1, length=50):
        """Generates a string of chuncks from an initial chunk.
        
        Input: 
        f = lines in file
        text = list containing initial chunk
        order
        length
        
        Output: 
        markov = a string, wordcount == length, of chunks
        """
        next_chunk = text[-1]
        chunklist = next_chunk.split()
        
        while len(text) * order < length:
            candidates = gather_next_chunks(f, chunklist, order) #1
            candidates_edited = edit_chunks(candidates)
            #print candidates_edited
            next_chunk = pick_next_chunk(candidates_edited, next_chunk)
            text.append(next_chunk)
        
        markov = ' '.join(text)
        return markov

- I've cheated a bit in terms of how I'm using `-1` instead of `-order`. How might we think about fixing this? At what point is it worth fixing? When might a "purer" transformation from chunk to chunk be desirable?
- what if there's only one instance of a chunk in a text?
- How might we optimize `check_for_possible_match` and `gather_next_chunks`?

#Finally...

In [265]:
chunk_seq(markov_input_text, beginning)

['thoughts', 'problems', 'reason', 'logic', 'book', 'aim', 'book', 'expression', 'limit', 'limit', 'other', 'limit', 'reason', 'thoughts', 'stimulation', 'first', 'better', 'thoughts', 'nail', 'headthe', 'accomplishment', 'taskMay', 'other', 'truth', 'thoughts', 'final', 'problems', 'second', 'of', 'case', 'totality', 'facts', 'totality', 'case', 'case', 'world', 'case', 'case', 'same', 'casea', 'existence', 'possibility', 'state', 'beginning', 'province', 'possibility', 'possibility', 'object', 'same', 'thing', 'visual', 'sense', 'possibility', 'form', 'propositions', 'substance', 'world', 'world', 'real', 'world', 'configuration', 'same', 'only', 'others', 'other', 'case', 'world', 'unalterable', 'subsistent', 'same', 'links', 'structure', 'state', 'possibility', 'structures', 'world', 'existence', 'existence', 'existence', 'world', 'existence', 'elements', 'picture', 'elements', 'picture', 'representatives', 'elements', 'structure', 'possibility', 'pictorial', 'picture', 'possibilit

'the operation result help problems one innermost totality logic bracketed two logical unsubstantial logical picture variable symbol construction fairytale other shifting key form combination situation soulthe generality whole limits understanding same general terms thought one fact general propositions former great book subject result description mark form propositions righthand world operation'

In [266]:
markov = chunk_seq(markov_input_text, beginning)
markov

'the operation result help problems one innermost totality logic bracketed two logical unsubstantial logical picture variable symbol construction fairytale other shifting key form combination situation soulthe generality whole limits understanding same general terms thought one fact general propositions former great book subject result description mark form propositions righthand world operation'

In [267]:
beginning = select_chunk(candidates)
chunk_seq(markov_input_text, beginning)

initial chunk:  ['only']
['by', 'be', 'mention', 'determine', 'by', 'distinction', 'it', 'be', 'speak', 'in', 'in', 'with', 'thing', 'the', 'one', 'point', 'they', 'substantives', 'in', 'in', 'in', 'one', 'one', 'in', 'to', 'determinate', 'what', 'from', 'possible', 'if', 'in', 'of', 'one', 'by', 'in', 'p', 'general', 'because', 'be', 'the', 'one', 'one', 'one', 'be', 'a', 'things', 'a', 'because', 'as', 'glance', 'what', 'in', 'it', 'must', 'be', 'the', 'tautologies', 'because', 'in', 'talk', 'connexions', 'with', 'by', 'a', 'necessity', 'be', 'necessity', 'impossibility', 'to', 'the', 'is', 'to', 'where', 'where', 'where', 'strictly']
['by', 'be', 'mention', 'determine', 'by', 'distinction', 'it', 'be', 'speak', 'in', 'in', 'with', 'thing', 'the', 'one', 'point', 'they', 'substantives', 'in', 'in', 'in', 'one', 'one', 'in', 'to', 'determinate', 'what', 'from', 'possible', 'if', 'in', 'of', 'one', 'by', 'in', 'p', 'general', 'because', 'be', 'the', 'one', 'one', 'one', 'be', 'a', 'thi

'only in in one be by be in distinction to to with glance where the mention be point mention p a must a impossibility because in one be impossibility in a be the where to is glance it in to point p as is one one they from the one'

##order-2

In [274]:
new_candidates = get_candidates(markov_input_text)
new_candidates = split_text_into_chunks(new_candidates)
new_beginning = select_chunk(new_candidates, order=2)
chunk_seq(markov_input_text, new_beginning, order=2)

initial candidate lines:


itself, would be illegitimate.) In a certain sense, we cannot make mistakes
5.557 The application of logic decides what elementary propositions there

initial chunk:  ['decides what']
['can be', 'cannot be', 'I have', 'is the', 'constitute this', 'subsists independently', 'is the', 'is unalterable', 'is changing', 'it depicts', 'it depicts', 'it depicts', 'this means', 'made it', 'they are', 'is signified', 'can be', 'would be', 'is superficially', 'is essential', 'all propositions', 'all symbols', 'is common', 'is common', 'its meaning', 'the logic', 'they represent', 'they are', 'constitutes the', 'is the', 'we now', 'was being', 'they signify', 'Pp signified', 'circumstances I', 'black and', 'is affirmed', 'is negated', 'can be', 'cannot be', 'can be', 'cannot be', 'Frege and', 'the schemata', 'they say', 'is essential', 'is not', 'is essential', 'propositions I', 'is known', 'has to', 'the bases', 'p said', 'one might', 'all propositions', 'is common', 'i

'decides what there must black and is affirmed can be cannot be is essential the logic is higher the net the net I have is negated they are its meaning made it is not we do the logic the law is common it depicts can be the net is changing'

#order-3

In [275]:
newer_candidates = get_candidates(markov_input_text)
newer_candidates = split_text_into_chunks(newer_candidates)
newer_beginning = select_chunk(newer_candidates, order=3)
chunk_seq(markov_input_text, newer_beginning, order=3)

initial candidate lines:

I am not mistaken in this belief, then the second thing in which the of
follow from them come true. And similarly he could not create a world in

initial chunk:  ['not mistaken in']
['itor at least', 'order to be', 'language that the', 'two things the', 'it and on', 'this belief then', 'which the of', 'logical space are', 'a state of', 'states of affairs', 'the province of', 'states of affairs', 'so far as', 'two different roles', 'a space of', 'infinite space A', 'the visual field', 'states of affairs', 'common in which', 'which case it', 'a determinate relation', 'which objects are', 'a state of', 'logical space the', 'a determinate way', 'the same way', 'this way also', 'a picture and', 'common with reality', 'order to be', 'common with reality', 'order to be', 'common with what', 'language anything that', 'geometry to represent', 'its projective relation', 'its elements the', 'a determinate relation', 'writing or print', 'a printed proposition', 'a certain

'not mistaken in a proposition All the logic of the same way which certain propositions which they have opposite directions to no way justifies this way he a certain relation philosophy again and language anything that fact significant that language language cannot front of fxfor common in which itself the whole'

#order-4

In [276]:
newest_candidates = get_candidates(markov_input_text)
newest_candidates = split_text_into_chunks(newest_candidates)
newest_beginning = select_chunk(newest_candidates, order=4)
chunk_seq(markov_input_text, newest_beginning, order=4)

initial candidate lines:
another.
picture of our speech. And yet these sign-languages prove to be pictures,

5.47 It is clear that whatever we can say in advance about the form of all

initial chunk:  ['sign-languages prove to be']
['understood only by someone', 'said at all can', 'said clearly and what', 'able to think what', 'in language that the', 'the case or not', 'a sort of accident', 'situated in infinite space', 'red must have some', 'resolved into a statement', 'objects if the world', 'a picture it must', 'something identical in a', 'a picture of the', 'able to depict itcorrectly', 'of anything illogical since', 'said that God could', 'contrary to the laws', 'represented by us spatially', 'described but not given', 'expressed in such a', 'named Signs are their', 'possible is the requirement', 'right or wrong A', 'nonsensical if the complex', 'seen from an indeterminateness', 'dissected any further by', 'anatomized by means of', 'explained by means of', 'constant and everything

'sign-languages prove to be quite possible to choose bed a feature of that which makes it bed a feature of red must have some gathered only from the seen from the two the following to say able to depict itcorrectly a picture of the a logic even if a picture of the'

Testing

In [354]:
test_candidates = get_candidates(markov_input_text)
test_candidates = split_text_into_chunks(test_candidates)
test_beginning = select_chunk(test_candidates, order=5)
#print type(test_beginning)
test = chunk_seq(markov_input_text, test_beginning, order=5)
print test

initial candidate lines:

outward form of the clothing is not designed to reveal the form of the

5.251 A function cannot be its own argument, whereas an operation can take

initial chunk:  ['is not designed to reveal']
is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal is not designed to reveal


Original (way way way too long) chunk_seq function

In [None]:
def chunk_seq(f, text, order=1, length=50):
        """Generates a string of chuncks from an initial chunk.
        
        Input: 
        f = lines in file
        text = list containing initial chunk
        order
        length
        
        Output: 
        markov = a string, wordcount == length, of chunks
        """
        next_chunk = text[-1]
        chunklist = next_chunk.split()
        #text = text.split()
        acceptable_characters = string.letters + string.digits + ' '

        while len(text) * order < length:
            
            ########### I. Gather possible next chunks ###########
            candidates = []
            for line in enumerate(f):   
                if chunklist[-1] in line[1]:  # check for a possible match 
                    line = line[1].split()
                    
                    
                    for n in range(0, len(line) - order):
                        candidate_str = ''
                        if line[n] == chunklist[-1]:     
                            candidate_str += line[n+1]  # keep track of chunk following latest text-chunk                  
                            if order == 1:
                                pass
                            else:
                                for i in range(2, order+1):
                                    candidate_str += ' ' + line[n+i]              
                                    
                        candidate_str = filter(lambda x: x in acceptable_characters, candidate_str)                                                           
                        candidates.append(candidate_str)
                        
            
            ########### II. Edit chunks ###########
            candidates_edited = []
            for i in candidates:
                if i != '':       # empty strings had been a problem
                    candidates_edited.append(i)
            print candidates_edited
            
             
            ########### III. Pick next chunk ###########
            #if candidates_edited == []:
                
                #next_chunk = text[-order:] #if no future instances, we are to append last order in previous 
            if candidates_edited != []:
                r = random.randint(0, len(candidates_edited) - 1)
                next_chunk = candidates_edited[r]
            text.append(next_chunk)
                #print next_chunk
            
                
        markov = ' '.join(text)
        return markov