## Combine Trump tweets and Bible Verses in a Markov Model and generate random text

In [1]:
import re
import pandas as pd
import numpy as np
import sys
import scipy.sparse as sp
from sklearn.preprocessing import normalize
import pickle
import random
import math

### Preprocess

In [2]:
# Read in the bible
with open("bible.txt", "r") as bible:
    bible_text = bible.read()
    bible.close()
    
# Regular expression to consolidate verses into single lines
single_line = re.compile("\n(?!\n)")
bible_text = re.sub(pattern=single_line, string=bible_text, repl=" ")

In [3]:
# Read in the trump tweets
tweets = pd.read_csv("trumpTweets.csv")
tweets_text = tweets["text"]
del tweets

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Append tweets to Bible
bible_and_tweets = bible_text + "\n".join(tweets_text)
with open("bible_and_tweets.txt", "w+", encoding="utf-8") as file:
    file.write(bible_and_tweets)
    file.close()
del bible_text, bible_and_tweets

In [5]:
# Read back in the results
with open("bible_and_tweets.txt", "r", encoding="utf-8") as file:
    bible_and_tweets = file.read()
    file.close()

In [6]:
# Use regular expression to edit data
remove_non_alphabetic = re.compile("[^a-zA-Z \n]")
remove_multiple_newlines = re.compile("\n[\n]+")
remove_multiple_spaces = re.compile(" [ ]+")
remove_http = re.compile("http[^ ]+")
bible_and_tweets = re.sub(pattern=remove_non_alphabetic, string=bible_and_tweets, repl="")
bible_and_tweets = re.sub(pattern=remove_multiple_newlines, string=bible_and_tweets, repl="\n")
bible_and_tweets = re.sub(pattern=remove_http, string=bible_and_tweets, repl=" ")

In [7]:
bible_and_tweets = bible_and_tweets.replace("\n", " ENDLINE\n")
bible_and_tweets = re.sub(pattern=remove_multiple_spaces, string=bible_and_tweets, repl=" ")

In [8]:
bible_and_tweets = bible_and_tweets.lower()
print(bible_and_tweets[0:1000])
print(bible_and_tweets[5000000:5001000])

the old testament of the king james version of the bible endline
 the first book of moses called genesis endline
 in the beginning god created the heavens and the earth endline
 and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters endline
 and god said let there be light and there was light endline
 and god saw the light that it was good and god divided the light from the darkness endline
 and god called the light day and the darkness he called night and the evening and the morning were the first day endline
 and god said let there be a firmament in the midst of the waters and let it divide the waters from the waters endline
 and god made the firmament and divided the waters which were under the firmament from the waters which were above the firmament and it was so endline
 and god called the firmament heaven and the evening and the morning were the second day endline
 and god said let the waters unde

In [9]:
# Trim each line, write result back to file
sentences = []
for line in bible_and_tweets.split("endline\n"):
    sentences.append(line.strip())

with open("text_processed.txt", "w+") as file:
    file.write(" endline\n".join(sentences))
    file.close()

### Create vocabulary and transition matrix

In [10]:
# Read in the full text
with open("text_processed.txt", "r") as file:
    text_full = file.read()
    file.close()
print(len(text_full))

7598559


In [11]:
# Create the vocabulary
vocab = []
for line in text_full.split("\n"):
    for word in line.split(" "):
        vocab.append(word)
        
print(len(vocab))

1366285


In [12]:
# Create a set for the vocab
vocab = set(vocab)
print(len(vocab))

43714


In [13]:
# Create a dict mapping vocab to index
vocab_to_id = {}
id_to_vocab = {}
for index, word in enumerate(vocab):
    vocab_to_id[word] = index
    id_to_vocab[index] = word
    
# Save the objects
with open('vocab_to_id.pkl', 'wb') as f:
    pickle.dump(vocab_to_id, f, pickle.HIGHEST_PROTOCOL)
    f.close()

with open('id_to_vocab.pkl', 'wb') as f:
    pickle.dump(id_to_vocab, f, pickle.HIGHEST_PROTOCOL)
    f.close()

with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [14]:
# Create an empty matrix of zeroes
tf = np.zeros((len(vocab), len(vocab)))
print(type(tf))

<class 'numpy.ndarray'>


In [15]:
# Fill up the tf matrix
for line in text_full.split("\n"):
    words = line.split(" ")
    for i in range(len(words) - 1):
        id1 = vocab_to_id[words[i]]
        id2 = vocab_to_id[words[i + 1]]
        tf[id1, id2] += 1
del text_full

In [16]:
print(sys.getsizeof(tf))
mat = sp.coo_matrix(tf)
print(sys.getsizeof(mat))
del tf

15287310480
56


In [17]:
print(np.argmax(mat.data))
print(mat.row[37577])
print(mat.col[37577])

176561
4926
37980


In [19]:
# Find the k most common word pairings
k = 20
elems = mat.data.copy()
elems.sort()
for elem in elems[-k:]:
    loc = np.where(mat.data == elem)
    id1 = mat.row[loc[0][0]]
    id2 = mat.col[loc[0][0]]
    print(str(id_to_vocab[id1]) + " " + id_to_vocab[id2] )

the son
son of
on the
said unto
thank you
will be
of israel
out of
unto the
and they
all the
shall be
for the
i will
and he
to the
in the
and the
the lord
of the


The most common consectuive pair of words, with 13245 instances, was "of the"

In [20]:
# Normalize the matrix
mat_norm = normalize(mat, norm='l1', axis=1)

In [21]:
# Save the normalized matrix
sp.save_npz("norm_matrix.npz", mat_norm)
del mat, mat_norm

### Actually make random sentences

In [4]:
# Read back in the matrix and dictionary objects
mat_norm = sp.load_npz("norm_matrix.npz")
with open('vocab_to_id.pkl', 'rb') as f:
    vocab_to_id = pickle.load(f)
    f.close()
with open('id_to_vocab.pkl', 'rb') as f:
    id_to_vocab = pickle.load(f)
    f.close()
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
    f.close()

In [6]:
start_word = "hillary"
if start_word not in vocab:
    print("Invalid start word")

sentence = start_word
while start_word != "endline":
    row_ind = vocab_to_id[start_word]
    prob_dist = np.array(mat_norm.getrow(row_ind).todense())[0]
    next_ind = np.random.choice(range(len(vocab)), p=prob_dist)
    start_word = id_to_vocab[next_ind]
    sentence += " " + start_word
print(sentence)

hillary is for his cupbearers and asahel and advertising desperate endline


## Results

### start word: 'the'

* the nations shall many women working on the people when israel which are known loser who hath sent benaiah the white as i hope of macedonia and invent to the appearance they not nor taking over theaters this is mccarthyism
* the original costume was susan berry
* the midst of seventy years agothis is mad sometimes referred to be long massive tax just named eutychus being stubborn can call it
* the bar them that thou with his works nay but you have sinned against the lord

### start word: 'who'
* who escaped alone and treasuries shelemiah shemariah and moab and this country is the one came up he defrauded us is through phenice and thou shalt not hearken unto aaron the obamacare
* who was a son of them thou shalt thou art not the word again despite obamas terrible

### start word: "hillary"
* hillary lying vanities but the goodness thou shalt hearken ye that were in all this day in twelve hours left to go way but of the lord of israel and her head
* hillary off my desire to trump in her shall surely pay attentionand good night and my mother has zero hours of the lord sent ishmael the stranger for the iconic old article in nyc deal w robot rubio sat there amp me and have our hand and the media panel we need a lot of the apprentice no more interesting poll thank you tomorrow

## Adding a "memory" component to the analysis

We are going to increase the look back of our analysis to include not a single word, but the last 3 words.

In [27]:
# Specify the starting sentence
start_words = "i am the".split(" ")

# Specify the choice of weigts
weight_choice = "random"
gamma = 0.5
if weight_choice == "uniform":
    weights = np.zeros(len(start_words)) + 1
    weights = weights / len(start_words)
elif weight_choice == "decaying":
    weights = [0] * len(start_words)
    for i in range(len(start_words)):
        weights[i] = math.pow(gamma, i)

    weights.reverse()
    weights = np.array(weights)/sum(weights)
else:
    weights = np.array(random.sample(range(0,100), len(start_words)))
    weights = weights / sum(weights)

for word in start_words:
    if word not in vocab:
        print("Invalid vocab word")
     
current_word = start_words[-1]

# Number of steps to look back
lookback = len(start_words)

while current_word != "endline":
    # Get the last "n" words, where "n" is the lookback amount
    lookback_words = start_words[-(lookback):]
    
    # Start with the furthest back word and use it's distribution as the start
    row_ind = vocab_to_id[start_words[-len(start_words)]]
    prob_dist = weights[0] * np.array(mat_norm.getrow(row_ind).todense())[0]
    
    # For all the rest of the words, add the weighted probability distribution to the result
    for i, word in enumerate(lookback_words[1:]):
        row_ind = vocab_to_id[word]
        prob_dist += weights[i + 1] * np.array(mat_norm.getrow(row_ind).todense())[0]
    
    # Make sure we don't have repeats
    current_word = start_words[-1]
    while current_word in lookback_words:
        next_ind = np.random.choice(range(len(vocab)), p=prob_dist)
        current_word = id_to_vocab[next_ind]
    
    # Append the predicted word the results
    start_words.append(current_word)
    
print(" ".join(start_words[:-1]))

i am the three will vote was able to shur and never have consulted with will also have to comprehend


## Results

### start words "donald trump is":

#### weight: decaying

* donald trump is thinking an big guy leads to finally someone realize you for years the holy one that o trump house thou shalt j trump jeb used the wilderness a trump you believe

* donald trump is u the re tax their credit trump for

#### weight: uniform

* donald trump is at the trump president j jimmy trumps son

* donald trump is we are ye might j trump when ill be trump only run the trump house youll j trump you your words money trump no shepherd he of the your arguing enemies shall be take great thx

#### weight: random

* donald trump is have you i run donald trump international unlike anything worth trump

* donald trump is now hear the children pot of trump for

### "thou shalt not":

#### weight: decaying

* though shalt not bless the king hand of illegals to bring ye another will draw

* though shalt not sincerely increase supposing the world they say again thou being wilt twelve men they cause some i of am the best interviews and delaying his the bramble and abigail the nabals wife looked bare all my lips soul

#### weight: uniform

* though shalt not i in will he i is want to big we i too he turned a their way kings counsel

* though shalt not save thou shalt gleaned he they that thou me art and i will cast they sank i the have jesus remained u debate would have love they the every tea party i had prayed brought my he i that despiseth atlantic thou make livest these things ten list the on foxandfriends this great i feel say the i will save bless alive thee he the rested themselves what in greensboro your north it country charles he missouri hath trump you they a lacked mother opportunity in any case i disgraceful always we on actually the prophets and im love from

#### weight: random

* though shalt not go use twitter trump

* though shalt not he then let us the freeing super up tuesday morning on television a it is not enough repeal signatures fools big league crowd thou shalt love roast with and an i emotional apprenticenbc still her daughters times fools in feedeth thy among thorns all thy the matters who dennis opposes nd choice worst ye thing say who unto them absalom

### "i am the":

#### weight: decaying

* i am the heavenly have forsaken covenanted with no a samaritan as that trust forsake ye begin to the ships asher and they went in out time gave an iraq will it appoint is you trump wins will written heal like neither can could only in truth not their healthcare would plan will be totally proved did are change bring back will i need your to hand pretending of not spare them look and thou he wonder of where to hear used the win will went and he testifieth thou disquieted shalt win

* i am the have young havent did was being offered merciful and man feareth have god and with thee o hosted by andrewejohnson

#### weight: uniform

* i am the ephraimites best which was great

* i am the never have smote i them could not try cannot to get trump it am persuaded the law kindred for yourselves therefore said thus love congratulate have him a full of would a prenup

#### weight: random

* i am the pray will say rebuke am will might say think always made for sat follow gave wonder will love have know just shall was see have just scatter will say looked gave have i said am testified therefore let have unto am may understanding hope have will hate say said daughter shall did have wish love cant let that say go may take am will should dont wrote rejoice will am sent thank will delivered came run should say will am tell his pray saw am will pray know cannot am have had love appointed was will agree find pray go loved completely am agree said had might truly see send will am watched have know am wonder cut will want have shall again would tell will vote need play will darius totally speak may beheld them hope will am

* i am the three will vote was able to shur and never have consulted with will also have to comprehend