## Combine Trump tweets and Bible Verses in a Markov Model and generate random text

In [114]:
import re
import pandas as pd
import numpy as np
import sys
import scipy.sparse as sp
from sklearn.preprocessing import normalize
import pickle

### Preprocess

In [9]:
# Read in the bible
with open("bible.txt", "r") as bible:
    bible_text = bible.read()
    bible.close()
    
# Regular expression to consolidate verses into single lines
single_line = re.compile("\n(?!\n)")
bible_text = re.sub(pattern=single_line, string=bible_text, repl=" ")

The Old Testament of the King James Version of the Bible



 The First Book of Moses:  Called Genesis

 1:1 In the beginning God created the heavens and the earth.
 1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.
 1:3 And God said, Let there be light: and there was light.
 1:4 And God saw the light, that it was good: and God divided the light from the darkness.
 1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.
 1:6 And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
 1:7 And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so.
 1:8 And God called the firmament Heaven. And the evening and the morning were the second day.
 1:9 And God said, Let the waters under the heaven be

In [12]:
# Read in the trump tweets
tweets = pd.read_csv("trumpTweets.csv")
tweets_text = tweets["text"]
del tweets

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# Append tweets to Bible
bible_and_tweets = bible_text + "\n".join(tweets_text)
with open("bible_and_tweets.txt", "w+", encoding="utf-8") as file:
    file.write(bible_and_tweets)
    file.close()
del bible_text, bible_and_tweets

In [35]:
# Read back in the results
with open("bible_and_tweets.txt", "r", encoding="utf-8") as file:
    bible_and_tweets = file.read()
    file.close()

In [36]:
# Use regular expression to edit data
remove_non_alphabetic = re.compile("[^a-zA-Z \n]")
remove_multiple_newlines = re.compile("\n[\n]+")
remove_multiple_spaces = re.compile(" [ ]+")
remove_http = re.compile("http[^ ]+")
bible_and_tweets = re.sub(pattern=remove_non_alphabetic, string=bible_and_tweets, repl="")
bible_and_tweets = re.sub(pattern=remove_multiple_newlines, string=bible_and_tweets, repl="\n")
bible_and_tweets = re.sub(pattern=remove_http, string=bible_and_tweets, repl=" ")

In [37]:
bible_and_tweets = bible_and_tweets.replace("\n", " ENDLINE\n")
bible_and_tweets = re.sub(pattern=remove_multiple_spaces, string=bible_and_tweets, repl=" ")

In [39]:
bible_and_tweets = bible_and_tweets.lower()
print(bible_and_tweets[0:1000])
print(bible_and_tweets[5000000:5001000])

the old testament of the king james version of the bible endline
 the first book of moses called genesis endline
 in the beginning god created the heavens and the earth endline
 and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters endline
 and god said let there be light and there was light endline
 and god saw the light that it was good and god divided the light from the darkness endline
 and god called the light day and the darkness he called night and the evening and the morning were the first day endline
 and god said let there be a firmament in the midst of the waters and let it divide the waters from the waters endline
 and god made the firmament and divided the waters which were under the firmament from the waters which were above the firmament and it was so endline
 and god called the firmament heaven and the evening and the morning were the second day endline
 and god said let the waters unde

In [41]:
# Trim each line, write result back to file
sentences = []
for line in bible_and_tweets.split("endline\n"):
    sentences.append(line.strip())

with open("text_processed.txt", "w+") as file:
    file.write(" endline\n".join(sentences))
    file.close()

### Create vocabulary and transition matrix

In [55]:
# Read in the full text
with open("text_processed.txt", "r") as file:
    text_full = file.read()
    file.close()
print(len(text_full))

7598559


In [56]:
# Create the vocabulary
vocab = []
for line in text_full.split("\n"):
    for word in line.split(" "):
        vocab.append(word)
        
print(len(vocab))

1366285


In [57]:
# Create a set for the vocab
vocab = set(vocab)
print(len(vocab))

43714


In [115]:
# Create a dict mapping vocab to index
vocab_to_id = {}
id_to_vocab = {}
for index, word in enumerate(vocab):
    vocab_to_id[word] = index
    id_to_vocab[index] = word
    
# Save the objects
with open('vocab_to_id.pkl', 'wb') as f:
    pickle.dump(vocab_to_id, f, pickle.HIGHEST_PROTOCOL)

with open('id_to_vocab.pkl', 'wb') as f:
    pickle.dump(id_to_vocab, f, pickle.HIGHEST_PROTOCOL)

In [59]:
# Create an empty matrix of zeroes
tf = np.zeros((len(vocab), len(vocab)))
print(type(tf))

<class 'numpy.ndarray'>


In [60]:
# Fill up the tf matrix
for line in text_full.split("\n"):
    words = line.split(" ")
    for i in range(len(words) - 1):
        id1 = vocab_to_id[words[i]]
        id2 = vocab_to_id[words[i + 1]]
        tf[id1, id2] += 1
del text_full

In [61]:
print(sys.getsizeof(tf))
mat = sp.coo_matrix(tf)
print(sys.getsizeof(mat))
del tf

15287310480
56


In [62]:
print(np.argmax(mat.data))
print(mat.row[37577])
print(mat.col[37577])

37577
4716
7159


In [78]:
# Find the k most common word pairings
k = 20
elems = mat.data.copy()
elems.sort()
for elem in elems[-k:]:
    loc = np.where(mat.data == elem)
    id1 = mat.row[loc[0][0]]
    id2 = mat.col[loc[0][0]]
    print(str(id_to_vocab[id1]) + " " + id_to_vocab[id2] )

the son
son of
on the
said unto
thank you
will be
of israel
out of
unto the
and they
all the
shall be
for the
i will
and he
to the
in the
and the
the lord
of the


The most common consectuive pair of words, with 13245 instances, was "of the"

In [76]:
# Normalize the matrix
mat_norm = normalize(mat, norm='l1', axis=1)

In [80]:
# Save the normalized matrix
sp.save_npz("norm_matrix.npz", mat_norm)
del mat, mat_norm

### Actually make random sentences

In [117]:
# Read back in the matrix and dictionary objects
mat_norm = sp.load_npz("norm_matrix.npz")
with open('vocab_to_id.pkl', 'rb') as f:
    pickle.load(f)
with open('id_to_vocab.pkl', 'rb') as f:
    pickle.load(f)

In [120]:
start_word = "who"
if start_word not in vocab:
    print("Invalid start word")

sentence = start_word
while start_word != "endline":
    row_ind = vocab_to_id[start_word]
    prob_dist = np.array(mat_norm.getrow(row_ind).todense())[0]
    next_ind = np.random.choice(range(len(vocab)), p=prob_dist)
    start_word = id_to_vocab[next_ind]
    sentence += " " + start_word
print(sentence)

who was a son of them thou shalt thou art not the word again despite obamas terrible endline


## Results

### start word: 'the'

* the nations shall many women working on the people when israel which are known loser who hath sent benaiah the white as i hope of macedonia and invent to the appearance they not nor taking over theaters this is mccarthyism
* the original costume was susan berry
* the midst of seventy years agothis is mad sometimes referred to be long massive tax just named eutychus being stubborn can call it
* the bar them that thou with his works nay but you have sinned against the lord

### start word: 'who'
* who escaped alone and treasuries shelemiah shemariah and moab and this country is the one came up he defrauded us is through phenice and thou shalt not hearken unto aaron the obamacare
* who was a son of them thou shalt thou art not the word again despite obamas terrible

In [102]:
mat_norm.getrow(37577)

<1x43714 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>