In [None]:
# To build a Markov chain we need states and transition probabilities; here the words act as the states.
# For every word we need the probability of each word that can come next.
# We create a bag of words and then calculate each transition probability by dividing the number of times a word came after the current word in the state transition diagram by the total number of outgoing transitions from the current word.
# Over a large enough dataset, some words will naturally have a higher probability of following a given word.
# This way we can generate a whole story from a set of similar stories.

In [None]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import pickle

In [None]:
# Fetch the dataset archive from Google Drive with gdown.
import gdown

file_id = '1IgllT89j3j0_pkp4xtcadRitk2Faq9N6'
output_file = 'sciFi.zip'
# Direct-download URL built from the Drive file id.
download_url = f'https://drive.google.com/uc?id={file_id}'

# Left as the last expression so the cell still displays the value returned by gdown.
gdown.download(download_url, output_file, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1IgllT89j3j0_pkp4xtcadRitk2Faq9N6
From (redirected): https://drive.google.com/uc?id=1IgllT89j3j0_pkp4xtcadRitk2Faq9N6&confirm=t&uuid=7b569ed7-79c7-47af-8abc-11ab8b9c0b23
To: /content/sciFi.zip
100%|██████████| 58.4M/58.4M [00:01<00:00, 32.6MB/s]


'sciFi.zip'

In [None]:
!unzip sciFi.zip

Archive:  sciFi.zip
  inflating: internet_archive_scifi_v3.txt  


In [None]:
# Load the raw corpus: keep every non-empty (stripped) line that appears before
# the first line consisting solely of '----'.
txt = []
path = '/content/internet_archive_scifi_v3.txt'
# An explicit encoding keeps the read independent of the platform's default.
with open(path, encoding='utf-8') as f:
  for line in f:
    line = line.strip()
    if line == '----':
      break
    if line:
      txt.append(line)

# Report the total number of characters kept. Summing over txt avoids reading
# the loop variable after the loop, which is undefined for an empty file and
# otherwise only describes the last line that happened to be read.
print(sum(len(kept_line) for kept_line in txt))

149326360


In [None]:
# The notebook only does `from nltk... import ...`, which does not bind the name
# `nltk`; import the package here so this cell runs on a fresh kernel.
import nltk

# Fetch the 'punkt' tokenizer data.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource for
# word_tokenize - confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Normalise the corpus into one flat list of lowercase, purely alphabetic tokens.
# The hyphen is escaped so the class removes a literal '-'. Left bare between
# '=' and the backslash it defined the character range '=' .. '\', which matched
# unintended characters (e.g. '[' and A-Z) and never removed hyphens.
punctuation_pattern = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\-\\]")

cleaned_txt = []
for line in txt:
  stripped = punctuation_pattern.sub("", line.lower())
  # Keep only alphabetic tokens; anything containing digits or symbols is dropped.
  cleaned_txt.extend(token for token in word_tokenize(stripped) if token.isalpha())

print(len(cleaned_txt))

26154071


In [None]:
def make_markov_model(cleaned_stories, n_gram=2):
    """Build an n-gram Markov model from a sequence of word tokens.

    A state is ``n_gram`` consecutive tokens joined by single spaces. For every
    position in the corpus the state starting there is linked to the state made
    of the ``n_gram`` tokens that immediately follow it.

    Args:
        cleaned_stories: sliceable sequence (e.g. list) of string tokens.
        n_gram: number of tokens per state; must be at least 1.

    Returns:
        dict mapping each current state to a dict of
        ``{next_state: transition probability}``; the probabilities of one
        state's transitions sum to 1. Empty when the corpus holds fewer than
        ``2 * n_gram`` tokens.

    Raises:
        ValueError: if ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    markov_model = {}
    # Each step reads 2 * n_gram tokens (current state + next state), so the
    # last valid start index is len - 2 * n_gram. The previous bound
    # (len - n_gram - 1) was only correct for n_gram == 2: it overran the
    # sequence for larger n_gram and skipped the final transition for n_gram == 1.
    for i in range(len(cleaned_stories) - 2 * n_gram + 1):
        curr_state = " ".join(cleaned_stories[i:i + n_gram])
        next_state = " ".join(cleaned_stories[i + n_gram:i + 2 * n_gram])
        transitions = markov_model.setdefault(curr_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1

    # Turn the raw counts into transition probabilities.
    for transitions in markov_model.values():
        total = sum(transitions.values())
        # Only existing keys are reassigned, so the dict size is stable while iterating.
        for state, count in transitions.items():
            transitions[state] = count / total

    return markov_model

In [None]:
# Build the model from the cleaned tokens; n_gram=2 is the function's default, spelled out here.
markov_model = make_markov_model(cleaned_txt, n_gram=2)

In [None]:
# Every key of the model is one distinct state, so the dict length is the state count.
print("number of states = ", len(markov_model))

number of states =  4897107


In [None]:
def generate_story(markov_model, limit=100, start='my god'):
    """Generate text by walking the Markov model from a start state.

    Args:
        markov_model: dict mapping a state to ``{next_state: probability}``.
        limit: maximum number of transitions to take.
        start: state the walk begins at; it is also the first part of the text.

    Returns:
        The visited states separated by single spaces, with a trailing space.
        The walk stops early if it reaches a state that has no outgoing
        transitions in the model.

    Raises:
        KeyError: if at least one transition is requested and ``start`` is not
            a state of the model.
    """
    if limit > 0 and start not in markov_model:
        raise KeyError(f"start state {start!r} is not a state of the markov model")

    visited = [start]
    curr_state = start
    steps = 0
    while steps < limit:
        transitions = markov_model.get(curr_state)
        if not transitions:
            # Dead end: this state never occurs as a current state in the model,
            # so there is nothing to sample from.
            break
        # Weighted draw of the next state according to the transition probabilities.
        curr_state = random.choices(list(transitions.keys()),
                                    weights=list(transitions.values()))[0]
        visited.append(curr_state)
        steps += 1
    return " ".join(visited) + " "

In [None]:
# The model's states are built from lowercased text, so normalise the typed word
# the same way (and drop accidental surrounding whitespace) before searching.
beginWith = input("Enter a word: ").strip().lower()

Enter a word: work


In [None]:
# Collect every model state that contains the requested text. This is a
# substring match, so e.g. "work" also matches states containing "fireworks".
keys = list(markov_model.keys())
possibles = [state for state in keys if beginWith in state]

# Print the match count and a short preview rather than every match: the full
# list can run to thousands of lines and flood the cell output.
preview_size = 20
print(f"{len(possibles)} matching states; showing up to {preview_size}:")
for state in possibles[:preview_size]:
  print(state)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
duenna work
workers love
works good
workers was
as workmen
workmen in
see workmen
dozen workmen
through workmen
overworking and
overworked you
workers furiously
leather worker
worker past
work successfully
working left
fireworks pinwheel
machines worked
worker five
costs worked
slix works
works seemed
paperwork would
race working
working sity
work hauling
nothing working
worked behind
perhaps working
work garrick
roosenburg working
work practically
was stonework
stonework gargoyles
gravediggers work
vernes work
screens worked
workbenches were
workbench in
earthman worked
working credo
workings marner
works plorvash
workable then
than guesswork
guesswork in
maintenance workshops
workshops clinics
workshop illuminated
for brainwork
brainwork hell
table working
working meticulously
giant patchwork
patchwork windows
work impossible
music fireworks
fireworks firewater
circus work
finish working
work pub
ancestors worked
worked

In [None]:
# Fail with a clear message when the search found nothing to start from.
if not possibles:
  raise ValueError(f"no model state contains {beginWith!r}; enter a different word")

# Pick one matching state at random as the start. It gets its own name so that
# beginWith keeps the word typed by the user and the search cell stays re-runnable.
start_state = random.choice(possibles)
story = generate_story(markov_model, start=start_state, limit=1000)
print('Story Generated Is:\n\n')
print(story)

Story Generated Is:


worked shed have to wait we may wait for the natural presence of supercooled water droplets from his face and arms and legs tingled i was first switched on was the fact that the withits the slashers but we discover just what all this up actually is another wanted their polynesian fertiles back seemed as broad as the question more than likely but had to be siding with the losers corner as we reach that place sends a delegation here its turned white not purple he was old but there was simultaneously trying to cultivate those guys mustve passed out so many of you two fellows in uniforms bending over the injured digit there must be brought forth my ring it glittered in the nine hells are we but its too late the fnool beamed cheerily major back in washington who had picked up the last time a couple of others in their cubicles in terror get him any closer muller warned to hell rionna some day ill pay you want me to do something for them just at second hand was red in th