In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

In [None]:
# Reading every Sherlock Holmes adventure!

In [58]:
import os  # Import the os module to work with file paths

# Directory path where Sherlock Holmes stories are stored
story_path = "/home/arti/Downloads/archive/sherlock/sherlock"

# Function to read all Sherlock Holmes stories from the given directory
def read_all_stories(story_path):
    txt = []  # Initialize an empty list to store lines from the stories
    try:
        # Walk through the directory and its subdirectories
        for _, _, files in os.walk(story_path):
            for file in files:
                # Open each file in the directory
                with open(os.path.join(story_path, file), 'r') as f:
                    # Read each line in the file
                    for line in f:
                        line = line.strip()  # Remove leading and trailing whitespace
                        # Stop reading if the line is "----------"
                        if line == '----------':
                            break
                        # Add non-empty lines to the list
                        if line != '':
                            txt.append(line)
    except Exception as e:
        print("Error:", e)  # Print any errors that occur during file reading
    return txt  # Return the list of lines containing all the stories

# Call the function to read all stories and store them in the 'stories' variable
stories = read_all_stories(story_path)
#print(stories)
# Print the total number of lines collected
print("Number of lines =", len(stories))


Number of lines = 215021


In [None]:
#Cleaning the text

In [57]:
import re  # Import the regular expression module
from nltk.tokenize import word_tokenize  # Import word_tokenize from NLTK

# Function to clean text data
def clean_txt(txt):
    cleaned_txt = []  # Initialize an empty list to store cleaned words
    for line in txt:
        line = line.lower()  # Convert the line to lowercase
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)  # Remove punctuation
        tokens = word_tokenize(line)  # Tokenize the line into words
        words = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
        cleaned_txt += words  # Add the cleaned words to the list
    return cleaned_txt  # Return the list of cleaned words

# Clean the stories
cleaned_stories = clean_txt(stories)

# Print the total number of words in the cleaned stories
print("Number of words =", len(cleaned_stories))
#print(cleaned_stories)

Number of words = 2332247


In [None]:
## Creating the Markov Model¶

In [59]:
def make_markov_model(cleaned_stories, n_gram=2):
    # Initialize an empty dictionary to store the Markov model
    markov_model = {}
    
    # Iterate over the cleaned words, excluding the last n_gram + 1 words
    for i in range(len(cleaned_stories) - n_gram):
        # Initialize empty strings for the current state and next state
        curr_state, next_state = "", ""
        
        # Build the current state and next state by concatenating words
        for j in range(n_gram):
            curr_state += cleaned_stories[i + j] + " "
            next_state += cleaned_stories[i + j + 1] + " "
        
        # Remove trailing whitespace
        curr_state = curr_state.strip()
        next_state = next_state.strip()
        
        # If the current state is not in the Markov model, add it
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            # If the current state exists, update transition counts
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # Calculate transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count / total
        
    return markov_model


In [60]:
markov_model = make_markov_model(cleaned_stories)

In [61]:
print("number of states = ", len(markov_model.keys()))

number of states =  208716


In [62]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])


All possible transitions from 'the game' state: 

{'game would': 0.036036036036036036, 'game is': 0.1891891891891892, 'game was': 0.21621621621621623, 'game in': 0.07207207207207207, 'game the': 0.036036036036036036, 'game for': 0.0990990990990991, 'game may': 0.02702702702702703, 'game now': 0.05405405405405406, 'game my': 0.05405405405405406, 'game at': 0.02702702702702703, 'game mr': 0.02702702702702703, 'game ay': 0.02702702702702703, 'game fairly': 0.02702702702702703, 'game worth': 0.02702702702702703, 'game you': 0.02702702702702703, 'game i': 0.02702702702702703, 'game your': 0.02702702702702703}


In [64]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story


In [71]:
for i in range(10):
    print(str(i)+". ", generate_story(markov_model, start="dear holmes", limit=8))


0.  dear holmes holmes my my word word of of what what you you are are a a criminal 
1.  dear holmes holmes what what is is the the slackest slackest time time in in the the mud 
2.  dear holmes holmes you you have have really really some some huge huge bat bat glued glued against 
3.  dear holmes holmes if if your your conscience conscience will will be be arrested arrested and and tried 
4.  dear holmes holmes you you had had my my attention attention was was given given to to man 
5.  dear holmes holmes if if they they were were allowed allowed to to go go about about alone 
6.  dear holmes holmes said said i i surely surely you you dont dont associate associate his his past 
7.  dear holmes holmes that that i i could could not not sleep sleep for for excitement excitement and 
8.  dear holmes holmes i i really really have have done done me me the the whole whole matter 
9.  dear holmes holmes i i have have heard heard of of any any kind kind will will receive 


In [73]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="dear wife", limit=8))


0.  dear wife wife knew knew that that the the lady lady to to london london he he would 
1.  dear wife wife died died young young she she is is dead dead he he could could no 
2.  dear wife wife died died i i felt felt that that there there is is something something on 
3.  dear wife wife knew knew no no sir sir he he does does it it once once more 
4.  dear wife wife knew knew that that so so far far we we are are egria egria it 
5.  dear wife wife died died i i saved saved her her no no doubt doubt its its value 
6.  dear wife wife died died i i saved saved them them trouble trouble said said holmes holmes carelessly 
7.  dear wife wife died died i i felt felt the the room room below below it it was 
8.  dear wife wife knew knew that that she she heard heard him him say say and and do 
9.  dear wife wife knew knew no no doubt doubt said said i i oh oh i i am 
10.  dear wife wife knew knew that that it it was was the the fair fair sex sex is 
11.  dear wife wife knew knew that that n

In [74]:
for i in range(10):
    print(str(i)+". ", generate_story(markov_model, start="i would", limit=8))


0.  i would would do do nothing nothing and and no no man man upon upon the the shelf 
1.  i would would go go round round the the table table in in front front of of him 
2.  i would would send send you you word word your your inquiries inquiries have have run run a 
3.  i would would swear swear to to you you much much his his only only hope hope of 
4.  i would would there there be be in in time time for for us us to to something 
5.  i would would rather rather have have tobys tobys help help than than that that of of her 
6.  i would would test test it it to to your your friend friend mr mr barker barker arrived 
7.  i would would have have come come as as to to any any very very pressing pressing which 
8.  i would would only only change change was was made made by by the the way way excellent 
9.  i would would ask ask me me to to leave leave him him harmless harmless for for the 


In [77]:
print(generate_story(markov_model, start="the case", limit=50))


the case case before before you you laid laid him him down down to to me me on on your your side side will will not not figure figure in in a a pitiable pitiable state state of of mind mind keen keen in in his his eyes eyes fixed fixed on on my my threshold threshold his his appearance appearance marred marred only only by by the the coroners coroners inquiry inquiry is is still still within within then then i i take take it it as as a a workman workman looking looking for for a a round round table 
