## Writing my own Sherlock Holmes Story

Since his inception in the brilliantly creative mind of Sir Arthur Conan Doyle, Sherlock Holmes has truly stood the test of time: he has been written about, spoofed, and played in shows and movies by multiple actors, each with their own take on him. Since there hasn't been a new Sherlock Holmes story in a really long time, let us bring the fictional detective to life using neural networks, specifically LSTMs.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.utils import np_utils
%matplotlib inline

In [2]:
fileName="The Hound of The Baskervilles.txt"
# Read the whole novel into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open until garbage collection.
# NOTE(review): with encoding='utf-8' a leading byte-order mark is kept as the
# character '\ufeff' (it shows up in the preview of the first 20 characters).
# 'utf-8-sig' would drop it, but that changes the vocabulary size — confirm
# before switching.
with open(fileName,'r',encoding='utf-8') as textFile:
    rawText=textFile.read()
# Preview only: a lower-cased copy is displayed; rawText itself keeps its case.
rawText.lower()[:20]

'\ufeffthe hound of the ba'

In [3]:
# Vocabulary: every distinct character of the text, sorted so that the index
# assigned to each character is the same on every run.
chars=sorted(set(rawText))
# Lookup table from a character to its position in `chars`.
charToInt={ch:idx for idx,ch in enumerate(chars)}

In [4]:
# Corpus size in characters and the number of distinct characters in it.
n_chars,n_vocab=len(rawText),len(chars)
print("Total Characters:",n_chars,"\nTotal Vocab:",n_vocab)

Total Characters: 318527 
Total Vocab: 80


In [5]:
# Display the full character -> integer lookup table for inspection.
charToInt

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '4': 14,
 '5': 15,
 '6': 16,
 '7': 17,
 '8': 18,
 '9': 19,
 ':': 20,
 ';': 21,
 '?': 22,
 'A': 23,
 'B': 24,
 'C': 25,
 'D': 26,
 'E': 27,
 'F': 28,
 'G': 29,
 'H': 30,
 'I': 31,
 'J': 32,
 'K': 33,
 'L': 34,
 'M': 35,
 'N': 36,
 'O': 37,
 'P': 38,
 'Q': 39,
 'R': 40,
 'S': 41,
 'T': 42,
 'U': 43,
 'V': 44,
 'W': 45,
 'Y': 46,
 '[': 47,
 ']': 48,
 '_': 49,
 'a': 50,
 'b': 51,
 'c': 52,
 'd': 53,
 'e': 54,
 'f': 55,
 'g': 56,
 'h': 57,
 'i': 58,
 'j': 59,
 'k': 60,
 'l': 61,
 'm': 62,
 'n': 63,
 'o': 64,
 'p': 65,
 'q': 66,
 'r': 67,
 's': 68,
 't': 69,
 'u': 70,
 'v': 71,
 'w': 72,
 'x': 73,
 'y': 74,
 'z': 75,
 'à': 76,
 'é': 77,
 'ê': 78,
 '\ufeff': 79}

In [6]:
# Number of consecutive characters fed to the model as one input window.
seqLength=100

# Encode the whole text to integer codes once. Every overlapping window below is
# then a plain slice of this list, so each character is looked up in charToInt a
# single time rather than once per window it appears in.
encodedText=[charToInt[c] for c in rawText]

# X[i] is the list of seqLength codes starting at position i;
# y[i] is a one-element list holding the code of the character that follows.
X=[]
y=[]

for i in range(0,n_chars-seqLength):
    X.append(encodedText[i:i+seqLength])
    y.append([encodedText[i+seqLength]])

n_patterns=len(X)
print("Total Patterns:",n_patterns)

Total Patterns: 318427


In [7]:
# Show the first input window (integer codes) together with its target code.
X[0],y[0]

([79,
  42,
  57,
  54,
  1,
  30,
  64,
  70,
  63,
  53,
  1,
  64,
  55,
  1,
  69,
  57,
  54,
  1,
  24,
  50,
  68,
  60,
  54,
  67,
  71,
  58,
  61,
  61,
  54,
  68,
  0,
  0,
  51,
  74,
  1,
  41,
  58,
  67,
  1,
  23,
  67,
  69,
  57,
  70,
  67,
  1,
  25,
  64,
  63,
  50,
  63,
  1,
  26,
  64,
  74,
  61,
  54,
  0,
  0,
  1,
  1,
  25,
  37,
  36,
  42,
  27,
  36,
  42,
  41,
  0,
  0,
  1,
  1,
  25,
  57,
  50,
  65,
  69,
  54,
  67,
  1,
  11,
  8,
  8,
  35,
  67,
  9,
  1,
  41,
  57,
  54,
  67,
  61,
  64,
  52,
  60,
  1,
  30,
  64,
  61],
 [62])

In [9]:
# Shape the integer windows as [samples, time_steps, features] (one feature per
# time step) and scale the codes by the vocabulary size.
# NOTE(review): this cell overwrites X and y in place, so running it a second
# time without re-running the cell that builds them rescales X again.
X=np.asarray(X).reshape((n_patterns,seqLength,1))/float(n_vocab)

# One-hot encode the target codes.
# NOTE(review): no num_classes is passed, so the width is presumably inferred
# from the largest code present in y — confirm it matches n_vocab if that matters.
y=np_utils.to_categorical(y)

In [11]:
# Show the first sample again after reshaping/scaling X and one-hot encoding y.
X[0],y[0]

(array([[0.9875],
        [0.525 ],
        [0.7125],
        [0.675 ],
        [0.0125],
        [0.375 ],
        [0.8   ],
        [0.875 ],
        [0.7875],
        [0.6625],
        [0.0125],
        [0.8   ],
        [0.6875],
        [0.0125],
        [0.8625],
        [0.7125],
        [0.675 ],
        [0.0125],
        [0.3   ],
        [0.625 ],
        [0.85  ],
        [0.75  ],
        [0.675 ],
        [0.8375],
        [0.8875],
        [0.725 ],
        [0.7625],
        [0.7625],
        [0.675 ],
        [0.85  ],
        [0.    ],
        [0.    ],
        [0.6375],
        [0.925 ],
        [0.0125],
        [0.5125],
        [0.725 ],
        [0.8375],
        [0.0125],
        [0.2875],
        [0.8375],
        [0.8625],
        [0.7125],
        [0.875 ],
        [0.8375],
        [0.0125],
        [0.3125],
        [0.8   ],
        [0.7875],
        [0.625 ],
        [0.7875],
        [0.0125],
        [0.325 ],
        [0.8   ],
        [0.925 ],
        [0