<a href="https://colab.research.google.com/github/bhuvanikavijay/prodigy-infotech-task-1/blob/main/prodigy_infotech_task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The URL below is a signed link: its query string carries X-Goog-Date=20240813T061255Z
# and X-Goog-Expires=259200, so the download is expected to fail once that window passes.
DATA_SOURCE_MAPPING = 'sherlock-holmes-stories:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5637%2F8412%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240813%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240813T061255Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D035690d0d134240aa23dcec8655eb534e01aa28ecb5df1fd23d60a963da039e0cba92f2caf2cd4c9561940f244b241a44da355f28156ae0333c66a01ea8c80ba50c8fb9c7c29e50683df609a75a024d41b2058e2a7e7b9ae9b76cfb612669b08bac395cf088001129eba6cb9b59ff54f2eb23af01992a2e9a7d4a9efbdd6301cbee054b8d0cf82c2e6aed36d446b2b2cdddd083a3f9d2b31c9ab9e90463d64a9629db916b416fff71b406f0ff5c96e7d549d03ce8b09a9444ce02d3b81d015c8073286788033488ef53db28dd902b0a8b5bba9d89cb5f6d36dbb65e40be1a6b81433f934d70effe4957dc8c7ca40816c76963283687552c4e3893e2c5c1eb1e2'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a stray ':' later in the entry cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent or malformed; fall back to 0 so the
            # progress bar is simply left empty instead of crashing the download.
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; clamped in case more bytes arrive
                # than the header announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes
            # must be on disk before extraction starts.
            tfile.flush()
            # NOTE(review): extractall() trusts member paths inside the archive;
            # only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading sherlock-holmes-stories, 10414294 bytes compressed
Downloaded and uncompressed: sherlock-holmes-stories
Data source import complete.


In [None]:
!pip install nltk
import nltk
nltk.download('punkt')

import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Directory that is scanned for story text files by read_all_stories().
story_path = "/kaggle/input/sherlock-holmes-stories/sherlock/sherlock/"

def read_all_stories(story_path):
    """Collect the non-empty lines of every file found under ``story_path``.

    Each file is read line by line; lines are stripped of surrounding
    whitespace, blank lines are skipped, and reading of a file stops at the
    first line that is exactly ``----------``.

    Args:
        story_path: Directory to walk. A trailing path separator is optional.

    Returns:
        A list of stripped, non-empty lines from all files, visited in sorted
        directory/file order so repeated runs produce the same sequence.
    """
    txt = []
    for root, dirs, files in os.walk(story_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            # Join with the directory os.walk is currently visiting, so files in
            # sub-directories (and paths without a trailing slash) resolve.
            with open(os.path.join(root, file)) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        break
                    if line != '':
                        txt.append(line)
    return txt

# Load every story line once and report how many lines were collected.
stories = read_all_stories(story_path)
line_count = len(stories)
print("number of lines = ", line_count)

number of lines =  215021


In [None]:
def clean_txt(txt):
    """Turn a list of text lines into one flat list of lowercase words.

    Every line is lowercased, stripped of the punctuation matched by the
    regex below, tokenized with ``word_tokenize``, and only tokens made up
    entirely of alphabetic characters are kept.

    Args:
        txt: Iterable of text lines (strings).

    Returns:
        A single list containing the kept tokens of all lines, in order.
    """
    words = []
    for raw_line in txt:
        # NOTE(review): inside the character class, `=-\\` is parsed as a range
        # from '=' through backslash, so a literal '-' is NOT removed here.
        # Confirm whether that is intended before changing the pattern.
        stripped = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", raw_line.lower())
        words.extend(token for token in word_tokenize(stripped) if token.isalpha())
    return words

# Tokenize the whole corpus and report the resulting word count.
cleaned_stories = clean_txt(stories)
word_count = len(cleaned_stories)
print("number of words = ", word_count)

number of words =  2332247


In [None]:
def make_markov_model(cleaned_stories, n_gram=2):
    """Build an n-gram to n-gram Markov transition table from a word list.

    A state is ``n_gram`` consecutive words joined by single spaces. For each
    position ``i`` the state starting at ``i`` transitions to the state
    starting at ``i + n_gram`` (the immediately following, non-overlapping
    n-gram). Counts are then normalized per source state.

    Args:
        cleaned_stories: Sequence of words (strings).
        n_gram: Number of words per state.

    Returns:
        Dict mapping each current state to a dict of
        ``{next_state: probability}`` whose values sum to 1 for that state.
        Empty when the input is too short to contain two consecutive n-grams.
    """
    markov_model = {}
    # The last valid start index needs 2 * n_gram words after it. The previous
    # bound (len - n_gram - 1) was only right for n_gram == 2: it raised
    # IndexError for n_gram >= 3 and dropped the final transition for n_gram == 1.
    last_start = len(cleaned_stories) - 2 * n_gram
    for i in range(last_start + 1):
        curr_state = " ".join(cleaned_stories[i:i + n_gram])
        next_state = " ".join(cleaned_stories[i + n_gram:i + 2 * n_gram])
        transitions = markov_model.setdefault(curr_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1

    # calculating transition probabilities
    for transitions in markov_model.values():
        total = sum(transitions.values())
        for state, count in transitions.items():
            transitions[state] = count / total

    return markov_model

In [None]:
# Build the transition table over word pairs (n_gram=2 is the function's default).
markov_model = make_markov_model(cleaned_stories, n_gram=2)

In [None]:
# One state per distinct key of the transition table.
state_count = len(markov_model)
print("number of states = ", state_count)

number of states =  208716


In [None]:
# Inspect the outgoing transitions (and their probabilities) of one sample state.
print("All possible transitions from 'the game' state: \n")
game_transitions = markov_model['the game']
print(game_transitions)

All possible transitions from 'the game' state: 

{'in their': 0.036036036036036036, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'was afoot': 0.036036036036036036, 'your letter': 0.02702702702702703, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'was up': 0.09009009009009009, 'for the': 0.036036036036036036, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'was whist': 0.036036036036036036, 'is afoot': 0.036036036036036036, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.027027027027027

In [None]:
def generate_story(markov_model, limit=100, start='my god'):
    """Generate text by a random walk over a Markov transition table.

    Starting from ``start``, up to ``limit`` further states are drawn with
    ``random.choices``, each weighted by the probabilities stored for the
    current state. The walk stops early if it reaches a state that has no
    outgoing transitions.

    Args:
        markov_model: Dict of ``{state: {next_state: probability}}``.
        limit: Maximum number of states to append after ``start``.
        start: Initial state; must be a key of ``markov_model`` when
            ``limit`` is positive.

    Returns:
        The visited states joined by single spaces, with one trailing space.

    Raises:
        KeyError: If a walk is requested from a ``start`` state that is not
            in ``markov_model``.
    """
    if limit > 0 and start not in markov_model:
        raise KeyError(f"start state {start!r} is not in the Markov model")

    curr_state = start
    visited = [curr_state]
    for _ in range(limit):
        transitions = markov_model.get(curr_state)
        if not transitions:
            # Dead end: this state never appears as a source state, so stop
            # here instead of failing with a KeyError mid-story.
            break
        curr_state = random.choices(list(transitions.keys()),
                                    list(transitions.values()))[0]
        visited.append(curr_state)
    # Collect the pieces and join once rather than growing a string in the loop.
    return " ".join(visited) + " "

In [None]:
# Print 20 short random continuations that begin with "dear holmes".
for story_idx in range(20):
    sample = generate_story(markov_model, start="dear holmes", limit=8)
    print(f"{story_idx}. ", sample)

0.  dear holmes i have not been cleared yet sherlock holmes you compel me to say whether the present 
1.  dear holmes i thought i can not allow yourself to the fact that rodger prescott of evil memory 
2.  dear holmes i exclaimed and then put up the shutters a man of his sudden and so terrible 
3.  dear holmes i exclaimed it is perhaps the villain was softened by the womans character did you ask 
4.  dear holmes i thought as much knowledge of your stepfather why what can it mean who is cadogan 
5.  dear holmes if i had not been able to bring peace to many troubled souls i trust that 
6.  dear holmes he has broken the doctor said holmes blandly you have introduced yourselves i can not promise 
7.  dear holmes if i hadnt sworn not to marry anyone else while he lived without any open scandal 
8.  dear holmes i thought when he first met garcia but i found my plans very seriously to finding 
9.  dear holmes said i the two warders had been shot through the dark shrubbery amid the labyrinth o

In [None]:
# Print 20 short random continuations that begin with "my dear".
for story_idx in range(20):
    sample = generate_story(markov_model, start="my dear", limit=8)
    print(f"{story_idx}. ", sample)

0.  my dear fellow and must lead her a most uneasy life yet i have his letters were to 
1.  my dear watson there i think that my poor father i have explained sir eustace was a confirmed 
2.  my dear watson but it is incredible that such a situation it is most important said holmes and 
3.  my dear sir cried dr mortimer was reading had it all every man and choleric his passion is 
4.  my dear arthur i found him in deep emotion pray continue i said your telegram was soon followed 
5.  my dear sir knowing the vindictive character of a free present of interest to do so we have 
6.  my dear fellow how can you tell me any message for me to cause the frail thread to 
7.  my dear doctor said he nodding at the instant that a human countenance is capable of stopping it 
8.  my dear fellow for a german mr von bork you are a smart man and that he would 
9.  my dear watson said he unquestionably it is really mr holmes said he i shall write two letters 
10.  my dear fellow be it so chanced that some

In [None]:
# Print 20 short random continuations that begin with "i would".
for story_idx in range(20):
    sample = generate_story(markov_model, start="i would", limit=8)
    print(f"{story_idx}. ", sample)

0.  i would send you one of the few days which our start the doctor from interfering and the 
1.  i would rather die under my companions guidance we made our way into the room i had to 
2.  i would pay ten that would hardly do he cried i understand it admits that she had passed 
3.  i would hardly go out at his agony his drawn brows and the granite moulding of the inflexible 
4.  i would take this chair by the fire i ventured to say and do nothing which aroused your 
5.  i would spend my life hiking round the world in search of them i get so near him 
6.  i would have nothing further to say only to ask any questions mr holmes i never even knew 
7.  i would see her it took all his habitual coolness was in his that any other fact by 
8.  i would move said the secretary examining the house myself i was aware that you will have to 
9.  i would do justice upon him and buried his knife but the crash of the lamp was out 
10.  i would only ask a little help why said my time is seared into my m

In [None]:
# One longer sample: 100 generated states following "the case".
case_story = generate_story(markov_model, start="the case", limit=100)
print(case_story)

the case is the writing pooh pooh forgery my private room at the union house until it comes back to the time of my readers in the singular old observance called the ragged shaw now watson the fair susan who waited upon us at our hotel holmes tore it open a coat which was found in the boys round before evening to meet in lodge one of these papers they should not rain before we are making a fool of myself he gasped not at all except into the garden gate swung open and shut then came the death of her husband and her blazing eyes bounding after its victim hurl him to the rude was a small turban of the same reason no powder on her nose that proved to be kicked from here to the north of england and cut him over here and there great strips had become detached and hung like an open wound lay low in the same relative position to prove nothing i took the matter a small sliding shutter and plunging in his power i am sure that you were a few footmarks and the terra del fuegians the average height 