In [1]:
import nltk
import string
import random
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    str
        The page text with <script>, <style> and <aside> elements removed
        and every run of newlines/tabs replaced by a single space.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, the connection fails, or the server
        answers with a 4xx/5xx status.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently using an error page as the corpus.
    res.raise_for_status()
    # Parse the HTML content of the response.
    soup = BeautifulSoup(res.text, "html5lib")
    # Drop elements whose text is not part of the readable page content.
    for tag in soup(["script", "style", "aside"]):
        tag.extract()
    # Collapse newline/tab runs so the result is a single line of text.
    return " ".join(re.split(r"[\n\t]+", soup.get_text()))

In [3]:
# Page whose text the chatbot will answer from; replace with any other URL.
page_url = "https://en.wikipedia.org/wiki/Harry_Potter"
data = url_to_string(page_url)
# data

In [4]:
# NLTK resources used below:
#   punkt / punkt_tab - models behind sent_tokenize and word_tokenize.
#       NLTK 3.8.2+ loads "punkt_tab"; older releases load "punkt".
#       Fetching both keeps the notebook runnable on either.
#   wordnet - dictionary used by WordNetLemmatizer.
for package in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(package)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Text Pre-Processing

In [5]:
# Lower-case the whole page text. This rebinds `data`, so the original-case
# text is no longer available after this cell runs.
data = data.lower()
# data

In [6]:
# One string per sentence; response() returns one of these as the answer.
sentence = nltk.sent_tokenize(data)

# One string per token of the same text.
word = nltk.word_tokenize(data)

In [7]:
# Preview the first ten sentences.
sentence[:10]

[' harry potter - wikipedia jump to content main menu main menu move to sidebar hide navigation main pagecontentscurrent eventsrandom articleabout wikipediacontact usdonate contribute helplearn to editcommunity portalrecent changesupload file languages language links are at the top of the page across from the title.',
 'search search create accountlog in personal tools  create account log in pages for logged out editors learn more contributionstalk contents move to sidebar hide (top) 1plot toggle plot subsection 1.1early years 1.2voldemort returns 2style and allusions toggle style and allusions subsection 2.1genre and style 2.2allusions 3themes 4development history toggle development history subsection 4.1publishing history 4.2translations 4.3cover art 5reception toggle reception subsection 5.1commercial success 5.2literary criticism 5.3thematic critique 5.4controversies 6legacy toggle legacy subsection 6.1influence on literature 6.2cultural impact 7awards, honours, and recognition 8ad

In [8]:
# Preview the first ten tokens.
word[:10]

['harry',
 'potter',
 '-',
 'wikipedia',
 'jump',
 'to',
 'content',
 'main',
 'menu',
 'main']

In [9]:
# Lemmatizing: create one WordNetLemmatizer instance, stored in the
# module-level name `lemma`, which show_lemmas() uses for every token.
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [10]:
def show_lemmas(tokens):
    """Return a list holding the lemma of every token, in input order."""
    lemmatized = []
    for tok in tokens:
        lemmatized.append(lemma.lemmatize(tok))
    return lemmatized

In [11]:
# Translation table for str.translate(): every character in
# string.punctuation maps to None, which deletes it from the text.
remove_punct = {ord(symbol): None for symbol in string.punctuation}

In [12]:
# Normalizing
def normalize(text):
    """Lower-case `text`, delete punctuation, tokenize it and lemmatize the tokens.

    Returns the list of lemmas produced by show_lemmas().
    """
    cleaned = text.lower().translate(remove_punct)
    tokens = nltk.word_tokenize(cleaned)
    return show_lemmas(tokens)

The function above processes text by first converting it to lowercase and removing punctuation,
then tokenizing it into words, and finally lemmatizing each word (show_lemmas) to obtain its
base form.

# Defining the Greetings

In [13]:
# Inputs that greet() treats as a greeting.
GREET_INPUTS = (
    "hello",
    "hi",
    "greetings",
    "sup",
    "what's up",
    "hey",
)

# Possible replies to a greeting; greet() picks one at random.
GREET_RESPONSES = [
    "hi",
    "hey",
    "hi there",
    "hi,ask me what you want to know",
    "hello",
    "I am glad! You are talking to me",
]

In [14]:
def greet(sentence):
    """Return a random greeting reply if `sentence` contains a greeting.

    Matching is case-insensitive and ignores punctuation attached to words,
    so "Hello!" and "hi," are recognised. Greetings are matched as whole
    words or whole phrases, which lets multi-word entries of GREET_INPUTS
    such as "what's up" match; a word-by-word lookup can never match them.

    Parameters
    ----------
    sentence : str
        The user's message.

    Returns
    -------
    str or None
        A random element of GREET_RESPONSES, or None when no greeting is found.
    """
    # Strip punctuation from the edges of each word; inner apostrophes stay.
    words = [w.strip(string.punctuation) for w in sentence.lower().split()]
    # Pad with spaces so that only whole words/phrases match:
    # "hi" must not be found inside "this".
    padded = " " + " ".join(w for w in words if w) + " "
    if any(f" {greeting} " in padded for greeting in GREET_INPUTS):
        return random.choice(GREET_RESPONSES)
    return None

# Defining Response Generation

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

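Convert the cleaned sentences into numeric vectors with TF-IDF and use cosine similarity to measure how alike two vectors are.
Cosine similarity ranges from -1 to 1 in general (1 = same direction, -1 = opposite direction); because TF-IDF weights are never negative, the scores here fall between 0 (no terms in common) and 1 (most similar).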

In [16]:
def response(user_response):
    """Return the corpus sentence most similar to `user_response`.

    The corpus is the module-level list `sentence`. Every call fits a fresh
    TF-IDF model on the corpus plus the query and ranks the corpus sentences
    by cosine similarity to the query.

    Parameters
    ----------
    user_response : str
        The user's query. The chat loop appends it to `sentence` before
        calling this function; if the last element of `sentence` already
        equals the query it is used as-is, otherwise the query is added to
        a copy. `sentence` itself is never modified here.

    Returns
    -------
    str
        The best-matching sentence, or an apology when there is no corpus
        sentence to compare with or none shares a term with the query.
    """
    no_match = "I am sorry! I don't understand you"

    # Accept both calling conventions: query already appended, or not.
    if sentence and sentence[-1] == user_response:
        corpus = sentence
    else:
        corpus = sentence + [user_response]

    # Only the query itself is present: nothing to compare it with.
    if len(corpus) < 2:
        return no_match

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words="english")
    tfidf = vectorizer.fit_transform(corpus)

    # Compare the query (last row) with the other rows only. Including the
    # query in the ranking lets it tie with a corpus sentence at similarity
    # 1.0, and the reply can then be the user's own text.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).ravel()
    best = int(scores.argmax())

    # A best score of 0 means no sentence shares a term with the query.
    if scores[best] <= 0:
        return no_match
    return corpus[best]

# Defining Conversation Start and End 

In [17]:
# Interactive chat loop: runs until the user says bye or thanks.
print("BOT: I'm ChatBot. Ask me about Harry Potter. If you want to exit any time,just type Bye!")
while True:
    user_response = input().lower().strip()
    # Ignore surrounding punctuation when checking for commands, so that
    # "Bye!" - exactly what the prompt asks for - ends the chat.
    command = user_response.strip(string.punctuation + string.whitespace)

    if command == "bye":
        print("ChatBot: Goodbye! Take care <3 ")
        break
    if command in ("thanks", "thank you"):
        print("ChatBot: You are welcome..")
        break

    # Call greet() once and reuse its result.
    greeting = greet(user_response)
    if greeting is not None:
        print("ChatBot: " + greeting)
        continue

    # response() reads the query from the end of the global `sentence` list.
    sentence.append(user_response)
    try:
        print("BOT: ", end="")
        print(response(user_response))
    finally:
        # pop() removes exactly the entry appended above, even if response()
        # raised. remove() would delete the first equal element, which could
        # be a genuine corpus sentence.
        sentence.pop()

BOT: I'm ChatBot. Ask me about Harry Potter. If you want to exit any time,just type Bye!
hey
ChatBot: hi,ask me what you want to know
hagrid
BOT: he meets a half-giant named hagrid who invites him to attend the hogwarts school of witchcraft and wizardry.
cedric
BOT: harry manages to escape, but cedric is killed and voldemort is resurrected using harry's blood.
ronald
BOT: I am sorry! I don't understand you
sorting hat
BOT: harry draws the sword of gryffindor from the sorting hat, slays the basilisk and destroys the diary.
lumos
BOT: I am sorry! I don't understand you
tom riddle
BOT: harry learns from a drunken slughorn that he used to teach tom riddle, and that voldemort divided his soul into pieces, creating a series of horcruxes.
thanks
ChatBot: You are welcome..
