In [1]:
import nltk
import string
import random
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [60]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        The page text with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a
        single space.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status (4xx/5xx).
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page
    res.raise_for_status()

    # Parse the HTML body of the response
    soup = BeautifulSoup(res.text, "html5lib")

    # Drop elements whose content is not readable page text
    for tag in soup(["script", "style", "aside"]):
        tag.extract()

    # Split the text on runs of newlines/tabs and re-join with single spaces
    return " ".join(re.split(r"[\n\t]+", soup.get_text()))

In [None]:
Beautiful Soup is a Python library for web scraping: it parses HTML and XML files
and pulls the data out of them.


In [61]:
# Paste the URL of the page whose text should be extracted
data = url_to_string("https://en.wikipedia.org/wiki/Narendra_Modi")

In [62]:
# Display the scraped page text returned by url_to_string
data

' Harry Potter (film series) - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate Contribute HelpLearn to editCommunity portalRecent changesUpload file Languages Language links are at the top of the page across from the title. Search Search Create accountLog in Personal tools  Create account Log in Pages for logged out editors learn more ContributionsTalk Contents move to sidebar hide (Top) 1Origins Toggle Origins subsection 1.1Casting the roles of Harry, Ron, and Hermione 2Production Toggle Production subsection 2.1Directors 2.2Scripts 2.3Cast and crew 2.4Set design 2.5Cinematography 2.6Editing 2.7Music 2.8Visual effects 2.9Final filming 3Films Toggle Films subsection 3.1Harry Potter and the Philosopher\'s Stone (2001) 3.2Harry Potter and the Chamber of Secrets (2002) 3.3Harry Potter and the Prisoner of Azkaban (2004) 3.4Harry Potter and the Goblet of Fire (2005) 3.5Harry Potter and

In [5]:
# Download the NLTK resources used below (the tokenizers and the lemmatizer)
nltk.download("punkt")   # Punkt tokenizer models, needed by sent_tokenize / word_tokenize
nltk.download("wordnet")  # WordNet dictionary, needed by WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Text Pre-Processing

In [63]:
# Converting data into lowercase; the chat loop lowercases user input too,
# so queries and page text are compared in the same case
data = data.lower()
data

In [65]:
# Tokenize the page text twice:
#   sentence -> list of sentences (the corpus that response() searches)
#   word     -> list of individual word tokens
sentence = nltk.sent_tokenize(data)
word = nltk.word_tokenize(data)

In [66]:
# Preview the first ten sentences
sentence[:10]

[' harry potter (film series) - wikipedia jump to content main menu main menu move to sidebar hide navigation main pagecontentscurrent eventsrandom articleabout wikipediacontact usdonate contribute helplearn to editcommunity portalrecent changesupload file languages language links are at the top of the page across from the title.',
 "search search create accountlog in personal tools  create account log in pages for logged out editors learn more contributionstalk contents move to sidebar hide (top) 1origins toggle origins subsection 1.1casting the roles of harry, ron, and hermione 2production toggle production subsection 2.1directors 2.2scripts 2.3cast and crew 2.4set design 2.5cinematography 2.6editing 2.7music 2.8visual effects 2.9final filming 3films toggle films subsection 3.1harry potter and the philosopher's stone (2001) 3.2harry potter and the chamber of secrets (2002) 3.3harry potter and the prisoner of azkaban (2004) 3.4harry potter and the goblet of fire (2005) 3.5harry potter

In [67]:
# Preview the first ten word tokens
word[:10]

['harry', 'potter', '(', 'film', 'series', ')', '-', 'wikipedia', 'jump', 'to']

In [68]:
# Lemmatizing: create one shared WordNetLemmatizer instance;
# show_lemmas() calls its lemmatize() method on every token
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [69]:
def show_lemmas(tokens):
    """Return a list with the lemma of every token in *tokens*.

    Each token is passed to the module-level ``lemma`` lemmatizer; the
    order of the input is preserved.
    """
    lemmas = []
    for tok in tokens:
        lemmas.append(lemma.lemmatize(tok))
    return lemmas

In [70]:
# Translation table for str.translate(): maps the code point of every
# ASCII punctuation character to None, which deletes that character
remove_punct = {ord(char): None for char in string.punctuation}

In [71]:
# Normalizing
def normalize(text):
    """Turn raw *text* into a list of lemmatized tokens.

    Steps, in order: lowercase the text, delete punctuation via the
    ``remove_punct`` table, split into word tokens, lemmatize each token.
    """
    cleaned = text.lower().translate(remove_punct)
    tokens = nltk.word_tokenize(cleaned)
    return show_lemmas(tokens)

"""
By using the above function, you can process text data by first converting it to lowercase and
removing punctuation, then tokenizing it into words, and finally lemmatizing each word to obtain its base form

"""

# Defining the Greetings

In [73]:
# Inputs that count as a greeting (compared against lowercased user text)
GREET_INPUTS = (
    "hello",
    "hi",
    "greetings",
    "sup",
    "what's up",
    "hey",
)

# Replies the bot picks from at random when it is greeted
GREET_RESPONSES = [
    "hi",
    "hey",
    "hi there",
    "hi,ask me what you want to know",
    "hello",
    "I am glad! You are talking to me",
]

In [74]:
def greet(sentence):
    """Return a random greeting reply if *sentence* contains a greeting.

    The text is lowercased and split on whitespace, and punctuation is
    stripped from the edges of every word so that inputs such as "hi!" or
    "hello," are recognised. Matching is done on whole words/phrases, which
    also lets multi-word entries of ``GREET_INPUTS`` (e.g. "what's up")
    match; a plain word-by-word comparison could never match those.

    Parameters
    ----------
    sentence : str
        The user's message.

    Returns
    -------
    str or None
        A random element of ``GREET_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    words = [tok.strip(string.punctuation) for tok in sentence.lower().split()]
    # Pad with spaces so a greeting only matches on word boundaries
    # ("hi" must not match inside "this" or "high")
    padded = " " + " ".join(words) + " "

    for greeting in GREET_INPUTS:
        if " " + greeting + " " in padded:
            return random.choice(GREET_RESPONSES)
    return None

# Defining Response Generation

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

TfidfVectorizer converts a collection of raw documents (sentences or paragraphs) into a numerical feature matrix,
where each row represents a document, and each column represents a term from the vocabulary, weighted by its TF-IDF score.

In [77]:
def response(user_response):
    """Return the corpus sentence most similar to *user_response*.

    The sentences in the module-level ``sentence`` list plus the query are
    turned into TF-IDF vectors, and the query vector is compared with every
    other row by cosine similarity.

    Both calling conventions are supported:

    * the caller has already appended the query to ``sentence`` (the query
      is then the last element), or
    * the caller passes the query only; ``sentence`` is left untouched.

    Parameters
    ----------
    user_response : str
        The user's query.

    Returns
    -------
    str
        The best matching sentence, or the fallback message
        "I am sorry! I don't understand you" when nothing in the corpus
        shares a term with the query (or the corpus is empty).
    """
    # Use the query that was passed in; do not duplicate it if the caller
    # already appended it to the corpus.
    if sentence and sentence[-1] == user_response:
        corpus = sentence
    else:
        corpus = sentence + [user_response]

    # With no sentence besides the query there is nothing to match against
    if len(corpus) < 2:
        return "I am sorry! I don't understand you"

    TfidfVec = TfidfVectorizer(tokenizer=normalize, stop_words="english")
    tfidf = TfidfVec.fit_transform(corpus)

    # Compare the query (last row) with every other row. Leaving the query
    # row out means the top score is the real best match, rather than
    # assuming the query itself always ranks first.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    best = scores.argmax()

    if scores[best] == 0:   # if 0 no similarity
        return "I am sorry! I don't understand you"
    return corpus[best]

# Defining Conversation Start and End 

In [78]:
# Interactive chat loop: read a line, answer it, repeat until the user
# types "bye", "thanks" or "thank you".
flag = True
print("BOT: I'm ChatBot. Ask me about India's PM. If you want to exit any time,just type Bye!")
while flag:
    # Strip surrounding whitespace so e.g. "bye " still ends the chat
    user_response = input().strip().lower()

    if user_response == "bye":
        flag = False
        print("ChatBot: Goodbye! Take care <3 ")
    elif user_response in ("thanks", "thank you"):
        flag = False
        print("ChatBot: You are welcome..")
    else:
        # Call greet() once: it picks a random reply on every call
        greeting = greet(user_response)
        if greeting is not None:
            print("ChatBot: " + greeting)
        else:
            # response() expects the query to be the last corpus entry
            sentence.append(user_response)
            try:
                # NOTE(review): `word` / `final_words` are not read anywhere in
                # the visible notebook; kept so module-level state is unchanged.
                word = word + nltk.word_tokenize(user_response)
                final_words = list(set(word))
                print("BOT: ", end="")
                print(response(user_response))
            finally:
                # pop() removes exactly the entry appended above, even if an
                # error occurred; remove() would delete the first equal
                # element, which could be a genuine corpus sentence.
                sentence.pop()

BOT: My name is ChatBot.I will tell about India. if you want to exit any time,just type Bye!
hi
ChatBot: hello
harry
BOT: ^ "harry potter".
played by
BOT: "[10] radcliffe had already established himself as an actor in the 1999 bbc television production of david copperfield in which he played the title role's childhood years.
voldemort
BOT: harry becomes aware that voldemort is after a prophecy which reveals: "neither can live while the other survives".
sirius black
BOT: professor r. j. lupin joins the staff as defence against the dark arts teacher, while convicted murderer sirius black escapes from azkaban.
lumos
BOT: I am sorry! I don't understand you
part 2
BOT: retrieved 2 march 2011.
hagrid
BOT: at the age of eleven, half-giant rubeus hagrid informs him that he is actually a wizard and that his parents were murdered by an evil wizard named lord voldemort.
deathly hallows
BOT: ^ "harry potter and the deathly hallows: part i (2010)".
hermoine
BOT: I am sorry! I don't understand you
r

bye
ChatBot: Goodbye! Take care <3 
