In [0]:
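#Description : This is a 'self learning' chatbot that answers questions about chronic kidney disease using sentences taken from a Mayo Clinic article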

In [0]:
#Install the package nltk
pip install nltk

In [0]:
#Install the package newspaper3k
pip install newspaper3k

In [0]:
#Import libraries
from newspaper import Article
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np
import warnings

In [0]:
#Ignoring warning messages
warnings.filterwarnings('ignore')

In [0]:
#Download packages from NLTK
nltk.download('punkt', quiet=True)
#Recent NLTK releases load the Punkt tokenizer models from 'punkt_tab' instead of 'punkt';
#without it sent_tokenize/word_tokenize raise LookupError on those versions
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [0]:
#Fetch the web article whose text the chatbot will answer from
article = Article(
    'https://www.mayoclinic.org/diseases-conditions/chronic-kidney-disease/symptoms-causes/syc-20354521'
)
#Retrieve the page, then extract its content
article.download()
article.parse()
#newspaper's NLP step; NOTE(review): its results are not read by any cell shown here
article.nlp()
#Keep the plain text of the article as the corpus
downloaded_article = article.text

#Print corpus/text (the whole article is printed here)
print(downloaded_article)

In [0]:
#Tokenization
#Split the article into a list of sentence strings
sentences = nltk.sent_tokenize(downloaded_article)
#Keep the full article under the shorter name `text`, which later cells read
text = downloaded_article

#Show the list of sentences
print(sentences)

In [0]:
#Build a translation table for str.translate:
#each punctuation character's code point (via ord) maps to None, which deletes that character
remove_punc_dict = {ord(symbol): None for symbol in string.punctuation}

#Show the punctuation characters that will be stripped
print(string.punctuation)

#Show the resulting code point -> None table
print(remove_punc_dict)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [0]:
#Normalise a piece of text into a list of lower-case word tokens with punctuation removed
def LimitNormalize(text):
  """Lower-case `text`, delete punctuation, and split it into word tokens.

  Punctuation is removed with the module-level `remove_punc_dict` translation
  table before the text is passed to `nltk.word_tokenize`. No lemmatization
  or stemming is applied.

  Args:
    text: the string to normalise.

  Returns:
    The list of tokens produced by `nltk.word_tokenize`.
  """
  lowered = text.lower()
  without_punctuation = lowered.translate(remove_punc_dict)
  return nltk.word_tokenize(without_punctuation)

#Show the tokens produced for the whole article
print(LimitNormalize(text))

['overview', 'chronic', 'kidney', 'disease', 'also', 'called', 'chronic', 'kidney', 'failure', 'describes', 'the', 'gradual', 'loss', 'of', 'kidney', 'function', 'your', 'kidneys', 'filter', 'wastes', 'and', 'excess', 'fluids', 'from', 'your', 'blood', 'which', 'are', 'then', 'excreted', 'in', 'your', 'urine', 'when', 'chronic', 'kidney', 'disease', 'reaches', 'an', 'advanced', 'stage', 'dangerous', 'levels', 'of', 'fluid', 'electrolytes', 'and', 'wastes', 'can', 'build', 'up', 'in', 'your', 'body', 'in', 'the', 'early', 'stages', 'of', 'chronic', 'kidney', 'disease', 'you', 'may', 'have', 'few', 'signs', 'or', 'symptoms', 'chronic', 'kidney', 'disease', 'may', 'not', 'become', 'apparent', 'until', 'your', 'kidney', 'function', 'is', 'significantly', 'impaired', 'treatment', 'for', 'chronic', 'kidney', 'disease', 'focuses', 'on', 'slowing', 'the', 'progression', 'of', 'the', 'kidney', 'damage', 'usually', 'by', 'controlling', 'the', 'underlying', 'cause', 'chronic', 'kidney', 'disease'

In [0]:
#Keyword Matching

#Greeting words recognised in the user's input (compared in lower case)
GREETING_INPUTS = ["hi","hii","hello","hola", "greetings","wassup","hey"]

#Replies the bot chooses from when it has been greeted
GREETING_RESPONSES = ["howdy","hi","hey there","hey", "what's good","hello"]

#Function to return a random greeting response to users greeting
def greeting(sentence):
  """Return a random greeting if `sentence` contains a greeting word.

  The sentence is split on whitespace; each word is lower-cased and has
  leading/trailing punctuation stripped before being looked up in
  GREETING_INPUTS, so inputs such as "Hello!" or "hi," are recognised.

  Args:
    sentence: the user's input string.

  Returns:
    A randomly chosen entry of GREETING_RESPONSES when a greeting word is
    found, otherwise None.
  """
  for word in sentence.split():
    #Strip surrounding punctuation so "hey," or "hello!" still match
    if word.lower().strip(string.punctuation) in GREETING_INPUTS:
      return random.choice(GREETING_RESPONSES)
  #No greeting word found
  return None


In [0]:
#Generate the response of chatbot
def response(user_response):
  """Return the article sentence most similar to the user's query.

  The query is lower-cased and vectorised together with the module-level
  `sentences` list using TF-IDF (tokenised by `LimitNormalize`, English stop
  words dropped). The query vector is then compared with every article
  sentence by cosine similarity.

  The global `sentences` list is not modified: the query is added to a local
  copy only, so an exception during vectorisation cannot leave the query
  behind in the corpus.

  Args:
    user_response: the user's query as a string.

  Returns:
    The sentence from `sentences` with the highest similarity score, or the
    apology text when `sentences` is empty or the best score is 0.
  """
  apology = " I apologise, I don't understand"

  #Making the response lower case
  user_response = user_response.lower()

  #Nothing to compare against (e.g. the article produced no sentences)
  if not sentences:
    return apology

  #Work on a copy with the query as the last document
  corpus = sentences + [user_response]

  #Create a TfidfVectorizer object  #Term frequency-inverse document frequency
  vectorizer = TfidfVectorizer(tokenizer = LimitNormalize, stop_words='english')

  #Convert the text to a matrix of TF - IDF features (last row is the query)
  tfidf = vectorizer.fit_transform(corpus)

  #Similarity of the query to every article sentence; the query's own row is
  #excluded so it can never be selected as its own best match
  scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()

  #Index of the most similar sentence and its score
  idx = scores.argmax()
  score = scores[idx]

  #A score of 0 means no sentence shares any weighted term with the query
  if (score == 0):
    return apology

  return sentences[idx]


In [0]:
#Lets start chatting with our chatbot
#The loop reads one line per turn and ends on 'bye' or on a thank-you message
flag = True
print("DOCBot: I am Doctor Bot or DOCBot for short. I will answer your query for chronic kidney disease. If you want to exit please type Bye! ") 

while flag:
  #Trim surrounding whitespace so inputs such as "bye " or " thanks" are still recognised
  user_response = input().strip().lower()

  if user_response == 'bye':
    flag = False
    print("DOCBot : Chat with you Later!")
  elif user_response in ('thanks', 'thank you', 'ty'):
    flag = False
    print("DOCBot : You are welcome!")
  else:
    #Evaluate greeting() once and reuse the reply instead of calling it again to print
    greeting_reply = greeting(user_response)
    if greeting_reply is not None:
      print("DOCBot : "+ greeting_reply)
    else:
      print("DOCBot : "+ response(user_response))