In [1]:
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Fetch the NLTK data packages up front; later cells call
# nltk.sent_tokenize / nltk.word_tokenize and nltk.stem.WordNetLemmatizer.
# The last call is the cell's final expression, so its return value is
# what the notebook displays.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# the tokenizers -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VCM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VCM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\VCM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Fetching data from webpage
# The response object is used as a context manager so the underlying HTTP
# connection is closed as soon as the body has been read (the original left
# it open). `link` ends up holding the raw page bytes, which the next cell
# hands to BeautifulSoup.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)') as response:
    link = response.read()

In [4]:
# Parse the downloaded page and keep only the text of its <p> elements,
# glued together in document order with no extra separator.
data = bs.BeautifulSoup(link, 'lxml')
data_paragraphs = data.find_all('p')
data_text = ''.join(paragraph.text for paragraph in data_paragraphs)

In [5]:
data_text



In [6]:
# Normalise the scraped text in three steps: lower-case it, replace
# bracketed digit markers such as "[12]" (and a bare "[]") with a space,
# then squeeze every run of whitespace down to a single space.
lowered_text = data_text.lower()
text_without_markers = re.sub(r'\[[0-9]*\]', ' ', lowered_text)
data_text = re.sub(r'\s+', ' ', text_without_markers)

In [7]:
# Tokenization
# `sen` is the sentence list the chatbot searches for answers; `words` is
# the flat word-token list of the same text.
sen, words = nltk.sent_tokenize(data_text), nltk.word_tokenize(data_text)

In [8]:
sen

[' python is a high-level, general-purpose programming language.',
 'its design philosophy emphasizes code readability with the use of significant indentation.',
 'python is dynamically typed and garbage-collected.',
 'it supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.',
 'it is often described as a "batteries included" language due to its comprehensive standard library.',
 'guido van rossum began working on python in the late 1980s as a successor to the abc programming language and first released it in 1991 as python 0.9.0. python 2.0 was released in 2000. python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions.',
 'python 2.7.18, released in 2020, was the last release of python 2. python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine learning community.',
 'python was invented in the

In [9]:
words

['python',
 'is',
 'a',
 'high-level',
 ',',
 'general-purpose',
 'programming',
 'language',
 '.',
 'its',
 'design',
 'philosophy',
 'emphasizes',
 'code',
 'readability',
 'with',
 'the',
 'use',
 'of',
 'significant',
 'indentation',
 '.',
 'python',
 'is',
 'dynamically',
 'typed',
 'and',
 'garbage-collected',
 '.',
 'it',
 'supports',
 'multiple',
 'programming',
 'paradigms',
 ',',
 'including',
 'structured',
 '(',
 'particularly',
 'procedural',
 ')',
 ',',
 'object-oriented',
 'and',
 'functional',
 'programming',
 '.',
 'it',
 'is',
 'often',
 'described',
 'as',
 'a',
 '``',
 'batteries',
 'included',
 "''",
 'language',
 'due',
 'to',
 'its',
 'comprehensive',
 'standard',
 'library',
 '.',
 'guido',
 'van',
 'rossum',
 'began',
 'working',
 'on',
 'python',
 'in',
 'the',
 'late',
 '1980s',
 'as',
 'a',
 'successor',
 'to',
 'the',
 'abc',
 'programming',
 'language',
 'and',
 'first',
 'released',
 'it',
 'in',
 '1991',
 'as',
 'python',
 '0.9.0.',
 'python',
 '2.0',
 '

In [10]:
# Lemmatization
# One shared WordNetLemmatizer instance; perform_lemmatization() calls its
# .lemmatize() on every token.
wnlem = nltk.stem.WordNetLemmatizer()

In [11]:
def perform_lemmatization(tokens):
    """Lemmatize each token with the shared ``wnlem`` lemmatizer.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list with ``wnlem.lemmatize(token)`` for every token, in the
        same order as the input.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(wnlem.lemmatize(token))
    return lemmas

# Translation table for str.translate(): every ASCII punctuation code point
# maps to None, i.e. the character is deleted.
pr = str.maketrans('', '', string.punctuation)

def get_processed_text(document):
    """Turn a raw document string into a list of lemmatized tokens.

    The text is lower-cased, stripped of the punctuation listed in ``pr``,
    split with ``nltk.word_tokenize`` and finally passed through
    ``perform_lemmatization``.

    Args:
        document: the text to process.

    Returns:
        List of lemmatized token strings.
    """
    without_punctuation = document.lower().translate(pr)
    tokens = nltk.word_tokenize(without_punctuation)
    return perform_lemmatization(tokens)


In [12]:
# Greeting inputs and responses
greeting_inputs = ("hey", "hello", "good morning", "good evening", "morning", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "hello, how you doing", "hello", "welcome", "welcome, I am good and you"]

def generate_greeting_response(greeting):
    """Return a random greeting reply if the input contains a known greeting.

    Matching is done on whole words/phrases: the input is lower-cased,
    punctuation is removed and whitespace is collapsed, then every entry of
    ``greeting_inputs`` is looked up as a space-delimited phrase. This lets
    the multi-word entries ("good morning", "good evening") match, which a
    per-token comparison can never do, and makes "Hello!" behave like
    "hello". Every input matched by a plain per-token check still matches.

    Args:
        greeting: the user's message.

    Returns:
        A string picked at random from ``greeting_responses``, or ``None``
        when no greeting is found.
    """
    without_punctuation = greeting.lower().translate(str.maketrans('', '', string.punctuation))
    # Pad with spaces so a phrase only matches on word boundaries
    # ("hi" must not match inside "history").
    padded = ' ' + ' '.join(without_punctuation.split()) + ' '
    if any(' ' + phrase + ' ' in padded for phrase in greeting_inputs):
        return random.choice(greeting_responses)
    return None

In [13]:
# Generate response
def generate_response(user_input):
    """Return the corpus sentence most similar to ``user_input``.

    The sentences in the global ``sen`` plus the query are vectorized with
    TF-IDF (tokenized by ``get_processed_text``, English stop words removed)
    and the query is compared to every corpus sentence by cosine
    similarity. The vectorizer is refit on each call, so the query's own
    terms take part in the vocabulary and IDF weights.

    Unlike an append/pop on ``sen``, the document list is built locally, so
    the shared corpus is never modified -- not even if vectorization raises.

    Args:
        user_input: the user's message.

    Returns:
        The best-matching sentence from ``sen``, or
        "I am sorry I don't understand" when the corpus is empty or no
        sentence shares any weighted term with the query.
    """
    fallback = "I am sorry I don't understand"
    if not sen:
        # Nothing to search; avoids indexing into an empty similarity row.
        return fallback

    documents = sen + [user_input]

    # token_pattern=None: a custom tokenizer is supplied, so the default
    # regex pattern is unused; passing None avoids scikit-learn's
    # "token_pattern will not be used" warning.
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english', token_pattern=None)
    word_vectors = word_vectorizer.fit_transform(documents)

    # Score the query (last row) against the corpus rows only, so the best
    # hit is simply the arg-max instead of "second best after the query".
    similarities = cosine_similarity(word_vectors[-1], word_vectors[:-1]).ravel()
    best_index = similarities.argmax()

    if similarities[best_index] == 0:
        return fallback
    return sen[best_index]


In [None]:
# Interactive chat loop: read a line, lower-case it and dispatch.
#   'bye'                 -> say goodbye and stop
#   'thanks' / 'thankyou' -> acknowledge and stop
#   anything else         -> greeting reply if one applies, otherwise the
#                            best-matching sentence from the corpus
continue_flag = True

print("Hello I am AR810.")
while continue_flag:
    human = input().lower()
    if human == 'bye':
        continue_flag = False
        print("AR810 : Good Bye")
    elif human in ('thanks', 'thankyou'):
        continue_flag = False
        print("Most Welcome")
    else:
        greeting = generate_greeting_response(human)
        if not greeting:
            # Print the prefix first; the answer follows on the same line.
            print("AR810 : ", end="")
            print(generate_response(human))
        else:
            print("AR810 : " + greeting)

Hello I am AR810.
