In [1]:
import nltk

In [2]:
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [3]:
#display the output of every expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#ignore all warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [4]:
#display all rows and columns of a dataframe instead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
#Preprocess

sentence = "The computer was born to solve problems that did not exist before"
sentence2 = "Most of the good programmers do programming not because they expect to get paid or get adulation by the public, but because it is fun to program."

In [6]:
#convert sentence to lower case

'This' == 'this'
print('AbcdEFgH'.lower())
sentence.lower()
sentence2.lower()

False

abcdefgh


'the computer was born to solve problems that did not exist before'

'most of the good programmers do programming not because they expect to get paid or get adulation by the public, but because it is fun to program.'

In [7]:
#tokenization and extraction of individual features

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
tokens
tokens2 = tokenizer.tokenize(sentence2)
tokens2

['The',
 'computer',
 'was',
 'born',
 'to',
 'solve',
 'problems',
 'that',
 'did',
 'not',
 'exist',
 'before']

['Most',
 'of',
 'the',
 'good',
 'programmers',
 'do',
 'programming',
 'not',
 'because',
 'they',
 'expect',
 'to',
 'get',
 'paid',
 'or',
 'get',
 'adulation',
 'by',
 'the',
 'public',
 'but',
 'because',
 'it',
 'is',
 'fun',
 'to',
 'program']

In [8]:
#Stopwords : Filter words to remove non useful words
filtered_words = [w for w in tokens if not w in stopwords.words('english')]
filtered_words

['The', 'computer', 'born', 'solve', 'problems', 'exist']

In [9]:
filtered_words = [w for w in tokens2 if not w in stopwords.words('english')]
filtered_words

['Most',
 'good',
 'programmers',
 'programming',
 'expect',
 'get',
 'paid',
 'get',
 'adulation',
 'public',
 'fun',
 'program']

In [10]:
def preprocess(sentence):
    """Lower-case a sentence, tokenize it and drop English stopwords.

    Parameters
    ----------
    sentence : str
        Raw input text.

    Returns
    -------
    list of str
        Lower-cased tokens (runs of word characters) in their original
        order, excluding every token present in NLTK's English stopword
        list. Punctuation is discarded by the tokenizer.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence.lower())
    # Load the stopword list once and keep it in a set: the original
    # re-evaluated stopwords.words('english') for every single token.
    stop_words = set(stopwords.words('english'))
    return [w for w in tokens if w not in stop_words]

In [11]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['computer', 'born', 'solve', 'problems', 'exist']


In [12]:
preprocess(sentence2)

['good',
 'programmers',
 'programming',
 'expect',
 'get',
 'paid',
 'get',
 'adulation',
 'public',
 'fun',
 'program']

In [13]:
#tagging

tags = nltk.pos_tag(preprocess(sentence))
print(tags)

[('computer', 'NN'), ('born', 'VBN'), ('solve', 'VB'), ('problems', 'NNS'), ('exist', 'VBP')]


In [14]:
tags = nltk.pos_tag(preprocess(sentence2))
print(tags)

[('good', 'JJ'), ('programmers', 'NNS'), ('programming', 'VBG'), ('expect', 'VBP'), ('get', 'NN'), ('paid', 'VBN'), ('get', 'VB'), ('adulation', 'JJ'), ('public', 'JJ'), ('fun', 'NN'), ('program', 'NN')]


In [15]:
#extracting only words with selected POS tags: nouns, verbs, adverbs, personal pronouns and adjectives
def extract_tagged(sentence):
    """Keep only the words whose POS tag is in the accepted tag list.

    Parameters
    ----------
    sentence : iterable of (word, tag) pairs
        POS-tagged tokens, e.g. the output of ``nltk.pos_tag``.

    Returns
    -------
    list
        The words, in input order, whose tag is one of NN, VBN, NNS, VBP,
        RB, VBZ, VBG, PRP or JJ. Any other tag (including plain 'VB') is
        dropped.
    """
    accepted_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [word for word, tag in sentence if tag in accepted_tags]

In [16]:
extract_tagged(tags)

['good',
 'programmers',
 'programming',
 'expect',
 'get',
 'paid',
 'adulation',
 'public',
 'fun',
 'program']

In [17]:
#lemmatize word
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('giving'))

foot
giving


In [18]:
#stem words
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems', 'feet', 'willing']

In [19]:
stemmer = SnowballStemmer('english')
[stemmer.stem(x) for x in words_for_stemming]

['stem', 'stem', 'stem', 'stemmer', 'stem', 'feet', 'will']

In [20]:
#putting it all together
def extract_features(text):
    """Turn raw text into a list of normalized feature words.

    The pipeline is: ``preprocess`` (lower-case, tokenize, remove
    stopwords) -> ``nltk.pos_tag`` -> ``extract_tagged`` (keep selected
    POS tags) -> Snowball stemming -> WordNet lemmatization.

    Relies on the module-level ``stemmer`` and ``lmtzr`` objects.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    list
        Stemmed-then-lemmatized words, in their original order.
    """
    tagged_tokens = nltk.pos_tag(preprocess(text))
    kept_words = extract_tagged(tagged_tokens)
    # NOTE(review): the lemmatizer is applied to already-stemmed tokens
    # (e.g. 'comput'); the order is preserved here because the trained
    # models depend on exactly these feature strings.
    return [lmtzr.lemmatize(stem) for stem in (stemmer.stem(w) for w in kept_words)]

In [21]:
sentence

'The computer was born to solve problems that did not exist before'

In [22]:
words = extract_features(sentence)
words

['comput', 'born', 'problem', 'exist']

In [23]:
sentence2

'Most of the good programmers do programming not because they expect to get paid or get adulation by the public, but because it is fun to program.'

In [24]:
words = extract_features(sentence2)
words

['good',
 'programm',
 'program',
 'expect',
 'get',
 'paid',
 'adul',
 'public',
 'fun',
 'program']

In [25]:
extract_features('Everybody should learn to program a computer, because it teaches you how to think')

['everybodi', 'learn', 'program', 'comput', 'teach', 'think']

In [26]:
#implementing bag of words
def word_feats(words):
    """Build a bag-of-words featureset from an iterable of words.

    Parameters
    ----------
    words : iterable
        Hashable feature words; duplicates collapse into one key.

    Returns
    -------
    dict
        Maps every distinct word to ``True``.
    """
    return {word: True for word in words}

In [27]:
word_feats(words)

{'good': True,
 'programm': True,
 'program': True,
 'expect': True,
 'get': True,
 'paid': True,
 'adul': True,
 'public': True,
 'fun': True}

In [28]:
#parsing the whole document
def extract_feature_from_doc(data):
    """Extract training features, the flat corpus and the answer table.

    Parameters
    ----------
    data : iterable of (text, category, answer) triples
        One entry per example utterance.

    Returns
    -------
    tuple
        ``(result, corpus, answers)`` where

        * ``result`` is a list of ``(featureset, category)`` pairs, the
          featureset being ``word_feats(extract_features(text))``;
        * ``corpus`` is a flat list of every extracted feature word, in
          document order (duplicates kept);
        * ``answers`` maps each category to its answer. When a category
          occurs more than once, the answer of its last row wins.
    """
    result = []
    corpus = []
    # The responses of the chatbot, keyed by category.
    answers = {}
    for (text, category, answer) in data:
        features = extract_features(text)
        # Extend the flat list directly; the original collected nested
        # lists and flattened them with sum(corpus, []), which copies the
        # accumulator on every addition (quadratic in corpus size).
        corpus.extend(features)
        result.append((word_feats(features), category))
        answers[category] = answer
    return (result, corpus, answers)

In [29]:
extract_feature_from_doc([['This is the input text from user', 'category', 'answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [30]:
def get_content(filename):
    """Read a pipe-delimited text file and return its three-field rows.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read. It is decoded with the platform default
        encoding, as before.

    Returns
    -------
    list of list of str
        Every row that has exactly three ``|``-separated fields; rows with
        any other field count (blank lines, malformed lines) are skipped.
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are translated by the text layer before csv sees them.
    # (The original's os.path.join(filename) with a single argument was a
    # no-op and has been dropped.)
    with open(filename, 'r', newline='') as content_file:
        rows = csv.reader(content_file, delimiter='|')
        return [row for row in rows if len(row) == 3]

In [31]:
filename= 'D:/DataScience/NLP/ABHIChatbotData.txt'
data = get_content(filename)
data

[['Hello',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey, hello',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['Good morning',
  'Morning',
  'Good Morning. I am ABHIbot. I will serve your leave enquiries.'],
 ['Good afternoon',
  'Afternoon',
  'Good afternoon, I am ABHIbot. I will serve your leave enquiries.'],
 ['Good evening',
  'Evening',
  'Good evening. I am ABHIbot. I will serve your leave enquirie

In [32]:
features_data, corpus, answers = extract_feature_from_doc(data)

In [33]:
print(features_data[50])

({'number': True, 'option': True, 'leav': True, 'taken': True}, 'Utilized-Optional-Leaves')


In [34]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hey',
 'hey',
 'hello',
 'good',
 'morn',
 'good',
 'afternoon',
 'good',
 'even',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'help',
 'want',
 'help',
 'want',
 'assist',
 'help',
 'great',
 'talk',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thank',
 'thank',
 'much',
 'mani',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'mani',
 'leav',
 'taken',
 'mani',
 'leav',
 'alreadi',
 'taken',
 'mani',
 'annual',
 'leav',
 'mani',
 'annual',
 'leav',
 'taken',
 'mani',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'count',
 'taken',
 'mani',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'taken'

In [35]:
answers

{'Greetings': 'Hello. I am ABHIbot. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am ABHIbot. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon, I am ABHIbot. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am ABHIbot. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leaves'

In [36]:
#Training a model using these features
split_ratio = 0.8

def split_dataset(data, split_ratio, seed=None):
    """Shuffle a dataset and split it into training and test parts.

    Parameters
    ----------
    data : iterable
        The samples to split. The caller's object is NOT modified; a
        shuffled copy is used instead.
    split_ratio : float
        Fraction of samples that goes into the training part; the cut
        index is ``int(len(data) * split_ratio)``.
    seed : int, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so
        the split is reproducible. When ``None`` (default) the global
        ``random`` state is used, as before.

    Returns
    -------
    tuple of (list, list)
        ``(training_part, test_part)``.
    """
    # Work on a copy: shuffling in place reordered the caller's list as a
    # hidden side effect, so earlier cells indexing into it went stale.
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [37]:
training_data, test_data = split_dataset(features_data, split_ratio)
training_data

[({'number': True, 'forward': True, 'leav': True}, 'CF'),
 ({'number': True, 'option': True, 'leav': True, 'remain': True},
  'Balance-Optional-Leaves'),
 ({'great': True}, 'Closing'),
 ({'great': True, 'talk': True}, 'Closing'),
 ({'mani': True,
   'carri': True,
   'forward': True,
   'previous': True,
   'year': True},
  'CF'),
 ({'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'mani': True, 'option': True, 'leav': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'thank': True}, 'Closing'),
 ({'annual': True, 'leav': True, 'balanc': True}, 'Balance-Annual-Leaves'),
 ({'mani': True, 'option': True, 'leav': True}, 'Balance-Optional-Leaves'),
 ({'type': True, 'leav': True}, 'Leaves-Type'),
 ({'mani': True, 'option': True, 'leav': True, 'i_hav': True},
  'Balance-Optional-Leaves'),
 ({'number': True,
   'option': True,
   'leav': True,
   'alreadi': True,
   'taken': True},
  'Utilized-Optional-Leaves'),
 ({'good': True, 'afternoon': True}, 'Afternoon'),
 ({'annual': 

In [38]:
#save the data
np.save('training_data', training_data)
np.save('test_data', test_data)

In [39]:
#classification using Decision tree
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy', allow_pickle=True)

In [43]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision-tree classifier and report its accuracy.

    Parameters
    ----------
    training_data : sequence of (featureset, label) pairs
        Samples used to fit the classifier.
    test_data : sequence of (featureset, label) pairs
        Held-out samples used only for evaluation.

    Returns
    -------
    tuple
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``. Both accuracies are also printed.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=0.6,
        support_cutoff=6,
    )

    train_acc = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy:', train_acc)

    test_acc = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy :', test_acc)

    return tree, type(tree).__name__, test_acc, train_acc

In [44]:
dtclassifier, classifer_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy: 0.9191919191919192
test set accuracy : 0.72


In [47]:
def train_using_naiveBayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and report its accuracy.

    Parameters
    ----------
    training_data : sequence of (featureset, label) pairs
        Samples used to fit the classifier.
    test_data : sequence of (featureset, label) pairs
        Held-out samples used only for evaluation.

    Returns
    -------
    tuple
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``. Both accuracies are also printed.
    """
    bayes = nltk.classify.NaiveBayesClassifier.train(training_data)

    train_acc = nltk.classify.accuracy(bayes, training_data)
    print('training set accuracy:', train_acc)

    test_acc = nltk.classify.accuracy(bayes, test_data)
    print('test set accuracy :', test_acc)

    return bayes, type(bayes).__name__, test_acc, train_acc

In [48]:
classifier, classifer_name, test_set_accuracy, training_set_accuracy = train_using_naiveBayes(training_data, test_data)

training set accuracy: 0.9090909090909091
test set accuracy : 0.64


In [49]:
print(len(classifier.most_informative_features()))

68


In [50]:
classifier.show_most_informative_features()

Most Informative Features
                    leav = None           Closin : Balanc =     12.4 : 1.0
                   taken = None           Balanc : Utiliz =      5.1 : 1.0
                    mani = True           Defaul : CF     =      4.7 : 1.0
                 alreadi = True           Defaul : Utiliz =      4.4 : 1.0
                    help = True             Help : Closin =      3.5 : 1.0
                   count = True           Utiliz : CF     =      3.2 : 1.0
                  remain = None           Utiliz : Balanc =      3.1 : 1.0
                   carri = None           Utiliz : CF     =      3.0 : 1.0
                   thank = None           Utiliz : Closin =      2.7 : 1.0
                    help = None           Utiliz : No-Hel =      2.6 : 1.0


In [51]:
classifier.classify(({'mani': True, 'option': True, 'leav': True}))

'Utilized-Optional-Leaves'

In [52]:
extract_features('hello')

['hello']

In [53]:
word_feats(extract_features('hello'))

{'hello': True}

In [55]:
input_sentence = 'how many balanced leaves do I have?'
classifier.classify(word_feats(extract_features(input_sentence)))

'Utilized-Optional-Leaves'

In [56]:
def reply(input_sentence):
    """Return the chatbot's canned answer for a user sentence.

    The sentence is converted to a featureset with ``extract_features``
    and ``word_feats``, classified by the module-level ``dtclassifier``,
    and the predicted category is looked up in the module-level
    ``answers`` dict.

    Parameters
    ----------
    input_sentence : str
        Raw text typed by the user.

    Returns
    -------
    The entry of ``answers`` stored under the predicted category.
    """
    featureset = word_feats(extract_features(input_sentence))
    predicted_category = dtclassifier.classify(featureset)
    return answers[predicted_category]

In [57]:
reply('hello')

'Hello. I am ABHIbot. I will serve your leave enquiries.'

In [60]:
reply('How many annual leaves do I have left?')

'You have 25 annual leaves remaining.'

In [61]:
reply('how many leaves have I taken')

'You have used 12 annual leaves.'

In [62]:
reply('how many leaves I taken')

'You have used 12 annual leaves.'