In [55]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np

# Understanding Vectors and Matrices

## Vectors

Vectors are a one-dimensional array of numbers in which each number could be identified
by its respective indices.   
They are represented as:   
x=[x1  
   x2  
   x3]  

## Matrices

Matrices are an extension of arrays. They are a rectangular array of numbers wherein each
number is identified by two indices. Like vectors, matrices are also represented using
squared brackets, but matrices have both rows and columns, as shown in the following
screenshot.    
A=[x11  x12  
     x21  x22]  

The following code block will give you some perspective about building vectors and
matrices based on text data.

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
X = ("Computers can analyze text", "They do it using vectors and matrices", "Computers can process massive amounts of text data")
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(vectorizer.vocabulary_)
print(X_vec.todense())

{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
[[0 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


The following output block from the previous code block shows a matrix, wherein each
row corresponds to the document being imported in the same order and each column
corresponds to a unique token whose ordering can be obtained using
the .vocabulary_ function of the CountVectorizer class:  
Once text data is converted into a matrix, we can apply any matrix operation to it (vectormatrix multiplication, matrix-matrix multiplication, transpose, and so on).

# Exploring the Bag-of-Words architecture

### Take in a list of sentences

In [57]:
sentences = ["We are reading about Natural Language Processing Here",
            "Natural Language Processing making computers comprehend language data",
            "The field of Natural Language Processing is evolving everyday"]

### Create a Pandas Series of the object

In [58]:
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

### Data preprocessing


In [59]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [60]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [61]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

In [62]:
common_dot_words = ['U.S.', 'Mr.', 'Mrs.', 'D.C.']

In [63]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  cleaned_corpus = pd.Series()
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

### Building the vocabulary


In [64]:
set_of_words = set()
for sentence in preprocessed_corpus:
    for word in sentence.split():
        set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['field', 'language', 'read', 'evolve', 'make', 'process', 'comprehend', 'natural', 'computers', 'everyday', 'data']


### Fetching the position of each word in the vocabulary

In [65]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'field': 0, 'language': 1, 'read': 2, 'evolve': 3, 'make': 4, 'process': 5, 'comprehend': 6, 'natural': 7, 'computers': 8, 'everyday': 9, 'data': 10}


### Creating a matrix to hold the Bag of Words representation

In [66]:
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab)))

In [67]:
for i, preprocessed_sentence in enumerate(preprocessed_corpus):
    for token in preprocessed_sentence.split():   
        bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1

### Let's look at our Bag of Words representation

In [68]:
bow_matrix

array([[0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 2., 0., 0., 1., 1., 1., 1., 1., 0., 1.],
       [1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.]])

## Inference  

Taking example of column 2 in the bow_matrix, the values are 1, 2 and 1 respectively.  

Column 2 caters to index 2 corresponding to the word language.  

language occurs once, twice and again once in the the sentences 1, 2 and 3 respectively.  
  
Hope that provides you insights into how the Bag of Words model works.  

# Term Frequency-Inverse Document Frequency based Vectorizer



In [69]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


## Building a corpus of sentences


In [70]:
sentences = ["We are reading about Natural Language Processing Here",
            "Natural Language Processing making computers comprehend language data",
            "The field of Natural Language Processing is evolving everyday"]
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

## Data preprocessing pipeline

In [72]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  cleaned_corpus = pd.Series()
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

## Tf-IdfVectorizer

In [73]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)

### Let's what features were obtained and the corresponding TF-IDF matrix

In [74]:
print(vectorizer.get_feature_names_out())
print(tf_idf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)


['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']
[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)


### Changing the norm to l1, default option is l2 which was used above

Each output row will have unit norm, which can be one of  
l2: Sum of squares of vector elements is 1.  
l1: Sum of absolute values of vector elemen  

In [75]:
vectorizer_l1_norm = TfidfVectorizer(norm="l1")
tf_idf_matrix_l1_norm = vectorizer_l1_norm.fit_transform(preprocessed_corpus)
print(vectorizer_l1_norm.get_feature_names_out())
print(tf_idf_matrix_l1_norm.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_l1_norm.shape)

['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']
[[0.         0.         0.         0.         0.         0.
  0.21307663 0.         0.21307663 0.21307663 0.3607701 ]
 [0.1571718  0.1571718  0.1571718  0.         0.         0.
  0.1856564  0.1571718  0.0928282  0.0928282  0.        ]
 [0.         0.         0.         0.2095624  0.2095624  0.2095624
  0.12377093 0.         0.12377093 0.12377093 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)


## N-grams and Max features with TfidfVectorizer


In [76]:
vectorizer_n_gram_max_features = TfidfVectorizer(norm="l2", analyzer='word', ngram_range=(1,3), max_features = 6)
tf_idf_matrix_n_gram_max_features = vectorizer_n_gram_max_features.fit_transform(preprocessed_corpus)
print(vectorizer_n_gram_max_features.get_feature_names_out())
print(tf_idf_matrix_n_gram_max_features.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_n_gram_max_features.shape)


['language' 'language process' 'natural' 'natural language'
 'natural language process' 'process']
[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]
 [0.66666667 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]
 [0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]

The shape of the TF-IDF matrix is:  (3, 6)


# Cosine Similarity Calculation


In [77]:
def cosine_similarity(vector1, vector2):
    vector1 = np.array(vector1)
    vector2 = np.array(vector2)
    return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))

## CountVectorizer


In [78]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [79]:
print(vectorizer.get_feature_names_out())
print(bow_matrix.toarray())

['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']
[[0 0 0 0 0 0 1 0 1 1 1]
 [1 1 1 0 0 0 2 1 1 1 0]
 [0 0 0 1 1 1 1 0 1 1 0]]


## Cosine similarity between the document vectors built using CountVectorizer

In [80]:
for i in range(bow_matrix.shape[0]):
    for j in range(i + 1, bow_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(bow_matrix.toarray()[i], bow_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.6324555320336759
The cosine similarity between the documents  0 and 2 is:  0.6123724356957946
The cosine similarity between the documents  1 and 2 is:  0.5163977794943223


## TfidfVectorizer

In [81]:
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)
print(vectorizer.get_feature_names_out())
print(tf_idf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)

['comprehend' 'computers' 'data' 'everyday' 'evolve' 'field' 'language'
 'make' 'natural' 'process' 'read']
[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)


## Cosine similarity between the document vectors built using TfidfVectorizer


In [82]:
for i in range(tf_idf_matrix.shape[0]):
    for j in range(i + 1, tf_idf_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(tf_idf_matrix.toarray()[i], tf_idf_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.39514115766749125
The cosine similarity between the documents  0 and 2 is:  0.36365455673761865
The cosine similarity between the documents  1 and 2 is:  0.2810071916500233


# One Hot Vectors


# We take only 1 sentence as input here


In [83]:
sentence = ["We are reading about Natural Language Processing Here"]
corpus = pd.Series(sentence)
corpus

0    We are reading about Natural Language Processi...
dtype: object

In [84]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,
                                lemmatization = True, remove_stopwords = True)
preprocessed_corpus

  cleaned_corpus = pd.Series()
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))


['read natural language process']

## Building the vocabulary

In [85]:
set_of_words = set()
for word in preprocessed_corpus[0].split():
    set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['read', 'process', 'natural', 'language']


## Fetching the position of each word in the vocabulary


In [87]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'read': 0, 'process': 1, 'natural': 2, 'language': 3}


## Instantiating the one hot matrix
### Note here every row in the matrix corresponds to the One Hot vector for an individual term

In [88]:
one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()), len(vocab)))
one_hot_matrix.shape

(4, 4)

## Building One Hot Vectors


In [90]:
for i, token in enumerate(preprocessed_corpus[0].split()):
    one_hot_matrix[i][position[token]] = 1

## Visualizing the One Hot Vectors


In [91]:
one_hot_matrix

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

## Inference

The first row corresponds to the One Hot vector of read, second for natural,
third for language and the final one for process based on their respective indices in the vocabulary

# Building a basic chatbot


In [94]:
#!pip install scikit-learn
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


#loading questions and answers in separate lists
import ast 
questions = []
answers = [] 
with open('qa_Electronics.json','r') as f:
    for line in f:
        data = ast.literal_eval(line)
        questions.append(data['question'].lower())
        answers.append(data['answer'].lower())

In [95]:
# tokenize the text and convert data in matrix format
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(questions)
# Transform data by applying term frequency inverse document frequency (TF-IDF) 
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)

In [96]:
def conversation(im):
    global tfidf, answers, X_tfidf
    Y_vec = vectorizer.transform(im)
    Y_tfidf = tfidf.fit_transform(Y_vec)
    cos_sim = np.rad2deg(np.arccos(max(cosine_similarity(Y_tfidf, X_tfidf)[0])))
    if cos_sim > 60 :
        return "sorry, I did not quite understand that"
    else:
        return answers[np.argmax(cosine_similarity(Y_tfidf, X_tfidf)[0])]

def main():
    usr = input("Please enter your username: ")
    print("support: Hi, welcome to Q&A support. How can I help you?")
    while True:
        im = input("{}: ".format(usr))
        if im.lower() == 'bye':
            print("Q&A support: bye!")
            break
        else:
            print("Q&A support: "+conversation([im]))

In [97]:
main()

Please enter your username: aditya
support: Hi, welcome to Q&A support. How can I help you?
aditya: tell me about yourself
Q&A support: germany
aditya: how to restart my laptop
Q&A support: hi, you may get you laptop in 3 to 5 business day depending on you location. thanks for you interest. tech mark.
aditya: how are you
Q&A support: sorry, I did not quite understand that
aditya: how can you help me?
Q&A support: i've had the same problem since i plugged it in. a small buzzing sound that gets louder. if i tap the earpiece where the mic arm is attached it goes away briefly. i sent that pair back, but the new set does the exact same thing. any help would be appreciated.
aditya: very good
Q&A support: yes
aditya: comm'n lets go
Q&A support: sorry, I did not quite understand that
aditya: what are you?
Q&A support: sorry, I did not quite understand that
aditya: okay bye
Q&A support: i would have to agree with samuel because most people can just use their iphone or droid phones to shoot grea