In [1]:
# Mount Google Drive
from google.colab import drive # import drive from google colab

ROOT = "/content/drive"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)

drive.mount(ROOT)           # we mount the google drive at /content/drive

/content/drive
Mounted at /content/drive


In [2]:
!pip install chatterbot
!pip install chatterbot_corpus

Collecting chatterbot
  Downloading ChatterBot-1.0.8-py2.py3-none-any.whl (63 kB)
[?25l[K     |█████▏                          | 10 kB 26.2 MB/s eta 0:00:01[K     |██████████▎                     | 20 kB 27.8 MB/s eta 0:00:01[K     |███████████████▌                | 30 kB 11.2 MB/s eta 0:00:01[K     |████████████████████▋           | 40 kB 9.3 MB/s eta 0:00:01[K     |█████████████████████████▊      | 51 kB 5.4 MB/s eta 0:00:01[K     |███████████████████████████████ | 61 kB 5.9 MB/s eta 0:00:01[K     |████████████████████████████████| 63 kB 1.5 MB/s 
Collecting sqlalchemy<1.4,>=1.3
  Downloading SQLAlchemy-1.3.24-cp37-cp37m-manylinux2010_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 11.7 MB/s 
[?25hCollecting mathparse<0.2,>=0.1
  Downloading mathparse-0.1.2-py3-none-any.whl (7.2 kB)
Installing collected packages: sqlalchemy, mathparse, chatterbot
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 1.4.27
    Uninstallin

In [3]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
import os

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances_argmin

from chatterbot import ChatBot
from chatterbot.trainers import ChatterBotCorpusTrainer

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
DATA_FOLDER = "/content/drive/My Drive/StackOverflow Assistant Chatbot"

# Intent recognition

We will do a binary classification on TF-IDF representations of texts. Labels will be either dialogue for general questions or stackoverflow for programming-related questions.

In [6]:
# Load TF-IDF features and tags
caTrainFeatureTFIDF, caTestFeatureTFIDF, clTrainTag, clTestTag = pickle.load(open(f'{DATA_FOLDER}/tfidf_features.pkl', 'rb')) 

In [7]:
# Logistic Regression classifier
oIntentRecognizer = LogisticRegression(penalty='l2', C=10, random_state=0, solver='liblinear')
# Fit the model according to the given training data
oIntentRecognizer.fit(caTrainFeatureTFIDF, clTrainTag)

LogisticRegression(C=10, random_state=0, solver='liblinear')

In [8]:
# Predict class labels for test set
caTestTagPrediction = oIntentRecognizer.predict(caTestFeatureTFIDF)
# Accuracy classification score
fTestAccuracy = accuracy_score(clTestTag, caTestTagPrediction)
print('Test accuracy = {}'.format(fTestAccuracy))

Test accuracy = 0.991575


In [9]:
# Dump the classifier
pickle.dump(oIntentRecognizer, open(f'{DATA_FOLDER}/intent_recognizer.pkl', 'wb'))

# Programming language classification

We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features.

In [10]:
# Load cleaned sample data
dfDialogue, dfStackOverflow = pickle.load(open(f'{DATA_FOLDER}/cleaned_sample_data.pkl', 'rb')) 

In [11]:
# StackOverflow titles
caStackOverflowTitle = dfStackOverflow['title'].values
# StackOverflow tags
caStackOverflowTag = dfStackOverflow['tag'].values

In [12]:
# Split StackOverflow datas into random train and test subsets
caTrainStackOverflowTitle, caTestStackOverflowTitle, caTrainStackOverflowTag, caTestStackOverflowTag = train_test_split(caStackOverflowTitle, caStackOverflowTag, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(caTrainStackOverflowTitle), len(caTestStackOverflowTitle)))

Train size = 160000, test size = 40000


In [13]:
# reuse the TF-IDF vectorizer
oTFIDFVectorizer = pickle.load(open(f'{DATA_FOLDER}/tfidf_vectorizer.pkl', 'rb'))
# Transform documents to document-term matrix
caTrainTFIDFFeature, caTestTFIDFFeature = oTFIDFVectorizer.transform(caTrainStackOverflowTitle), oTFIDFVectorizer.transform(caTestStackOverflowTitle)

In [14]:
# tag classifier using OneVsRestClassifier wrapper over LogisticRegression
oTagClassifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0, solver='liblinear'))
# Fit the model
oTagClassifier.fit(caTrainTFIDFFeature, caTrainStackOverflowTag)

OneVsRestClassifier(estimator=LogisticRegression(C=5, random_state=0,
                                                 solver='liblinear'))

In [15]:
# Predict class labels for test set
caTestTagPrediction = oTagClassifier.predict(caTestTFIDFFeature)
# Accuracy classification score
fTestAccuracy = accuracy_score(caTestStackOverflowTag, caTestTagPrediction)
print('Test accuracy = {}'.format(fTestAccuracy))

Test accuracy = 0.800725


In [16]:
# Dump tag classifier
pickle.dump(oTagClassifier, open(f'{DATA_FOLDER}/tag_classifier.pkl', 'wb'))

# Dialogue Manager

In [25]:
# special characters replaced by space
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
# Remove characters that are not 0-9, a-z, ' ', #, +, _
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# stop words
STOPWORDS = set(stopwords.words('english'))

In [19]:
"""
    Function name: CleanRawText
    
    Objective: Clean a raw text
    
    Summary algorithmic description: All characters in text are lower case
                                     Remove characters that are not 0-9, a-z, ' ', #, +, _
                                     Remove stop words
    
    Input parameters: sText : a text
    
    Return : the preprocessed text
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def CleanRawText(sText):
    # all characters in sText are lower case 
    sText = sText.lower()
    # special characters replaced by space
    sText = REPLACE_BY_SPACE_RE.sub(' ', sText)
    # Remove characters that are not 0-9, a-z, ' ', #, +, _
    sText = GOOD_SYMBOLS_RE.sub('', sText)
    # Remove stop words
    sText = ' '.join([sWord for sWord in sText.split() if sWord and sWord not in STOPWORDS])
    # Return preprocessed text
    return sText.strip()

In [27]:
"""
    Function name: QuestionEmbedding
    
    Objective: Calculate question embedding
    
    Summary algorithmic description: a mean of all word embedding in the question
    
    Input parameters: sQuestion : question to embed
                      oWordEmbeddings : dictionnary where the key is a word and a value is it's embedding
                      iDim : size of the question embedding
    
    Return : question embedding
    
    Date : 28/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def QuestionEmbedding(sQuestion, oWordEmbeddings, iDim=300):
    #  question embedding is initialized with filled zeros
    caResult = np.zeros(iDim)
    # Number of embedded words
    iCount = 0
    # Loop over all words of this question
    for sWord in sQuestion.split():
        # If word is embedded
        if sWord in oWordEmbeddings:
            # Add this embedding to question embedding
            caResult += oWordEmbeddings[sWord]
            # Number of embedded words inscrease
            iCount += 1
    # Return a mean of all word embedding in the question
    return caResult / iCount if iCount != 0 else caResult

In [22]:
"""
    Function name: LoadEmbeddings
    
    Objective: Load pre-trained word embeddings from tsv file
    
    Summary algorithmic description: Load pre-trained word embeddings from tsv file into a dict
    
    Input parameters: sEmbeddingsPath : path to the embeddings file
    
    Return : dict mapping words to vectors and dimension of the vectors
    
    Date : 06/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def LoadEmbeddings(sEmbeddingsPath):
    # a mapping between keys and vectors 
    cdEmbedding = {}
    # Open file embedding
    with open(sEmbeddingsPath, encoding='utf-8') as f:  
        # Read every line in file
        for line in f.readlines():
            # Separate key and vector
            clLine = line.strip().split('\t')
            # Add key and vector embedding into the dictionnary
            cdEmbedding[clLine[0]] = np.array(clLine[1:], dtype=np.float32)  
    
    # dimension of the vectors
    iEmbeddingDim = cdEmbedding[list(cdEmbedding)[0]].shape[0]
    # Return dict mapping words to vectors and dimension of the vectors
    return cdEmbedding, iEmbeddingDim

In [17]:
"""
    Class name: ThreadRanker 
    
    Objective: Find post id of the most similar thread for the question
    
    Summary algorithmic description: Load title embeddings of the tag
                                     Find post id of the most similar thread for the question
    
    Input parameters: None
    
    Date : 06/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
class ThreadRanker(object):
    def __init__(self):
        # Load pre-trained starspace word embeddings from tsv file
        self.cdWordEmbedding, self.iEmbeddingsDim = LoadEmbeddings(f'{DATA_FOLDER}/StarSpace_embeddings.tsv')
        # thread embedding folder by tags
        self.sThreadEmbeddingsFolder = f'{DATA_FOLDER}/thread_embeddings_by_tags'


    """
        Function name: __load_embeddings_by_tag

        Objective: Load title embeddings of the tag

        Summary algorithmic description: Load thread embedding file by tag

        Input parameters: sTagName : tag name

        Return : post Ids of the tag and a matrix where embeddings for each title are stored
    """
    def __load_embeddings_by_tag(self, sTagName):
        # tag embedding file
        sEmbeddingsPath = os.path.join(self.sThreadEmbeddingsFolder, sTagName + ".pkl")
        # Load tag embedding
        caTagPostId, caTagVector = pickle.load(open(sEmbeddingsPath, 'rb'))
        # Return post Ids of the tag and a matrix where embeddings for each title are stored.
        return caTagPostId, caTagVector


    """
        Function name: get_best_thread

        Objective: Find post id of the most similar thread for the question

        Summary algorithmic description: Calculate question embedding
                                         Search the most similar thread for the question across the threads with a given tag

        Input parameters: sQuestion : a question
                          sTagName : tag name

        Return : post id of the most similar thread for the question
    """
    def get_best_thread(self, sQuestion, sTagName):
        # Load title embeddings of the tag
        caTagPostId, caTagVector = self.__load_embeddings_by_tag(sTagName)
        # question embedding
        caQuestionEmbedding = QuestionEmbedding(sQuestion, self.cdWordEmbedding, self.iEmbeddingsDim).reshape(1, -1)
        # index of the post which is the most similar thread for the question
        iBestThreadIndex = pairwise_distances_argmin(caQuestionEmbedding, caTagVector) 
        # Return id of the most similar thread for the question
        return caTagPostId[iBestThreadIndex]

In [20]:
"""
    Class name: DialogueManager 
    
    Objective: Train the chatbot model to generate a reponse
    
    Summary algorithmic description: Initialize and train chatbot model
                                     Generate answer for Chit-chat part and programming-related question
    
    Input parameters: None
    
    Date : 06/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
class DialogueManager(object):
    def __init__(self):
        print("Loading resources...")

        # Intent recognition
        self.oIntentRecognizer = pickle.load(open(f'{DATA_FOLDER}/intent_recognizer.pkl', 'rb')) 
        # TF-IDF vectorizer
        self.oTFIDFVectorizer = pickle.load(open(f'{DATA_FOLDER}/tfidf_vectorizer.pkl', 'rb')) 
        # answer template
        self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s'

        # tag classifier
        self.oTagClassifier = pickle.load(open(f'{DATA_FOLDER}/tag_classifier.pkl', 'rb')) 
        # Find post id of the most similar thread for the question
        self.oThreadRanker = ThreadRanker()
        # Initialize and train the chatbot model
        self.__init_chitchat_bot()


    """
        Function name: __init_chitchat_bot

        Objective: Initialize and train chatbot model

        Summary algorithmic description: Create an instance of the ChatBot class
                                         Create a new trainer for the chatbot
                                         Train the chatbot based on the english corpus

        Input parameters: None

        Return : None
    """
    def __init_chitchat_bot(self):
        # Create an instance of the ChatBot class
        oChatbot = ChatBot('StackOverflow Assistance')
        # Create a new trainer for the chatbot
        oTrainer = ChatterBotCorpusTrainer(oChatbot)
        # Train the chatbot based on the english corpus
        oTrainer.train("chatterbot.corpus.english")
        # Chatbot model
        self.oChitchatBot = oChatbot


    """
        Function name: generate_answer

        Objective: Generate answer for Chit-chat part and programming-related question

        Summary algorithmic description: Recognize intent of the question using `intent_recognizer`
                                         Pass question to chitchat_bot to generate a response
                                         or to ThreadRanker to find post id of the most similar thread for the question

        Input parameters: sQuestion : a question

        Return : an answer
    """       
    def generate_answer(self, sQuestion):
        # Clean a raw question
        sCleanedQuestion = CleanRawText(sQuestion) 
        # TF-IDF features of this question
        caTFIDFFeature = self.oTFIDFVectorizer.transform([sCleanedQuestion]) 
        # Intent recognition; Labels will be either dialogue or stackoverflow for programming-related questions.
        sIntent = self.oIntentRecognizer.predict(caTFIDFFeature) 

        # Chit-chat part: dialogue   
        if sIntent == 'dialogue':
            # Pass question to chitchat_bot to generate a response.       
            sResponse = self.oChitchatBot.get_response(sQuestion)
            # Return dialogue answer 
            return sResponse
        
        # Goal-oriented part: programming-related questions
        else:        
            # Pass features to tag_classifier to get predictions.
            sTag = self.oTagClassifier.predict(caTFIDFFeature)[0] 
            
            # Pass cleaned question to thread_ranker to get predictions.
            iThreadId = self.oThreadRanker.get_best_thread(sQuestion, sTag)[0] 
            # Return answer of programming-related question
            return self.ANSWER_TEMPLATE % (sTag, iThreadId)

In [23]:
# Train the chatbot model to generate a reponse
oDialogueManager = DialogueManager()

Loading resources...
Training ai.yml: [####################] 100%
Training botprofile.yml: [####################] 100%
Training computers.yml: [####################] 100%
Training conversations.yml: [####################] 100%
Training emotion.yml: [####################] 100%
Training food.yml: [####################] 100%
Training gossip.yml: [####################] 100%
Training greetings.yml: [####################] 100%
Training health.yml: [####################] 100%
Training history.yml: [####################] 100%
Training humor.yml: [####################] 100%
Training literature.yml: [####################] 100%
Training money.yml: [####################] 100%
Training movies.yml: [####################] 100%
Training politics.yml: [####################] 100%
Training psychology.yml: [####################] 100%
Training science.yml: [####################] 100%
Training sports.yml: [####################] 100%
Training trivia.yml: [####################] 100%


In [38]:
# list of questions
clQuestion = [
    "Hey", 
    "How are you doing?", 
    "What's your hobby?", 
    "How to write a loop in python?",
    "How to delete rows in pandas?",
    "python3 re",
    "What is the difference between c and c++",
    "Multithreading in Java",
    "Catch exceptions C++",
    "What is AI?",
]

# Loop over all test questions
for sQuestion in clQuestion:
    # Generate answer for Chit-chat part and programming-related question
    sAnswer = oDialogueManager.generate_answer(sQuestion) 
    print('Q: %s\nA: %s \n' % (sQuestion, sAnswer))

Q: Hey
A: Which is your favourite soccer club? 

Q: How are you doing?
A: I am doing well. 

Q: What's your hobby?
A: you act like a child 

Q: How to write a loop in python?
A: I think its about python
This thread might help you: https://stackoverflow.com/questions/26736277 

Q: How to delete rows in pandas?
A: I think its about python
This thread might help you: https://stackoverflow.com/questions/24612584 

Q: python3 re
A: I think its about python
This thread might help you: https://stackoverflow.com/questions/10769394 

Q: What is the difference between c and c++
A: I think its about c_cpp
This thread might help you: https://stackoverflow.com/questions/25180069 

Q: Multithreading in Java
A: I think its about java
This thread might help you: https://stackoverflow.com/questions/8318 

Q: Catch exceptions C++
A: I think its about c_cpp
This thread might help you: https://stackoverflow.com/questions/336475 

Q: What is AI?
A: I think its about java
This thread might help you: https:/