In [None]:
# Mount Google Drive so the notebook can read and write files stored there
from google.colab import drive # Colab helper used below to mount Drive

ROOT = "/content/drive"     # mount point for the drive
print(ROOT)                 # show the mount point (optional)

drive.mount(ROOT)           # mount Google Drive at ROOT (/content/drive)

/content/drive
Mounted at /content/drive


In [None]:
# Drive folder used throughout the notebook: it holds the word2vec archive,
# the train/validation TSV files and receives the exported StarSpace embeddings
DATA_FOLDER = "/content/drive/My Drive/StackOverflow Assistant Chatbot"

In [None]:
import gensim
import numpy as np
import nltk
import re
from gensim.models import KeyedVectors
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read later to build STOPWORDS
nltk.download('stopwords')

# Word embedding

[Pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google, trained on part of the Google News dataset (about 100 billion words).

In [None]:
# Load word embeddings, a mapping between keys and vectors of 300 dimensions.
# binary=True because the archive is in the binary word2vec format; limit=500000 caps the
# number of vectors read from the file (see the gensim documentation for `limit`).
oWordEmbeddings = KeyedVectors.load_word2vec_format(f'{DATA_FOLDER}/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000) 

In [None]:
# Sanity check: display the class of the loaded embedding object
type(oWordEmbeddings)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [None]:
# Example of most similar words for the analogy "king" + "woman" - "man"
# (words in `positive` are added to the query, words in `negative` are subtracted)
clMostSimilarWord = oWordEmbeddings.most_similar(positive=['woman', 'king'], negative=['man'])
# Last expression of the cell: display the (word, similarity) pairs
clMostSimilarWord

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454),
 ('royal_palace', 0.5087165832519531)]

In [None]:
"""
    Function name: QuestionEmbedding
    
    Objective: Calculate question embedding
    
    Summary algorithmic description: a mean of all word embedding in the question
    
    Input parameters: sQuestion : question to embed
                      oWordEmbeddings : dictionary-like mapping where each key is a word and each value is its embedding
                      iDim : size of the question embedding
    
    Return : question embedding
    
    Date : 28/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def QuestionEmbedding(sQuestion, oWordEmbeddings, iDim=300):
    """Return the mean embedding of the known words of a question.

    sQuestion is split on whitespace and tokens missing from oWordEmbeddings are ignored.
    oWordEmbeddings must support `in` and item lookup by token, and every vector must have
    iDim components. A zero vector of size iDim is returned when no token is known.
    """
    # Accumulator for the sum of the word vectors, starts at zero
    caSum = np.zeros(iDim)
    # Keep only the tokens that have an embedding (repeated tokens count each time)
    clKnownTokens = [sToken for sToken in sQuestion.split() if sToken in oWordEmbeddings]
    # Add up the embeddings of the known tokens
    for sToken in clKnownTokens:
        caSum += oWordEmbeddings[sToken]
    # Nothing to average: hand back the zero vector unchanged
    if not clKnownTokens:
        return caSum
    # Mean of the accumulated word vectors
    return caSum / len(clKnownTokens)

### Hits@K

The first simple metric is the fraction of queries whose duplicate appears among the top *K* ranked candidates:
$$ \text{Hits@K} = \frac{1}{N}\sum_{i=1}^N \, [dup_i \in topK(q_i)]$$

where $q_i$ is the i-th query, $dup_i$ is its duplicate, $topK(q_i)$ is the top K elements of the ranked sentences provided by our model and the operation $[dup_i \in topK(q_i)]$ equals 1 if the condition is true and 0 otherwise (more details about this operation could be found [here](https://en.wikipedia.org/wiki/Iverson_bracket)).


### DCG@K
The second one is a simplified [DCG metric](https://en.wikipedia.org/wiki/Discounted_cumulative_gain):

$$ \text{DCG@K} = \frac{1}{N} \sum_{i=1}^N\frac{1}{\log_2(1+rank_{dup_i})}\cdot[rank_{dup_i} \le K] $$

where $rank_{dup_i}$ is the position of the duplicate in the sorted list of the nearest sentences for the query $q_i$. According to this metric, the model gets a higher reward the higher the correct answer is ranked. If the answer does not appear in the top K at all, the reward is zero. 

In [None]:
"""
    Function name: HitsCount
    
    Objective: Compute Hits@K, the fraction of questions whose duplicate is ranked within the top K candidates
    
    Summary algorithmic description: count the ranks that are lower than or equal to K and divide by the number of questions
    
    Input parameters: clDuplicateRank : list of duplicates ranks; one rank per question; 
                                        e.g. [2, 3] means that the first duplicate has the rank 2, the second one has the rank 3.
                      iK : number of top-ranked elements (k in Hits@k metric)
    
    Return : Hits@k value for current ranking
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def HitsCount(clDuplicateRank, iK):
    """Return Hits@K: the fraction of ranks that are lower than or equal to iK.

    clDuplicateRank holds one 1-based rank per question (the position of its duplicate in
    the ranked candidates). iK is the cut-off. An empty rank list yields 0.0 instead of
    raising ZeroDivisionError.
    """
    # No question was evaluated: the mean is undefined, report 0.0 rather than crash
    if len(clDuplicateRank) == 0:
        return 0.0
    # Each comparison is a bool, so the sum counts the hits
    return sum(iRank <= iK for iRank in clDuplicateRank) / len(clDuplicateRank)

In [None]:
"""
    Function name: DCGScore
    
    Objective: Calculate the DCG score
    
    Summary algorithmic description: average over all questions of 1 / log2(1 + rank), where a question contributes 0 when its duplicate's rank is greater than k
    
    Input parameters: dup_ranks : list of duplicate ranks; one rank per question; 
                                  e.g. [2, 3] means that the first duplicate has the rank 2, the second one has the rank 3.
                      k : number of top-ranked elements (k in DCG@k metric)
    
    Return : DCG@k value for current ranking
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def DCGScore(dup_ranks, k):
    """Return the simplified DCG@k averaged over all questions.

    dup_ranks holds one 1-based rank per question (the position of its duplicate in the
    ranked candidates). A question contributes 1 / log2(1 + rank) when rank <= k and 0
    otherwise; the total is divided by the number of questions. An empty rank list yields
    0.0 instead of raising ZeroDivisionError.
    """
    # No question was evaluated: the mean is undefined, report 0.0 rather than crash
    if len(dup_ranks) == 0:
        return 0.0
    # Ranks beyond the cut-off are filtered out of the sum but still count in the denominator
    return sum(1 / (np.log2(1 + rank)) for rank in dup_ranks if rank <= k) / len(dup_ranks)

In [None]:
"""
    Function name: ReadCorpus
    
    Objective: Read data corpus
    
    Summary algorithmic description: Read all lines in the file
                                     Add every line to the list
    
    Input parameters: sFilePath : a file path
    
    Return : list of [question, similar question, negative example 1, negative example 2, ...] 
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def ReadCorpus(sFilePath):
    """Read a tab-separated corpus file and return one list of fields per line.

    sFilePath is opened as UTF-8. Each line is stripped of surrounding whitespace and split
    on tab characters. In this notebook the fields of a line are
    [question, similar question, negative example 1, negative example 2, ...].
    """
    # The context manager closes the file even if reading fails part-way through
    with open(sFilePath, encoding='utf-8') as oFile:
        # strip() drops the trailing newline before the line is split into its fields
        return [sLine.strip().split('\t') for sLine in oFile]

In [None]:
# Read validation corpus: one list per line, the first field is the question and the
# remaining fields are its candidates (the similar question comes first, see ReadCorpus)
clValData = ReadCorpus(f'{DATA_FOLDER}/validation.tsv') 

In [None]:
"""
    Function name: RankCandidates
    
    Objective: Rank candidates in a sorted list of pairs
    
    Summary algorithmic description: Compute cosine similarity between question and its candidates.
                                     Sort the list depend on its cosine similarities
    
    Input parameters: sQuestion : a question
                      clCandidates : a list of candidate questions which we want to rank
                      oWordEmbeddings : words embedding
                      iDim : dimension of the current embeddings
    
    Return : a sorted list of pairs (initial position in candidates list, candidate); the first is the best
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def RankCandidates(sQuestion, clCandidates, oWordEmbeddings, iDim=300):
    """Rank candidate questions by cosine similarity to a question.

    The question and every candidate are embedded with QuestionEmbedding using
    oWordEmbeddings and iDim. Returns a list of pairs
    (initial position in clCandidates, candidate), most similar first.
    """
    # Query embedding as a single-row matrix, the layout cosine_similarity expects
    caQueryRow = np.expand_dims(QuestionEmbedding(sQuestion, oWordEmbeddings, iDim), axis=0)
    # One embedding row per candidate, in the order the candidates were given
    clCandidateRows = []
    for sCandidate in clCandidates:
        clCandidateRows.append(QuestionEmbedding(sCandidate, oWordEmbeddings, iDim))
    caCandidateMatrix = np.array(clCandidateRows)
    # Similarity of the query to each candidate (first and only row of the result)
    caSimilarity = cosine_similarity(caQueryRow, caCandidateMatrix)[0]
    # argsort is ascending, so reverse it to put the most similar candidate first
    caBestFirst = np.argsort(caSimilarity)[::-1]
    # Pair every original position with the candidate text it refers to
    clRanking = []
    for iPosition in caBestFirst:
        clRanking.append((iPosition, clCandidates[iPosition]))
    return clRanking

In [None]:
# Rank (1-based) obtained by the similar question of every validation row, raw text
clSimilarQuestionRank = []
# Each row is [question, similar question, negative example 1, ...]
for sQuery, *clPool in clValData:
    # Candidates ordered from most to least similar, as (initial position, candidate) pairs
    clRanked = RankCandidates(sQuery, clPool, oWordEmbeddings)
    # Initial positions in ranked order; position 0 is the similar question
    clRankedPositions = [iPosition for iPosition, _ in clRanked]
    # Convert the 0-based index of the similar question into a 1-based rank
    clSimilarQuestionRank.append(clRankedPositions.index(0) + 1)

In [None]:
# Report both metrics on the raw validation questions for increasing cut-offs
for iCutoff in (1, 5, 10, 100, 500, 1000):
    # DCG score first, then hits count, as in the printed line
    fDcg = DCGScore(clSimilarQuestionRank, iCutoff)
    fHits = HitsCount(clSimilarQuestionRank, iCutoff)
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (iCutoff, fDcg, iCutoff, fHits))

DCG@   1: 0.209 | Hits@   1: 0.209
DCG@   5: 0.263 | Hits@   5: 0.311
DCG@  10: 0.279 | Hits@  10: 0.360
DCG@ 100: 0.316 | Hits@ 100: 0.548
DCG@ 500: 0.349 | Hits@ 500: 0.807
DCG@1000: 0.369 | Hits@1000: 1.000


In [None]:
# Special characters replaced by a space: / ( ) { } [ ] | @ , ;
# Raw string so the backslashes reach the regex engine without invalid-escape warnings
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is not 0-9, a-z, ' ', #, +, _ (these get removed)
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, stored as a set for fast membership tests in CleanRawText
STOPWORDS = set(stopwords.words('english'))

In [None]:
"""
    Function name: CleanRawText
    
    Objective: Clean a raw text
    
    Summary algorithmic description: All characters in text are lower case
                                     Remove characters that are not 0-9, a-z, ' ', #, +, _
                                     Remove stop words
    
    Input parameters: sText : a text
    
    Return : the preprocessed text
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def CleanRawText(sText):
    """Return a normalised version of a raw text.

    The text is lower-cased, characters matched by REPLACE_BY_SPACE_RE become spaces,
    characters matched by GOOD_SYMBOLS_RE are deleted, and tokens found in STOPWORDS are
    dropped. The remaining tokens are joined with single spaces.
    """
    # Lower-case first: GOOD_SYMBOLS_RE only keeps a-z, so upper case would be deleted
    sLowered = sText.lower()
    # Separators such as / ( ) , ; become spaces so the words around them stay apart
    sSpaced = REPLACE_BY_SPACE_RE.sub(' ', sLowered)
    # Delete every remaining character outside 0-9, a-z, ' ', #, +, _
    sFiltered = GOOD_SYMBOLS_RE.sub('', sSpaced)
    # split() yields only non-empty tokens, so the stop-word test is the only filter needed
    clKeptTokens = []
    for sToken in sFiltered.split():
        if sToken in STOPWORDS:
            continue
        clKeptTokens.append(sToken)
    # Joining whitespace-free tokens leaves no leading or trailing blanks
    return ' '.join(clKeptTokens)

In [None]:
# Cleaned validation data: CleanRawText applied to every field (question and candidates)
# of every validation row, keeping the row structure
clCleanedValData = [
    [CleanRawText(sField) for sField in clRow]
    for clRow in clValData
]

In [None]:
# Rank (1-based) obtained by the similar question of every validation row, cleaned text
clSimilarCleanedQuestionRank = []
# Each row is [question, similar question, negative example 1, ...]
for sQuery, *clPool in clCleanedValData:
    # Candidates ordered from most to least similar, as (initial position, candidate) pairs
    clRanked = RankCandidates(sQuery, clPool, oWordEmbeddings)
    # Initial positions in ranked order; position 0 is the similar question
    clRankedPositions = [iPosition for iPosition, _ in clRanked]
    # Convert the 0-based index of the similar question into a 1-based rank
    clSimilarCleanedQuestionRank.append(clRankedPositions.index(0) + 1)

In [None]:
# Report both metrics on the cleaned validation questions for increasing cut-offs
for iCutoff in (1, 5, 10, 100, 500, 1000):
    # DCG score first, then hits count, as in the printed line
    fDcg = DCGScore(clSimilarCleanedQuestionRank, iCutoff)
    fHits = HitsCount(clSimilarCleanedQuestionRank, iCutoff)
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (iCutoff, fDcg, iCutoff, fHits))

DCG@   1: 0.305 | Hits@   1: 0.305
DCG@   5: 0.375 | Hits@   5: 0.438
DCG@  10: 0.392 | Hits@  10: 0.489
DCG@ 100: 0.425 | Hits@ 100: 0.656
DCG@ 500: 0.447 | Hits@ 500: 0.830
DCG@1000: 0.465 | Hits@1000: 1.000


## Representations using StarSpace

In [None]:
"""
    Function name: CleanTextsInFile
    
    Objective: Clean texts in a file
    
    Summary algorithmic description: Open cleaned file
                                     Write cleaned texts from file in the cleaned file

    Input parameters: sFilePath : a file path to clean texts
                      sCleanedFilePath : a cleaned file from the source file
    
    Return : None
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def CleanTextsInFile(sFilePath, sCleanedFilePath):
    """Write a cleaned copy of a tab-separated text file.

    Every line of sFilePath (read as UTF-8) is stripped and split on tabs, each field is
    passed through CleanRawText, and the cleaned fields are written tab-separated on one
    line of sCleanedFilePath. Returns None.
    """
    # Both files are managed by `with`, so they are closed even if cleaning raises.
    # The source is opened first so a missing input does not truncate an existing output.
    with open(sFilePath, encoding='utf8') as oSourceFile, \
         open(sCleanedFilePath, 'w', encoding='utf8') as oCleanedFile:
        # Loop over all lines of the source file
        for sData in oSourceFile:
            # Clean every tab-separated text of the line
            clCleanedText = [CleanRawText(sText) for sText in sData.strip().split('\t')]
            # Write the cleaned texts as one tab-separated line
            print(*clCleanedText, sep='\t', file=oCleanedFile)

In [None]:
# Write a cleaned copy of the training set; cleaned_train.tsv is the file StarSpace trains on
CleanTextsInFile(f'{DATA_FOLDER}/train.tsv', f'{DATA_FOLDER}/cleaned_train.tsv')

In [None]:
!git clone https://github.com/facebookresearch/Starspace.git
%cd Starspace
!make

fatal: destination path 'Starspace' already exists and is not an empty directory.
/content/Starspace
g++ -pthread -std=gnu++11 -O3 -funroll-loops -g -c src/utils/normalize.cpp
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/dict.cpp
g++ -pthread -std=gnu++11 -O3 -funroll-loops -g -c src/utils/args.cpp
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/proj.cpp
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/parser.cpp -o parser.o
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/data.cpp -o data.o
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/model.cpp
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/starspace.cpp
g++ -pthread -std=gnu++11 -O3 -funroll-loops -I/usr/local/bin/boost_1_63_0/ -g -c src/doc_parser.cpp -o doc_parser.o
g++ -pthread -std=gnu++11 -O3 -funrol

In [None]:
# Train StarSpace on the cleaned training file ({DATA_FOLDER} is interpolated by IPython).
# The model is saved under the prefix "StarSpace_embeddings" in the current directory;
# presumably this also produces the StarSpace_embeddings.tsv file parsed later in this notebook.
! ./starspace train -trainFile '{DATA_FOLDER}/cleaned_train.tsv' -model StarSpace_embeddings \
-trainMode 3 \
-adagrad true \
-ngrams 1 \
-epoch 5 \
-dim 100 \
-similarity cosine \
-minCount 2 \
-verbose true \
-fileFormat labelDoc \
-negSearchLimit 10 \
-lr 0.05

Arguments: 
lr: 0.05
dim: 100
epoch: 5
maxTrainTime: 8640000
validationPatience: 10
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
batchSize: 5
thread: 10
minCount: 2
minCountLabel: 1
label: __label__
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
useWeight: 0
weightSep: :
Start to initialize starspace model.
Build dict from input file : /content/drive/My Drive/StackOverflow Assistant Chatbot/cleaned_train.tsv
Read 12M words
Number of words in dictionary:  95058
Number of labels in dictionary: 0
Loading data from file : /content/drive/My Drive/StackOverflow Assistant Chatbot/cleaned_train.tsv
Total number of examples loaded : 999740
Initialized model weights. Model size :
matrix : 95058 100
Training epoch 0: 0.05 0.01
Epoch: 100.0%  lr: 0.040000  loss: 0.043943  eta: 0h13m  tot: 0h3m16s  (20.0%)
 ---+++                Epoch    0 Train error : 0.04383

In [None]:
# Lookup table from a StarSpace key to its vector (trained with -dim 100)
cdStarspaceEmbedding = {}
# Stream the embedding file row by row; each row is "key<TAB>v1<TAB>v2..."
with open('/content/Starspace/StarSpace_embeddings.tsv', encoding='utf-8') as oEmbeddingFile:
    for sRow in oEmbeddingFile:
        # First field is the key, the remaining fields are the vector components
        sKey, *clComponents = sRow.strip().split('\t')
        # Store the components as a float32 array under their key
        cdStarspaceEmbedding[sKey] = np.array(clComponents, dtype=np.float32)

In [None]:
# Rank (1-based) obtained by the similar question of every cleaned validation row
# when the questions are embedded with the StarSpace vectors
clStarspaceSimilarCleanedQuestionRank = []
# Each row is [question, similar question, negative example 1, ...]
for sQuery, *clPool in clCleanedValData:
    # Candidates ordered from most to least similar; StarSpace vectors have 100 dimensions
    clRanked = RankCandidates(sQuery, clPool, cdStarspaceEmbedding, 100)
    # Initial positions in ranked order; position 0 is the similar question
    clRankedPositions = [iPosition for iPosition, _ in clRanked]
    # Convert the 0-based index of the similar question into a 1-based rank
    clStarspaceSimilarCleanedQuestionRank.append(clRankedPositions.index(0) + 1)

In [None]:
# Report both metrics for the StarSpace embeddings for increasing cut-offs
for iCutoff in (1, 5, 10, 100, 500, 1000):
    # DCG score first, then hits count, as in the printed line
    fDcg = DCGScore(clStarspaceSimilarCleanedQuestionRank, iCutoff)
    fHits = HitsCount(clStarspaceSimilarCleanedQuestionRank, iCutoff)
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (iCutoff, fDcg, iCutoff, fHits))

DCG@   1: 0.520 | Hits@   1: 0.520
DCG@   5: 0.615 | Hits@   5: 0.698
DCG@  10: 0.634 | Hits@  10: 0.756
DCG@ 100: 0.665 | Hits@ 100: 0.904
DCG@ 500: 0.675 | Hits@ 500: 0.979
DCG@1000: 0.677 | Hits@1000: 1.000


In [None]:
!cp '/content/Starspace/StarSpace_embeddings.tsv' "/content/drive/My Drive/StackOverflow Assistant Chatbot"