In [None]:
# Mount Google Drive so that files stored under "My Drive" can be read and written from this notebook
from google.colab import drive # Colab helper that provides drive.mount()

ROOT = "/content/drive"     # directory at which Google Drive is mounted
print(ROOT)                 # optional: echo the mount point

drive.mount(ROOT)           # mount Google Drive at ROOT (/content/drive)

/content/drive
Mounted at /content/drive


In [None]:
# Project data directory on the mounted Google Drive.
# NOTE(review): the value ends with "/" and the cells below build paths as f'{DATA_FOLDER}/<file>',
# so every resulting path contains "//" (POSIX path resolution treats repeated slashes as one).
DATA_FOLDER = "/content/drive/My Drive/StackOverflow Assistant Chatbot/"

In [None]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
import os

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Download the NLTK stop-word corpus; stopwords.words('english') needs it to be available locally
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Number of rows randomly drawn from each of the two datasets
# (DataFrame.sample raises ValueError if a file holds fewer rows than this)
SAMPLE_SIZE = 200000

# dialogue phrases from movie subtitles (negative samples); random_state=0 makes the draw reproducible
dfDialogue = pd.read_csv(f'{DATA_FOLDER}/dialogues.tsv', sep='\t').sample(SAMPLE_SIZE, random_state=0)
# StackOverflow posts, tagged with one programming language (positive samples), sampled the same way
dfStackOverflow = pd.read_csv(f'{DATA_FOLDER}/tagged_posts.tsv', sep='\t').sample(SAMPLE_SIZE, random_state=0)

In [None]:
# Preview the first rows of the sampled dialogue phrases
dfDialogue.head()

Unnamed: 0,text,tag
82925,"Donna, you are a muffin.",dialogue
48774,He was here last night till about two o'clock....,dialogue
55394,"All right, then make an appointment with her s...",dialogue
90806,"Hey, what is this-an interview? We're supposed...",dialogue
107758,Yeah. He's just a friend of mine I was trying ...,dialogue


In [None]:
# Preview the first rows of the sampled StackOverflow posts
dfStackOverflow.head()

Unnamed: 0,post_id,title,tag
2168983,43837842,Efficient Algorithm to compose valid expressio...,python
1084095,15747223,Why does this basic thread program fail with C...,c_cpp
1049020,15189594,Link to scroll to top not working,javascript
200466,3273927,Is it possible to implement ping on windows ph...,c#
1200249,17684551,GLSL normal mapping issue,c_cpp


# Preprocess the data

In [None]:
# special characters replaced by space
# (raw string: "\[", "\]" and "\|" are regex escapes, not Python string escapes;
#  in a plain string literal they are invalid escape sequences and trigger a warning)
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Remove characters that are not 0-9, a-z, ' ', #, +, _
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, stored in a set (membership is tested once per word in CleanRawText)
STOPWORDS = set(stopwords.words('english'))

In [None]:
"""
    Function name: CleanRawText
    
    Objective: Clean a raw text
    
    Summary algorithmic description: All characters in text are lower case
                                     Remove characters that are not 0-9, a-z, ' ', #, +, _
                                     Remove stop words
    
    Input parameters: sText : a text
    
    Return : the preprocessed text
    
    Date : 04/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def CleanRawText(sText):
    """Normalize a raw text: lower-case it, strip unwanted symbols, drop stop words.

    Input parameters: sText : the raw text (a str)

    Return : the remaining words joined by single spaces
    """
    # Work on a lower-cased copy of the text
    sLowered = sText.lower()
    # Separator-like characters ( / ( ) { } [ ] | @ , ; ) become spaces
    sSpaced = REPLACE_BY_SPACE_RE.sub(' ', sLowered)
    # Drop everything that is not 0-9, a-z, ' ', #, +, _
    sFiltered = GOOD_SYMBOLS_RE.sub('', sSpaced)
    # Keep the words that are not stop words, in their original order
    clKeptWord = []
    for sToken in sFiltered.split():
        if sToken in STOPWORDS:
            continue
        clKeptWord.append(sToken)
    # Words contain no whitespace, so the joined text has no leading/trailing blanks
    return ' '.join(clKeptWord)

In [None]:
# Clean dialogue phrases (the 'text' column is overwritten with the cleaned text)
dfDialogue['text'] = dfDialogue['text'].apply(CleanRawText) 
# Clean StackOverflow titles (the 'title' column is overwritten with the cleaned text)
dfStackOverflow['title'] = dfStackOverflow['title'].apply(CleanRawText) 

In [None]:
# Save the cleaned data.
# The context manager closes (and therefore flushes) the file even if pickling fails,
# instead of leaving the open handle to the garbage collector.
with open(f'{DATA_FOLDER}/cleaned_sample_data.pkl', 'wb') as oFile:
    pickle.dump((dfDialogue, dfStackOverflow), oFile)

# TF-IDF features

In [None]:
"""
    Function name: TfidfFeatures
    
    Objective: Perform TF-IDF transformation and dump the model
    
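    Summary algorithmic description: Fit a TF-IDF vectorizer (unigrams and bigrams) on the training data,
                                     transform the training and test data with it,
                                     then pickle the fitted vectorizer

    Input parameters: caTrainData : training data
                      caTestData : test data
                      sVectorizerPath : file path where the fitted TF-IDF vectorizer is pickled
    
    Return : training TF-IDF features and test TF-IDF features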
    
    Date : 05/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def TfidfFeatures(caTrainData, caTestData, sVectorizerPath):
    """Fit a TF-IDF vectorizer on the training texts, transform both sets, and pickle the vectorizer.

    Input parameters: caTrainData : iterable of training texts (the vocabulary and idf are learned from it)
                      caTestData : iterable of test texts (only transformed, never fitted on)
                      sVectorizerPath : file path where the fitted vectorizer is pickled

    Return : (training TF-IDF features, test TF-IDF features)
    """
    # a matrix of TF-IDF features : we filter out too rare words (occur less than in 5 titles)
    # and too frequent words (occur more than in 90% of the titles)
    # Also, use bigrams along with unigrams in the vocabulary.
    # token_pattern is a raw string: "\S" is a regex class, not a Python string escape.
    # r'(\S+)' makes every whitespace-delimited run a token, so symbols such as '#' and '+' are kept.
    oTfidfVectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    # Learn vocabulary and idf
    caTrainFeature = oTfidfVectorizer.fit_transform(caTrainData)
    # Transform documents to document-term matrix
    caTestFeature = oTfidfVectorizer.transform(caTestData)
    # Dump the model
    with open(sVectorizerPath, 'wb') as f:
        pickle.dump(oTfidfVectorizer, f)
    # Return TF-IDF vectorized representation of train and test set
    return caTrainFeature, caTestFeature

In [None]:
# concatenate dialogue and stackoverflow examples into one sample
caData = np.concatenate([dfDialogue['text'].values, dfStackOverflow['title'].values])
# one label per example, in the same order as caData
# (at this point clTrainTag holds ALL labels; the split below rebinds it to the training labels only)
clTrainTag = ['dialogue'] * dfDialogue.shape[0] + ['stackoverflow'] * dfStackOverflow.shape[0]
# split it into train and test in proportion 9:1 (random_state=0 makes the split reproducible)
caTrainData, caTestData, clTrainTag, clTestTag = train_test_split(caData, clTrainTag, test_size=0.1, random_state=0) 
print('Train size = {}, test size = {}'.format(len(caTrainData), len(caTestData)))
# fit TF-IDF on the training texts, transform both sets, and pickle the fitted vectorizer
caTrainFeatureTFIDF, caTestFeatureTFIDF = TfidfFeatures(caTrainData, caTestData, f'{DATA_FOLDER}/tfidf_vectorizer.pkl') 

Train size = 360000, test size = 40000


In [None]:
# Save TF-IDF features and tags.
# The context manager closes (and therefore flushes) the file even if pickling fails,
# instead of leaving the open handle to the garbage collector.
with open(f'{DATA_FOLDER}/tfidf_features.pkl', 'wb') as oFile:
    pickle.dump((caTrainFeatureTFIDF, caTestFeatureTFIDF, clTrainTag, clTestTag), oFile)

# Thread Embeddings by tag

In [None]:
"""
    Function name: QuestionEmbedding
    
    Objective: Calculate question embedding
    
    Summary algorithmic description: a mean of all word embedding in the question
    
    Input parameters: sQuestion : question to embed
                      oWordEmbeddings : dictionary where the key is a word and the value is its embedding
                      iDim : size of the question embedding
    
    Return : question embedding
    
    Date : 28/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def QuestionEmbedding(sQuestion, oWordEmbeddings, iDim=300):
    """Average the embeddings of the words of a question that have an embedding.

    Input parameters: sQuestion : question to embed (split on whitespace)
                      oWordEmbeddings : mapping from a word to its embedding vector
                      iDim : size of the question embedding

    Return : the mean of the embeddings of the known words,
             or a vector of iDim zeros when no word of the question is known
    """
    # Accumulator for the sum of the word embeddings
    caSum = np.zeros(iDim)
    # Words of the question that have an embedding, in question order
    clKnownWord = [sWord for sWord in sQuestion.split() if sWord in oWordEmbeddings]
    # Add up the embeddings of the known words
    for sWord in clKnownWord:
        caSum += oWordEmbeddings[sWord]
    # Nothing to average: return the zero vector as is
    if not clKnownWord:
        return caSum
    # Mean of the embeddings of the known words
    return caSum / len(clKnownWord)

In [None]:
"""
    Function name: LoadEmbeddings
    
    Objective: Load pre-trained word embeddings from tsv file
    
    Summary algorithmic description: Load pre-trained word embeddings from tsv file into a dict
    
    Input parameters: sEmbeddingsPath : path to the embeddings file
    
    Return : dict mapping words to vectors and dimension of the vectors
    
    Date : 06/12/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def LoadEmbeddings(sEmbeddingsPath):
    """Load pre-trained word embeddings from a tab-separated file into a dict.

    Each non-blank line is expected to hold a word followed by the components
    of its vector, separated by tabs.

    Input parameters: sEmbeddingsPath : path to the embeddings file (read as UTF-8)

    Return : (dict mapping each word to a float32 numpy vector, dimension of the vectors);
             the dimension is taken from the first vector loaded

    Raises : ValueError if the file contains no embedding line
    """
    # a mapping between keys and vectors
    cdEmbedding = {}
    # Open file embedding
    with open(sEmbeddingsPath, encoding='utf-8') as f:
        # Iterate over the file lazily: readlines() would hold the whole file in memory
        for sLine in f:
            sLine = sLine.strip()
            # Skip blank lines (e.g. a trailing newline); they would otherwise
            # register an empty key with a zero-length vector
            if not sLine:
                continue
            # Separate key and vector
            clLine = sLine.split('\t')
            # Add key and vector embedding into the dictionary
            cdEmbedding[clLine[0]] = np.array(clLine[1:], dtype=np.float32)

    # Fail with an explicit message instead of an opaque IndexError further down
    if not cdEmbedding:
        raise ValueError('No embeddings found in %r' % (sEmbeddingsPath,))
    # dimension of the vectors (first inserted vector, without copying all the keys)
    iEmbeddingDim = next(iter(cdEmbedding.values())).shape[0]
    # Return dict mapping words to vectors and dimension of the vectors
    return cdEmbedding, iEmbeddingDim

In [None]:
# Load pre-trained StarSpace embeddings from the tsv file:
# a dict mapping each word to its vector, plus the dimension of the vectors
cdStarspaceEmbedding, iEmbeddingDim = LoadEmbeddings(f'{DATA_FOLDER}/StarSpace_embeddings.tsv')

In [None]:
# Load the full (unsampled) set of StackOverflow posts, each tagged with one programming language;
# this reads the same tagged_posts.tsv file that was sampled earlier for the classifier data
dfStackOverflowPost = pd.read_csv(f'{DATA_FOLDER}/tagged_posts.tsv', sep='\t')

In [None]:
# Count the posts of each tag: the result is indexed by tag and holds the number of posts per tag
dfCountByTag = dfStackOverflowPost.groupby(['tag'])['tag'].count()

In [None]:
# Create the directory thread_embeddings_by_tags (one pickle file per tag is written into it)
sTagEmbeddingDir = f'{DATA_FOLDER}/thread_embeddings_by_tags'
os.makedirs(sTagEmbeddingDir, exist_ok=True)

# Loop over all tags and their post counts
for sTag, iCount in dfCountByTag.items():
    # StackOverflow posts with the same tag
    dfStackOverflowTagPost = dfStackOverflowPost[dfStackOverflowPost['tag'] == sTag]
    # Post Ids of the tag
    caTagPostId = dfStackOverflowTagPost['post_id'].values
    # a matrix where embeddings for each title are stored (one row per post of the tag)
    caTagVector = np.zeros((iCount, iEmbeddingDim), dtype=np.float32)
    # Loop over all titles of all posts of the tag
    # NOTE(review): titles are embedded exactly as read from the file, without CleanRawText —
    # confirm the file content matches the vocabulary of the embeddings
    for iIndex, sTitle in enumerate(dfStackOverflowTagPost['title']):
        # Calculate title embedding
        caTagVector[iIndex, :] = QuestionEmbedding(sTitle, cdStarspaceEmbedding, iEmbeddingDim)

    # Dump post ids and vectors to a file.
    sFilename = os.path.join(sTagEmbeddingDir, os.path.normpath('%s.pkl' % sTag))
    # The context manager closes each file as soon as it is written, so the loop
    # does not accumulate open handles (one per tag) until garbage collection
    with open(sFilename, 'wb') as oFile:
        pickle.dump((caTagPostId, caTagVector), oFile)