In [2]:
    
import os
import re
import math
import random
import warnings

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from nltk.tokenize import word_tokenize
import dateutil.parser

import time
import pickle


%matplotlib inline


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Note - See 11k Jupyter notebook for routines that create the combined data input files

####  create model files using the combined file as input in order to get answer data too

In [3]:
combined22k=pd.read_csv('data/stackoverflow/combined_all_ans_22k.csv', keep_default_na=False, encoding='utf-8')
len(combined22k)

28038

In [4]:
# add a column for the number of images with this answer
# note that answers with no images have a num_of_images value of 2, representing the "[]" in the images_list
# for the emply image list.  So, answers with number_of_images > 2 have a image with them
combined22k['number_of_images'] = combined22k['images_list'].map(lambda x: len(x))

In [5]:
combined22k['number_of_images']

0          2
1          2
2          2
3         85
4          2
5          2
6          2
7          2
8        179
9          2
10         2
11         2
12         2
13         2
14         2
15         2
16         2
17         2
18         2
19         2
20         2
21         2
22         2
23         2
24         2
25         2
26         2
27         2
28         2
29         2
        ... 
28008      2
28009      2
28010      2
28011      2
28012      2
28013    170
28014      2
28015      2
28016      2
28017     67
28018      2
28019      2
28020      2
28021     87
28022      2
28023      2
28024      2
28025      2
28026    255
28027      2
28028     86
28029     85
28030      2
28031      2
28032      2
28033      2
28034    170
28035      2
28036      2
28037      2
Name: number_of_images, Length: 28038, dtype: int64

In [6]:
combined22k['images_list']

0                                                       []
1                                                       []
2                                                       []
3        [<img alt="enter image description here" src="...
4                                                       []
5                                                       []
6                                                       []
7                                                       []
8        [<img alt="Output when using GPC. " src="https...
9                                                       []
10                                                      []
11                                                      []
12                                                      []
13                                                      []
14                                                      []
15                                                      []
16                                                      

#### Create the various model files

In [7]:
# create 22k model using answer body + question tags

sttime=time.time()
raw_documents = combined22k['cleaned_body'] + ' ' + combined22k['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

# gen_docs = [word_tokenize(text) 
#                for text in raw_documents]


    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("22k Answer + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 28038
22k Answer + Tags Model Processing Completed! Elapsed time: 71.39153385162354 seconds


In [8]:
with open('data/stackoverflow/tf_idf_model_22k.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [9]:
with open('data/stackoverflow/similar_qs_22k.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [10]:
# create 22k model using answer body + question title + question tags

sttime=time.time()
raw_documents = combined22k['cleaned_body'] + ' ' + combined22k['title_y'] + ' ' + combined22k['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("22k Answer + Question Title + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 28038
22k Answer + Question Title + Tags Model Processing Completed! Elapsed time: 74.86645030975342 seconds


In [11]:
with open('data/stackoverflow/tf_idf_model_22k_ans_ques_title_tags.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [12]:
with open('data/stackoverflow/similar_qs_22k_ans_ques_title_tags.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [13]:
# create 22k model using answer body + question tags using only answers that have images with them

sttime=time.time()

combined22kimg=combined22k[combined22k['number_of_images'] > 2] # keep only answers with images

raw_documents = combined22kimg['cleaned_body'] + ' ' + combined22kimg['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("22k Answer + Tags, Only Answers with Images Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 3392
22k Answer + Tags, Only Answers with Images Model Processing Completed! Elapsed time: 9.371024131774902 seconds


In [14]:
with open('data/stackoverflow/tf_idf_model_22k_ans_with_imgs.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [15]:
with open('data/stackoverflow/similar_qs_22k_ans_with_imgs.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [16]:
# create 22k model using question title + question tags

sttime=time.time()

combined22kacc = combined22k[combined22k['id_x'] == pd.to_numeric(combined22k['accepted_answer_id_y'],downcast='integer')] # only include question with accepted answers
raw_documents = combined22kacc['title_y'] + ' ' + combined22kacc['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    #Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("22k Questions with Accepted Answers + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 11235
22k Questions with Accepted Answers + Tags Model Processing Completed! Elapsed time: 8.974916219711304 seconds


In [17]:
with open('data/stackoverflow/tf_idf_model_22k_ques.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [18]:
with open('data/stackoverflow/similar_qs_22k_ques.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

#### Function to retrieve questions and answers with similarity scores above a user-define threshold based on pre-loaded model given a list of input queries


In [19]:
def similar_docs_combined_corpus(query_list,corpus,test_run,threshold,top_num_to_return):
    results = pd.DataFrame()
    for input_query in query_list:
        query_doc = [w.lower() for w in word_tokenize(input_query)]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        doc_sim=similar_docs[query_doc_tf_idf]
        sim_threshold=threshold
        # Display similar questions from the past:

        corpus['Similarity']=doc_sim
        cmbdocs=corpus.sort_values('Similarity',ascending=False)
        # combdocs=cmbdocs.loc[:,:][cmbdocs['Similarity']>=sim_threshold]
        combdocs=cmbdocs[cmbdocs['Similarity']>=sim_threshold]
        if len(combdocs['cleaned_body']) < top_num_to_return:
            rslts_len=len(combdocs['cleaned_body'])
        else:
            rslts_len = top_num_to_return
        if rslts_len == 0:
            result = pd.DataFrame()
            result = result.append({'Corpus_Size':len(corpus), \
                                   'Test_Run':test_run, \
                                   'Input_query':input_query, \
                                   'Answer':' ', \
                                   'Related_Question':' ', \
                                   'Similarity_Score':' '}, ignore_index=True)
        else:   
            result = pd.DataFrame({'Corpus_Size':[len(corpus) for x in range(rslts_len)], \
                                   'Test_Run':[test_run for x in range(rslts_len)], \
                                   'Input_query':[input_query for x in range(rslts_len)], \
                                   'Answer':combdocs['cleaned_body'][0:rslts_len].tolist(), \
                                   'Related_Question':combdocs['title_y'][0:rslts_len].tolist(), \
                                   'Similarity_Score':combdocs['Similarity'][0:rslts_len]})
        results=results.append(result,ignore_index=True)
    return results

#### Load each model files and get the list of similar questions and answers.  Add the similar questions and answers to a combined result dataframe

In [20]:
# create combined result data frame to hold the results from all tests below

combrslts=pd.DataFrame()

In [21]:
# get the list of queries to run through each model

samp_ques = pd.read_csv('data/stackoverflow/Sample Questions V 2.csv', header=None,names=['ques'],encoding='utf-8')

In [22]:
Query_List=[x for x in samp_ques['ques']]

In [23]:
Query_List

['Is there a way to visualize the distribution of my data?',
 'How do I show data on a map?',
 'How can I illustrate changes in my data over time?',
 'Is there a way to show a "heatmap" of my data?',
 'How can I plot a comparison of two data sets?',
 'How can I create a chart without coding?',
 'When should I use a bar chart versus a pie chart?',
 'What is the easiest way to create a diagram of a network?',
 'I need help creating a visualization of my data',
 'I need help creating a graph of my data',
 'When should I use a scatter plot?',
 'How do I plot 2 datasets in d3?',
 'How can I animate a bar chart in Python?',
 'I know how to create a line chart with matplotlib, how do I do it in R?',
 'What is the easiest way to create a heat map of the US?',
 'How can I animate a choropleth in Tableau?',
 'How can I animate a choropleth in PowerBI?',
 'How can I animate a choropleth in d3?']

In [24]:
# load the question title + tag models

with open('data/stackoverflow/tf_idf_model_22k_ques.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [25]:
with open('data/stackoverflow/similar_qs_22k_ques.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [26]:
test_run='TF-IDF on questions plus question tags'

results = similar_docs_combined_corpus(Query_List,combined22kacc,test_run,0.50,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [27]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,11235.0,Is there a way to visualize the distribution o...,,,TF-IDF on questions plus question tags
1,Here's an example:,11235.0,How do I show data on a map?,How to plot data on a map without using Google...,0.527034,TF-IDF on questions plus question tags
2,You can change the labels on the x-axis using...,11235.0,How can I illustrate changes in my data over t...,plotting changes over time in python/matplotlib,0.559225,TF-IDF on questions plus question tags
3,,11235.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on questions plus question tags
4,I expect you are looking for something like t...,11235.0,How can I plot a comparison of two data sets?,Comparison plots,0.507878,TF-IDF on questions plus question tags
5,Sure. Just start from an empty plot and then...,11235.0,How can I create a chart without coding?,How can I create a legend without a plot in R?,0.51222,TF-IDF on questions plus question tags
6,,11235.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on questions plus question tags
7,,11235.0,What is the easiest way to create a diagram of...,,,TF-IDF on questions plus question tags
8,From Google it would appear that this approac...,11235.0,I need help creating a visualization of my data,Need help creating a highchart histogram in a ...,0.505915,TF-IDF on questions plus question tags
9,From Google it would appear that this approac...,11235.0,I need help creating a graph of my data,Need help creating a highchart histogram in a ...,0.523505,TF-IDF on questions plus question tags


In [28]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [29]:
# load the answer body + question tags model

with open('data/stackoverflow/tf_idf_model_22k.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [30]:
with open('data/stackoverflow/similar_qs_22k.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [31]:
test_run='TF-IDF on answers plus question tags'

results = similar_docs_combined_corpus(Query_List,combined22k,test_run,0.50,2)

In [32]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,28038.0,Is there a way to visualize the distribution o...,,,TF-IDF on answers plus question tags
1,,28038.0,How do I show data on a map?,,,TF-IDF on answers plus question tags
2,,28038.0,How can I illustrate changes in my data over t...,,,TF-IDF on answers plus question tags
3,,28038.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on answers plus question tags
4,,28038.0,How can I plot a comparison of two data sets?,,,TF-IDF on answers plus question tags
5,,28038.0,How can I create a chart without coding?,,,TF-IDF on answers plus question tags
6,,28038.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on answers plus question tags
7,,28038.0,What is the easiest way to create a diagram of...,,,TF-IDF on answers plus question tags
8,,28038.0,I need help creating a visualization of my data,,,TF-IDF on answers plus question tags
9,,28038.0,I need help creating a graph of my data,,,TF-IDF on answers plus question tags


In [33]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [34]:
# load the answer body + question title + question tags model

with open('data/stackoverflow/tf_idf_model_22k_ans_ques_title_tags.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [35]:
with open('data/stackoverflow/similar_qs_22k_ans_ques_title_tags.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [36]:
test_run='TF-IDF on answers plus question titles + question tags'
sttime=time.time()
results = similar_docs_combined_corpus(Query_List,combined22k,test_run,0.50,2)
print('response time for 18 queries:',time.time()-sttime,'seconds')

response time for 18 queries: 9.509452104568481 seconds


In [37]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,28038.0,Is there a way to visualize the distribution o...,,,TF-IDF on answers plus question titles + quest...
1,,28038.0,How do I show data on a map?,,,TF-IDF on answers plus question titles + quest...
2,,28038.0,How can I illustrate changes in my data over t...,,,TF-IDF on answers plus question titles + quest...
3,,28038.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on answers plus question titles + quest...
4,,28038.0,How can I plot a comparison of two data sets?,,,TF-IDF on answers plus question titles + quest...
5,,28038.0,How can I create a chart without coding?,,,TF-IDF on answers plus question titles + quest...
6,,28038.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on answers plus question titles + quest...
7,,28038.0,What is the easiest way to create a diagram of...,,,TF-IDF on answers plus question titles + quest...
8,,28038.0,I need help creating a visualization of my data,,,TF-IDF on answers plus question titles + quest...
9,,28038.0,I need help creating a graph of my data,,,TF-IDF on answers plus question titles + quest...


In [38]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [39]:
# load the answer body + question tags, only answers with images model

with open('data/stackoverflow/tf_idf_model_22k_ans_with_imgs.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [40]:
with open('data/stackoverflow/similar_qs_22k_ans_with_imgs.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [41]:
test_run='TF-IDF on answers plus question tags, only answers with images'

results = similar_docs_combined_corpus(Query_List,combined22kimg,test_run,0.50,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [42]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,3392.0,Is there a way to visualize the distribution o...,,,"TF-IDF on answers plus question tags, only ans..."
1,,3392.0,How do I show data on a map?,,,"TF-IDF on answers plus question tags, only ans..."
2,,3392.0,How can I illustrate changes in my data over t...,,,"TF-IDF on answers plus question tags, only ans..."
3,,3392.0,"Is there a way to show a ""heatmap"" of my data?",,,"TF-IDF on answers plus question tags, only ans..."
4,,3392.0,How can I plot a comparison of two data sets?,,,"TF-IDF on answers plus question tags, only ans..."
5,,3392.0,How can I create a chart without coding?,,,"TF-IDF on answers plus question tags, only ans..."
6,,3392.0,When should I use a bar chart versus a pie chart?,,,"TF-IDF on answers plus question tags, only ans..."
7,,3392.0,What is the easiest way to create a diagram of...,,,"TF-IDF on answers plus question tags, only ans..."
8,,3392.0,I need help creating a visualization of my data,,,"TF-IDF on answers plus question tags, only ans..."
9,,3392.0,I need help creating a graph of my data,,,"TF-IDF on answers plus question tags, only ans..."


In [43]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [44]:
combrslts

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,11235.0,Is there a way to visualize the distribution o...,,,TF-IDF on questions plus question tags
1,Here's an example:,11235.0,How do I show data on a map?,How to plot data on a map without using Google...,0.527034,TF-IDF on questions plus question tags
2,You can change the labels on the x-axis using...,11235.0,How can I illustrate changes in my data over t...,plotting changes over time in python/matplotlib,0.559225,TF-IDF on questions plus question tags
3,,11235.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on questions plus question tags
4,I expect you are looking for something like t...,11235.0,How can I plot a comparison of two data sets?,Comparison plots,0.507878,TF-IDF on questions plus question tags
5,Sure. Just start from an empty plot and then...,11235.0,How can I create a chart without coding?,How can I create a legend without a plot in R?,0.51222,TF-IDF on questions plus question tags
6,,11235.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on questions plus question tags
7,,11235.0,What is the easiest way to create a diagram of...,,,TF-IDF on questions plus question tags
8,From Google it would appear that this approac...,11235.0,I need help creating a visualization of my data,Need help creating a highchart histogram in a ...,0.505915,TF-IDF on questions plus question tags
9,From Google it would appear that this approac...,11235.0,I need help creating a graph of my data,Need help creating a highchart histogram in a ...,0.523505,TF-IDF on questions plus question tags


In [45]:
# write the combined results file to a csv
combrslts.to_csv('data/stackoverflow/combined_test_run_results_22k.csv', index=False)