In [1]:
    
import os
import re
import math
import random
import warnings

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from nltk.tokenize import word_tokenize
import dateutil.parser

import time
import pickle


%matplotlib inline


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeffb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Note - See 11k Jupyter notebook for routines that create the combined data input files

####  create model files using the combined file as input in order to get answer data too

In [2]:
combined=pd.read_csv('data/stackoverflow/combined_all_ans.csv', keep_default_na=False, encoding='utf-8')
len(combined)

542216

In [3]:
# add a column for the number of images with this answer
# note that answers with no images have a num_of_images value of 2, representing the "[]" in the images_list
# for the emply image list.  So, answers with number_of_images > 2 have a image with them
combined['number_of_images'] = combined['images_list'].map(lambda x: len(x))

In [4]:
combined['number_of_images']

0           2
1           2
2          85
3           2
4           2
5           2
6           2
7           2
8           2
9           2
10          2
11          2
12          2
13          2
14          2
15          2
16          2
17          2
18          2
19          2
20          2
21          2
22          2
23          2
24          2
25          2
26          2
27          2
28          2
29          2
         ... 
542186      2
542187      2
542188      2
542189      2
542190      2
542191      2
542192      2
542193      2
542194      2
542195      2
542196      2
542197     85
542198      2
542199      2
542200    128
542201      2
542202     85
542203      2
542204      2
542205      2
542206     75
542207      2
542208      2
542209      2
542210      2
542211      2
542212      2
542213      2
542214      2
542215      2
Name: number_of_images, Length: 542216, dtype: int64

In [5]:
combined['images_list']

0                                                        []
1                                                        []
2         [<img alt="enter image description here" src="...
3                                                        []
4                                                        []
5                                                        []
6                                                        []
7                                                        []
8                                                        []
9                                                        []
10                                                       []
11                                                       []
12                                                       []
13                                                       []
14                                                       []
15                                                       []
16                                      

#### Create the various model files

In [42]:
# create  model using answer body + question tags

sttime=time.time()
raw_documents = combined['cleaned_body'] + ' ' + combined['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("Answer + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 542216
Answer + Tags Model Processing Completed! Elapsed time: 1903.0342192649841 seconds


In [46]:
with open('data/stackoverflow/tf_idf_model.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [47]:
with open('data/stackoverflow/similar_qs.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [48]:
# create  model using answer body + question title + question tags

sttime=time.time()
raw_documents = combined['cleaned_body'] + ' ' + combined['title_y'] + ' ' + combined['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("Answer + Question Title + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 542216
Answer + Question Title + Tags Model Processing Completed! Elapsed time: 669.8794641494751 seconds


In [52]:
with open('data/stackoverflow/tf_idf_model__ans_ques_title_tags.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [53]:
with open('data/stackoverflow/similar_qs__ans_ques_title_tags.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [58]:
# create  model using answer body + question tags using only answers that have images with them

sttime=time.time()

combinedimg=combined[combined['number_of_images'] > 2] # keep only answers with images

raw_documents = combinedimg['cleaned_body'] + ' ' + combinedimg['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    # Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("Answer + Tags, Only Answers with Images Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 65369
Answer + Tags, Only Answers with Images Model Processing Completed! Elapsed time: 73.34753203392029 seconds


In [13]:
with open('data/stackoverflow/tf_idf_model__ans_with_imgs.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [14]:
with open('data/stackoverflow/similar_qs__ans_with_imgs.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

In [15]:
# create  model using question title + question tags

sttime=time.time()

combinedacc = combined[combined['id_x'] == pd.to_numeric(combined['accepted_answer_id_y'],downcast='integer')] # only include question with accepted answers
raw_documents = combinedacc['title_y'] + ' ' + combinedacc['new_tags']

print("Number of Combined Docs:",len(raw_documents))

    #Tokenizing data
gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in raw_documents]

    # Create dictionary
dictionary = gensim.corpora.Dictionary(gen_docs)

    # Creat Document-Term Matrix
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

    # Creat TF-IDF Model
tf_idf = gensim.models.TfidfModel(corpus)

    # Creat Similarity Checker
similar_docs = gensim.similarities.Similarity("",tf_idf[corpus],num_features=len(dictionary))

print("Questions with Accepted Answers + Tags Model Processing Completed! Elapsed time:", time.time()-sttime, "seconds")


Number of Combined Docs: 218898
Questions with Accepted Answers + Tags Model Processing Completed! Elapsed time: 185.6744019985199 seconds


In [16]:
with open('data/stackoverflow/tf_idf_model__ques.p', 'wb') as model_file:
    pickle.dump(tf_idf, model_file)

In [17]:
with open('data/stackoverflow/similar_qs__ques.p', 'wb') as similar_docs_file:
    pickle.dump(similar_docs, similar_docs_file)

#### Function to retrieve questions and answers with similarity scores above a user-define threshold based on pre-loaded model given a list of input queries


In [31]:
def similar_docs_combined_corpus(query_list,corpus,test_run,threshold,top_num_to_return):
    results = pd.DataFrame()
    for input_query in query_list:
        query_doc = [w.lower() for w in word_tokenize(input_query)]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        doc_sim=similar_docs[query_doc_tf_idf]
        sim_threshold=threshold
        # Display similar questions from the past:

        print('len of corpus:',len(corpus))
        print('len of doc_sim:',len(doc_sim))
        corpus['Similarity']=doc_sim
        cmbdocs=corpus.sort_values('Similarity',ascending=False)
        # combdocs=cmbdocs.loc[:,:][cmbdocs['Similarity']>=sim_threshold]
        combdocs=cmbdocs[cmbdocs['Similarity']>=sim_threshold]
        if len(combdocs['cleaned_body']) < top_num_to_return:
            rslts_len=len(combdocs['cleaned_body'])
        else:
            rslts_len = top_num_to_return
        if rslts_len == 0:
            result = pd.DataFrame()
            result = result.append({'Corpus_Size':len(corpus), \
                                   'Test_Run':test_run, \
                                   'Input_query':input_query, \
                                   'Answer':' ', \
                                   'Related_Question':' ', \
                                   'Similarity_Score':' '}, ignore_index=True)
        else:   
            result = pd.DataFrame({'Corpus_Size':[len(corpus) for x in range(rslts_len)], \
                                   'Test_Run':[test_run for x in range(rslts_len)], \
                                   'Input_query':[input_query for x in range(rslts_len)], \
                                   'Answer':combdocs['cleaned_body'][0:rslts_len].tolist(), \
                                   'Related_Question':combdocs['title_y'][0:rslts_len].tolist(), \
                                   'Similarity_Score':combdocs['Similarity'][0:rslts_len]})
        results=results.append(result,ignore_index=True)
    return results

#### Load each model files and get the list of similar questions and answers.  Add the similar questions and answers to a combined result dataframe

In [19]:
# create combined result data frame to hold the results from all tests below

combrslts=pd.DataFrame()

In [20]:
# get the list of queries to run through each model

samp_ques = pd.read_csv('data/stackoverflow/Sample Questions V 2.csv', header=None,names=['ques'],encoding='utf-8')

In [21]:
Query_List=[x for x in samp_ques['ques']]

In [22]:
Query_List

['Is there a way to visualize the distribution of my data?',
 'How do I show data on a map?',
 'How can I illustrate changes in my data over time?',
 'Is there a way to show a "heatmap" of my data?',
 'How can I plot a comparison of two data sets?',
 'How can I create a chart without coding?',
 'When should I use a bar chart versus a pie chart?',
 'What is the easiest way to create a diagram of a network?',
 'I need help creating a visualization of my data',
 'I need help creating a graph of my data',
 'When should I use a scatter plot?',
 'How do I plot 2 datasets in d3?',
 'How can I animate a bar chart in Python?',
 'I know how to create a line chart with matplotlib, how do I do it in R?',
 'What is the easiest way to create a heat map of the US?',
 'How can I animate a choropleth in Tableau?',
 'How can I animate a choropleth in PowerBI?',
 'How can I animate a choropleth in d3?']

In [23]:
# load the question title + tag models
sttime=time.time()
with open('data/stackoverflow/tf_idf_model__ques.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)
print('load time for TF-IDF model for questions plus question tags:',time.time()-sttime,'seconds')

load time for TF-IDF model for questions plus question tags: 0.1578214168548584 seconds


In [24]:
sttime=time.time()
with open('data/stackoverflow/similar_qs__ques.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)
print('load time for similar_docs for questions plus question tags:',time.time()-sttime,'seconds')

load time for similar_docs for questions plus question tags: 0.5119781494140625 seconds


In [25]:
test_run='TF-IDF on questions plus question tags'

results = similar_docs_combined_corpus(Query_List,combinedacc,test_run,0.50,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [26]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,218898.0,Is there a way to visualize the distribution o...,,,TF-IDF on questions plus question tags
1,You might want to have a look at the leaflet ...,218898.0,How do I show data on a map?,display data on a map,0.538224,TF-IDF on questions plus question tags
2,Here's an example:,218898.0,How do I show data on a map?,How to plot data on a map without using Google...,0.531061,TF-IDF on questions plus question tags
3,What I would do to make this happen is to sim...,218898.0,How can I illustrate changes in my data over t...,How to illustrate angular data series in highc...,0.518669,TF-IDF on questions plus question tags
4,,218898.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on questions plus question tags
5,You could use to get a scatter plot. You woul...,218898.0,How can I plot a comparison of two data sets?,Scatter plot with two data sets,0.543256,TF-IDF on questions plus question tags
6,"If you only supply one variable, then assumes...",218898.0,How can I plot a comparison of two data sets?,Plotting two variables for comparison,0.542769,TF-IDF on questions plus question tags
7,"is just a list of colors, so use the keyword ...",218898.0,How can I create a chart without coding?,ColorMap Coding,0.535853,TF-IDF on questions plus question tags
8,"Welcome J Cheong Edit Sorry, I'm using Matlab...",218898.0,How can I create a chart without coding?,Octave Coding - I need help coding coefficient...,0.533574,TF-IDF on questions plus question tags
9,,218898.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on questions plus question tags


In [27]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [32]:
# load the answer body + question tags model
sttime=time.time()

with open('data/stackoverflow/tf_idf_model.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

print('load time for TF-IDF model for answers plus question tags:',time.time()-sttime,'seconds')

load time for TF-IDF model for answers plus question tags: 0.5867733955383301 seconds


In [33]:
sttime=time.time()

with open('data/stackoverflow/similar_qs.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

print('load time for similar_docs for answers plus question tags:',time.time()-sttime,'seconds')

load time for similar_docs for answers plus question tags: 0.5062260627746582 seconds


In [43]:
test_run='TF-IDF on answers plus question tags'

results = similar_docs_combined_corpus(Query_List,combined,test_run,0.50,2)

len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216


In [44]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,542216.0,Is there a way to visualize the distribution o...,,,TF-IDF on answers plus question tags
1,Map to :,542216.0,How do I show data on a map?,Adding multiple colors to a network in R,0.561741,TF-IDF on answers plus question tags
2,Use map .,542216.0,How do I show data on a map?,"Angular-Charts, Map char-Labels to Property of...",0.532434,TF-IDF on answers plus question tags
3,To illustrate my comment:,542216.0,How can I illustrate changes in my data over t...,Grouping Regressors in Anova Table for Multipl...,0.58465,TF-IDF on answers plus question tags
4,How about a heatmap? Plot the heatmap with de...,542216.0,"Is there a way to show a ""heatmap"" of my data?",How coloring data frame in R,0.590963,TF-IDF on answers plus question tags
5,Here it is better to use (heatmap).,542216.0,"Is there a way to show a ""heatmap"" of my data?",row column heatmap plot with overlayed circle ...,0.5587,TF-IDF on answers plus question tags
6,,542216.0,How can I plot a comparison of two data sets?,,,TF-IDF on answers plus question tags
7,Hope this coding will helps you :),542216.0,How can I create a chart without coding?,How to specify the actual x axis values to plo...,0.504976,TF-IDF on answers plus question tags
8,versus,542216.0,When should I use a bar chart versus a pie chart?,How to overplot a line on a scatter plot in py...,0.560095,TF-IDF on answers plus question tags
9,use following for Pie chart:,542216.0,When should I use a bar chart versus a pie chart?,Chartjs pie chart tooltip mode label,0.504397,TF-IDF on answers plus question tags


In [45]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [35]:
# load the answer body + question title + question tags model
sttime=time.time()

with open('data/stackoverflow/tf_idf_model__ans_ques_title_tags.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)
    
print('load time for TF-IDF model for answers plus questions plus question tags:',time.time()-sttime,'seconds')

load time for TF-IDF model for answers plus questions plus question tags: 0.6238059997558594 seconds


In [36]:
sttime=time.time()

with open('data/stackoverflow/similar_qs__ans_ques_title_tags.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)
    
print('load time for similar_docs for answers plus questions plus question tags:',time.time()-sttime,'seconds')

load time for similar_docs for answers plus questions plus question tags: 0.8508121967315674 seconds


In [49]:
test_run='TF-IDF on answers plus question titles + question tags'
sttime=time.time()
results = similar_docs_combined_corpus(Query_List,combined,test_run,0.50,2)
print('response time for 18 queries:',time.time()-sttime,'seconds')

len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
len of corpus: 542216
len of doc_sim: 542216
response time for 18 queries: 39.33021545410156 seconds


In [50]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,You could use boxplots to visualize the distr...,542216.0,Is there a way to visualize the distribution o...,Plotting distribution of differences in R,0.544921,TF-IDF on answers plus question titles + quest...
1,Here's an example:,542216.0,How do I show data on a map?,How to plot data on a map without using Google...,0.567264,TF-IDF on answers plus question titles + quest...
2,You are not providing the map data to your ch...,542216.0,How do I show data on a map?,Highmaps chart is empty in Angular 5,0.509525,TF-IDF on answers plus question titles + quest...
3,,542216.0,How can I illustrate changes in my data over t...,,,TF-IDF on answers plus question titles + quest...
4,How about a heatmap? Plot the heatmap with de...,542216.0,"Is there a way to show a ""heatmap"" of my data?",How coloring data frame in R,0.531121,TF-IDF on answers plus question titles + quest...
5,You could use seaborn instead of matplotlib f...,542216.0,"Is there a way to show a ""heatmap"" of my data?",How to plot data dependent on two variables in...,0.519752,TF-IDF on answers plus question titles + quest...
6,,542216.0,How can I plot a comparison of two data sets?,,,TF-IDF on answers plus question titles + quest...
7,,542216.0,How can I create a chart without coding?,,,TF-IDF on answers plus question titles + quest...
8,,542216.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on answers plus question titles + quest...
9,,542216.0,What is the easiest way to create a diagram of...,,,TF-IDF on answers plus question titles + quest...


In [51]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [54]:
# load the answer body + question tags, only answers with images model

with open('data/stackoverflow/tf_idf_model__ans_with_imgs.p', 'rb') as model_file:
    tf_idf = pickle.load(model_file)

In [55]:
with open('data/stackoverflow/similar_qs__ans_with_imgs.p', 'rb') as similar_qs_file:
    similar_docs = pickle.load(similar_qs_file)

In [59]:
test_run='TF-IDF on answers plus question tags, only answers with images'

results = similar_docs_combined_corpus(Query_List,combinedimg,test_run,0.50,2)

len of corpus: 65369
len of doc_sim: 65369


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369
len of corpus: 65369
len of doc_sim: 65369


In [60]:
results

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,65369.0,Is there a way to visualize the distribution o...,,,"TF-IDF on answers plus question tags, only ans..."
1,,65369.0,How do I show data on a map?,,,"TF-IDF on answers plus question tags, only ans..."
2,,65369.0,How can I illustrate changes in my data over t...,,,"TF-IDF on answers plus question tags, only ans..."
3,How about a heatmap? Plot the heatmap with de...,65369.0,"Is there a way to show a ""heatmap"" of my data?",How coloring data frame in R,0.526814,"TF-IDF on answers plus question tags, only ans..."
4,Here it is better to use (heatmap).,65369.0,"Is there a way to show a ""heatmap"" of my data?",row column heatmap plot with overlayed circle ...,0.513608,"TF-IDF on answers plus question tags, only ans..."
5,,65369.0,How can I plot a comparison of two data sets?,,,"TF-IDF on answers plus question tags, only ans..."
6,,65369.0,How can I create a chart without coding?,,,"TF-IDF on answers plus question tags, only ans..."
7,,65369.0,When should I use a bar chart versus a pie chart?,,,"TF-IDF on answers plus question tags, only ans..."
8,,65369.0,What is the easiest way to create a diagram of...,,,"TF-IDF on answers plus question tags, only ans..."
9,,65369.0,I need help creating a visualization of my data,,,"TF-IDF on answers plus question tags, only ans..."


In [61]:
# add results to combined results dataframe
combrslts=combrslts.append(results,ignore_index=True)

In [62]:
combrslts

Unnamed: 0,Answer,Corpus_Size,Input_query,Related_Question,Similarity_Score,Test_Run
0,,218898.0,Is there a way to visualize the distribution o...,,,TF-IDF on questions plus question tags
1,You might want to have a look at the leaflet ...,218898.0,How do I show data on a map?,display data on a map,0.538224,TF-IDF on questions plus question tags
2,Here's an example:,218898.0,How do I show data on a map?,How to plot data on a map without using Google...,0.531061,TF-IDF on questions plus question tags
3,What I would do to make this happen is to sim...,218898.0,How can I illustrate changes in my data over t...,How to illustrate angular data series in highc...,0.518669,TF-IDF on questions plus question tags
4,,218898.0,"Is there a way to show a ""heatmap"" of my data?",,,TF-IDF on questions plus question tags
5,You could use to get a scatter plot. You woul...,218898.0,How can I plot a comparison of two data sets?,Scatter plot with two data sets,0.543256,TF-IDF on questions plus question tags
6,"If you only supply one variable, then assumes...",218898.0,How can I plot a comparison of two data sets?,Plotting two variables for comparison,0.542769,TF-IDF on questions plus question tags
7,"is just a list of colors, so use the keyword ...",218898.0,How can I create a chart without coding?,ColorMap Coding,0.535853,TF-IDF on questions plus question tags
8,"Welcome J Cheong Edit Sorry, I'm using Matlab...",218898.0,How can I create a chart without coding?,Octave Coding - I need help coding coefficient...,0.533574,TF-IDF on questions plus question tags
9,,218898.0,When should I use a bar chart versus a pie chart?,,,TF-IDF on questions plus question tags


In [63]:
# write the combined results file to a csv
combrslts.to_csv('data/stackoverflow/combined_test_run_results.csv', index=False)