In [11]:
import sys
sys.path.append('../')
sys.path.append('../src/')

import pandas
import scipy
import numpy as np
from pprint import pprint

from gensim.models.phrases import *

import gensim.models as models
import gensim.models.word2vec as word2vec
import secure

import searchbetter.search as search
reload(search)
import searchbetter.rewriter as rewriter
reload(rewriter)
import analysis.plots as plots
reload(plots)
import analysis.stats as stats
reload(stats)

import plotly
import plotly.graph_objs as go
import plotly.offline as py

import webcolors

py.init_notebook_mode()


# definitions and stuff

# colors plotly uses

colors = [
    '#1f77b4',  # blue
    '#ff7f0e',  # orange
    '#2ca02c',  # green
    '#d62728',  # red
    '#9467bd',  # purple
]

# The traces below take their colors as 'rgb(r,g,b)' strings, so every hex
# code (e.g. #FF0000) is converted to its 'rgb(255,0,0)' form.
rgb_colors = list(map(webcolors.hex_to_rgb, colors))
color_strings = [
    'rgb(%s,%s,%s)' % (red, green, blue)
    for (red, green, blue) in rgb_colors
]

ImportError: No module named search

In [12]:
# Persist the per-term hit-count table to CSV. With pandas' default
# index=True the row index is written as an unnamed first column.
# NOTE(review): `df` is only assigned in a later cell (the one that builds
# the DataFrame from the search terms), so under Restart & Run All this cell
# raises NameError unless it is moved below that cell.
df.to_csv('../tmp/queries.csv')

In [4]:
# Word2Vec-based query rewriter, pointed at the model files under
# secure.MODEL_PATH_BASE.
# NOTE(review): create=False presumably loads an existing model instead of
# training a new one -- confirm in searchbetter.rewriter.
model_path = secure.MODEL_PATH_BASE+'word2vec/word2vec'
w2v_rewriter = rewriter.Word2VecRewriter(model_path, create=False)

# Create a search engine that searches over all edX courses.
# Under the hood, this uses Python's Whoosh library to index
# the course data stored in a CSV and then run searches against it.
dataset_path = secure.DATASET_PATH_BASE+'Master CourseListings - edX.csv'
index_path = secure.INDEX_PATH_BASE+'edx'
edx_engine = search.EdXSearchEngine(dataset_path, index_path, create=False)

# comparison testing
# term_stats() walks this list in order, and the DataFrame built from its
# rows labels the count columns "control", "wiki", "word2vec" -- keep the
# order here in sync with those column names.

rewriters = [
    rewriter.ControlRewriter(),
    rewriter.WikipediaRewriter(),
    w2v_rewriter
]

In [5]:
def term_stats(term):
    """Build one results-table row for a search term.

    The row starts with ``term`` itself, followed by the hit count obtained
    with each entry of the module-level ``rewriters`` list, in list order.
    """
    row = [term]
    for rw in rewriters:
        row.append(num_results(term, rw))
    return row
    
    
def num_results(term, rw):
    """Count the hits the edX engine returns for ``term`` under rewriter ``rw``.

    Side effect: ``rw`` is installed on the shared ``edx_engine`` through
    ``set_rewriter`` and is left in place after the call.
    """
    edx_engine.set_rewriter(rw)
    return len(edx_engine.search(term))


# Read the benchmark search terms, one per line, dropping the trailing
# newline of each line.
with open('../test/test-search-terms/generic.txt', 'r') as f:
    terms = [line.rstrip('\n') for line in f]

# One row per term: the term followed by its hit count under each rewriter.
# The searches do not need the file handle, so they run after it is closed.
data = list(map(term_stats, terms))

df = pandas.DataFrame(data=data, columns=["term","control","wiki","word2vec"])

In [7]:
# Show the per-term hit counts (columns: term, control, wiki, word2vec).
df

Unnamed: 0,term,control,wiki,word2vec
0,Abbe number,0,3,3
1,Abscisic acid,0,0,0
2,Abscission,0,0,0
3,Absolute advantage,0,0,0
4,Absolute electrode potential,0,0,0
5,Absolute humidity,0,0,0
6,Absolute magnitude,0,0,0
7,Absolute motion,0,0,0
8,Absolute pressure,0,0,0
9,Absolute scale,0,1,0


In [None]:
# Plain-list copies of each rewriter's hit-count column (in DataFrame row
# order) for use as plot series.
control_hits, wiki_hits, w2v_hits = [
    list(df[column]) for column in ('control', 'wiki', 'word2vec')
]

In [None]:
# reference: https://plot.ly/python/reference/#scattergl

# Baseline trace: control hits plotted against themselves, i.e. the y = x
# reference line for "rewriting changed nothing".
traceControl = go.Scattergl(
    x = control_hits,
    y = control_hits,
    mode = 'lines',
    name = 'Control (no rewriting)',
    hoverinfo = 'text+name',
    line = dict(
        color = color_strings[0]
    )
)

# One list of traces per rewriter: hits after rewriting (y) against the
# control's hits for the same term (x).
wikiTraces = plots.plotSeriesWithRegression(
    control_hits, wiki_hits, name='Wikipedia Categories', color=color_strings[1])
# The Word2Vec series must be built from w2v_hits; passing wiki_hits here
# would just draw the Wikipedia data a second time under the Word2Vec label.
w2vTraces = plots.plotSeriesWithRegression(
    control_hits, w2v_hits, name='Word2Vec', color=color_strings[2])

plot = [traceControl] + w2vTraces + wikiTraces

layout = go.Layout(
    title='Effect of query rewriting on search engine hits (edX)',
    xaxis=dict(
        title='# hits before rewriting'
    ),
    yaxis=dict(
        title='# hits after rewriting'
    )
)

fig = go.Figure(data=plot, layout=layout)

# Plot and embed in ipython notebook!
py.iplot(fig)

In [None]:
# Average number of hits per rewriter, shown as a grouped bar chart.

# DataFrame column of each rewriter, paired by position with the label used
# for it on the chart's x axis.
rewriter_names = ['control', 'wiki', 'word2vec']
rewriter_fancy_names = [
    'Control (no rewriting)',
    'Wikipedia Categories',
    'Word2Vec',
]

# Hit counts per rewriter over every search term, and their means.
data_series = [df[name] for name in rewriter_names]
average_hits = [series.mean() for series in data_series]

# Same statistic, restricted to the terms where the control gives nothing.
df_where_no_hits = df[df['control'] == 0]
data_series_zero = [df_where_no_hits[name] for name in rewriter_names]
average_hits_zero = [series.mean() for series in data_series_zero]

# Two bars per rewriter: all terms vs. only the terms the control missed.
traceAllTerms = go.Bar(
    name='All terms',
    x=rewriter_fancy_names,
    y=average_hits,
)
traceJustMisses = go.Bar(
    name='Terms where no hits by default',
    x=rewriter_fancy_names,
    y=average_hits_zero,
)
traces = [traceAllTerms, traceJustMisses]

layout = go.Layout(
    title='Average hits per rewriter (edX)',
    barmode='group',
    xaxis=dict(title='Query rewriter'),
    yaxis=dict(title='Average # hits'),
)

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)

In [None]:
# TODO: summarize it all into a DataFrame:
#   - each row = a rewriter
#   - each column = a condition (all terms; terms with no hits by default)
#   - each entry = mean number of hits



In [8]:
# Summary over every search term, via summary_of_frame from the
# analysis.stats module imported at the top of the notebook.
# NOTE(review): the recorded run of this cell failed with AttributeError
# because `stats` was bound to a function rather than that module -- check
# for another name shadowing `stats` in the kernel.
stats.summary_of_frame(df)

AttributeError: 'function' object has no attribute 'summary_of_frame'

In [None]:
# Terms for which the control search returned zero hits.
# This is the same filter as df_where_no_hits in the bar-chart cell.
misses_df = df[df['control'] == 0]

In [None]:
# Summary restricted to the terms the control search missed (misses_df).
stats.summary_of_frame(misses_df)

In [None]:
# Keep the terms that got at least one hit from any of the three rewriters.
non_failures = df[(df[['control', 'wiki', 'word2vec']] > 0).any(axis=1)]

In [None]:
# Summary restricted to terms where at least one rewriter returned a hit
# (non_failures).
stats.summary_of_frame(non_failures)

In [None]:
# Of the non-failures, the terms where the control found nothing -- i.e.
# terms for which only wiki and/or word2vec rewriting produced hits.
non_failures_zero = non_failures[non_failures['control'] == 0]

In [None]:
# Summary restricted to terms the control missed but some rewriter recovered
# (non_failures_zero).
stats.summary_of_frame(non_failures_zero)

In [None]:
# TODO add error bars
# TODO try better queries maybe??