In [1]:
import sys
import json
import pandas as pd
sys.path.append('../')
from pandarec import Recipe, Recommender
from pandarec.strategies import NameSearch, FuzzySearchName, FuzzySearchDescription, IndexSearch, SemanticSearch, OpenAIEmbeddings
import pandas as pd

In [17]:
# Load the evaluation queries together with the expected recipes for each
# recipe source; the file is ';'-separated and column names are given here.
# NOTE(review): with `names=` and no `header=` pandas treats the first line of
# the file as data — confirm test_data.csv has no header row.
df = pd.read_csv(
    'test_data.csv',
    sep=';',
    names=["name", "query", "docstring_correct", "cookbook_correct", "snippets_correct"],
)

In [18]:
# Display the evaluation table that was just loaded.
df

Unnamed: 0,name,query,docstring_correct,cookbook_correct,snippets_correct
0,read file,read a file to a pandas dataframe,,,1241.0
1,head,show the first or last 5 rows,"head,tail","head,tail",89.0
2,columns,show the column names,columns,columns,
3,drop,drop unnecessary columns,drop,drop,24.0
4,len,get the length of the dataframe,,,
5,query,show rows that meet a condition,,,51.0
6,iloc,get a subset of the dataframe,iloc,select rows by integer location,1920.0
7,dtypes,show the types of the columns,dtypes,dtypes,
8,select dtypes,only show certain data types,,,
9,insert,insert a column in the specified position,insert,,


In [5]:
def _load_recipes(path, key=None):
    """Read a recipes JSON file and build one ``Recipe`` per entry.

    Parameters
    ----------
    path : str
        Location of the JSON file.
    key : str, optional
        When given, the list of entries is taken from ``payload[key]``
        instead of from the top-level JSON value.

    Returns
    -------
    list
        Objects created with ``Recipe.from_dict``, in file order.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, encoding="utf-8") as f:
        payload = json.load(f)
    entries = payload if key is None else payload[key]
    return [Recipe.from_dict(entry) for entry in entries]


# Recipe collections keyed by the source they were built from. The cookbook
# file nests its list under a "recipes" key; the other two are plain lists.
recipes_dict = {
    "docstring": _load_recipes('../recipes/from_docstrings/recipes.json'),
    "cookbook": _load_recipes('../recipes/from_cookbook/recipes.json', key="recipes"),
    "snippets": _load_recipes('../recipes/from_snippets/recipes.json'),
}

# Alias kept so code that refers to `recipes` keeps working.
recipes = recipes_dict

In [6]:
# Read the API key from a local file. The context manager closes the handle,
# and strip() removes surrounding whitespace (e.g. the trailing newline most
# editors append) so it does not become part of the key.
with open("../examples/api_key.txt", "r") as key_file:
    api_key = key_file.read().strip()

In [7]:
import openai
# Set the module-level API key on the openai package to the key read above.
openai.api_key = api_key

In [10]:
# Build one list of recommenders per recipe source, one recommender per search
# strategy. The loop names describe what they hold (a source name and its
# recipe list) and do not collide with the `strategy_name()` helper.
recommenders_dict = {}
for source_name, source_recipes in recipes.items():
    print(source_name)
    # Factories (instead of ready-made instances) keep each strategy's
    # construction immediately before its Recommender, in this fixed order.
    strategy_factories = [
        NameSearch,
        FuzzySearchName,
        FuzzySearchDescription,
        lambda: IndexSearch(source_recipes),
        lambda: SemanticSearch(source_recipes),
        lambda: OpenAIEmbeddings(source_recipes),
    ]
    recommenders_dict[source_name] = [
        Recommender(source_recipes, df, make_strategy())
        for make_strategy in strategy_factories
    ]

docstring
cookbook
snippets


In [11]:
def strategy_name(recommender):
    """Return the class name of the strategy attached to ``recommender``."""
    strategy = recommender.strategy
    return strategy.__class__.__name__

In [19]:
def get_recommender_result(recommender, query, n=5):
    """Run ``query`` through ``recommender`` and return the top recipe names.

    Parameters
    ----------
    recommender
        Object exposing ``set_search``, ``recommend`` and ``show_results``.
    query : str
        Search text handed to ``recommender.set_search``.
    n : int, default 5
        Number of results requested from ``recommender.show_results``.

    Returns
    -------
    list
        ``result.recipe.name`` for every result returned by
        ``show_results(n)``, in the order they were returned.
    """
    recommender.set_search(query)
    recommender.recommend()
    return [hit.recipe.name for hit in recommender.show_results(n)]

# Add one column per (recipe source, strategy) holding, for every query, the
# list of recipe names that recommender returned.
for name, recommenders in recommenders_dict.items():
    for recommender in recommenders:
        column = name + "_" + strategy_name(recommender)
        print(column)
        df[column] = df['query'].apply(lambda x: get_recommender_result(recommender, x))

docstring_NameSearch
docstring_FuzzySearchName
docstring_FuzzySearchDescription
docstring_IndexSearch
docstring_SemanticSearch
docstring_OpenAIEmbeddings
cookbook_NameSearch
cookbook_FuzzySearchName
cookbook_FuzzySearchDescription
cookbook_IndexSearch
cookbook_SemanticSearch
cookbook_OpenAIEmbeddings
snippets_NameSearch
snippets_FuzzySearchName
snippets_FuzzySearchDescription
snippets_IndexSearch
snippets_SemanticSearch
snippets_OpenAIEmbeddings


In [21]:
import math

def ndcg(correct, result, n=5):
    """Binary-relevance nDCG@n of ``result`` against the labels in ``correct``.

    Parameters
    ----------
    correct : str or missing value
        Comma-separated names of the relevant items. Whitespace around each
        name is ignored, and repeated or empty names are dropped.
    result : sequence
        Ranked item names, best first.
    n : int, default 5
        Only the first ``n`` entries of ``result`` are scored.

    Returns
    -------
    float or pd.NA
        DCG of the top ``n`` results divided by the DCG of an ideal ranking,
        always within [0, 1]. ``pd.NA`` when ``correct`` is missing or when
        the ideal DCG is zero (no usable labels, or ``n`` < 1).
    """
    if pd.isnull(correct):
        return pd.NA
    relevant = {label.strip() for label in correct.split(',')}
    relevant.discard('')

    score = 0.0
    credited = set()
    for rank, item in enumerate(result[:max(n, 0)]):
        # Credit each relevant item once; a repeated hit must not push the
        # score above the ideal ranking.
        if item in relevant and item not in credited:
            credited.add(item)
            score += 1 / math.log2(rank + 2)

    # Ideal ranking: every relevant item placed at the top, capped at n.
    ideal_score = sum(1 / math.log2(rank + 2) for rank in range(min(n, len(relevant))))
    if ideal_score == 0:
        return pd.NA
    return score / ideal_score

In [26]:
# Write the evaluation table to disk (';'-separated, index column omitted).
# NOTE(review): read top to bottom, this cell comes before the cells that add
# the *_ndcg columns, so a fresh Run All would save a file without scores
# despite the "scored" filename — confirm the intended cell order.
df.to_csv('test_data_results_scored.csv', sep=';', index=False)

In [35]:
# Score every (recipe source, strategy) result column against that source's
# expected answers and store the value in a matching "<column>_ndcg" column.
for name, recommenders in recommenders_dict.items():
    expected_column = name + '_correct'
    for recommender in recommenders:
        result_column = name + "_" + strategy_name(recommender)
        df[result_column + '_ndcg'] = df.apply(
            lambda x: ndcg(x[expected_column], x[result_column]), axis=1
        )

In [45]:
def get_recipe_name(recipeid):
    """Look up the name of the snippet recipe whose ``id`` equals ``recipeid``.

    Scans ``recipes_dict["snippets"]`` in order and returns the name of the
    first match, or ``None`` when no recipe has that id.
    """
    matching_names = (
        candidate.name
        for candidate in recipes_dict["snippets"]
        if candidate.id == recipeid
    )
    return next(matching_names, None)
        

def ndcg_snippets(correct, result, n=5):
    """Binary-relevance nDCG@n where ``correct`` lists recipe ids, not names.

    Parameters
    ----------
    correct : str or missing value
        Comma-separated integer recipe ids. Each id is resolved to a recipe
        name with ``get_recipe_name``; empty entries are skipped.
    result : sequence
        Ranked recipe names, best first.
    n : int, default 5
        Only the first ``n`` entries of ``result`` are scored.

    Returns
    -------
    float or pd.NA
        DCG of the top ``n`` results divided by the DCG of an ideal ranking,
        always within [0, 1]. ``pd.NA`` when ``correct`` is missing or when
        the ideal DCG is zero (no usable ids, or ``n`` < 1).
    """
    if pd.isnull(correct):
        return pd.NA
    # An id that get_recipe_name cannot resolve yields None; it stays in the
    # set so an unresolved label still counts towards the ideal score.
    relevant = {
        get_recipe_name(int(token))
        for token in correct.split(',')
        if token.strip()
    }

    score = 0.0
    credited = set()
    for rank, item in enumerate(result[:max(n, 0)]):
        # Credit each relevant recipe once; a repeated hit must not push the
        # score above the ideal ranking.
        if item in relevant and item not in credited:
            credited.add(item)
            score += 1 / math.log((rank + 2), 2)

    # Ideal ranking: every relevant recipe placed at the top, capped at n.
    ideal_score = sum(1 / math.log((rank + 2), 2) for rank in range(min(n, len(relevant))))
    if ideal_score == 0:
        return pd.NA
    return score / ideal_score

# Score the snippets result columns with ndcg_snippets, which resolves the
# ids stored in "snippets_correct" to recipe names before comparing.
for recommender in recommenders_dict["snippets"]:
    result_column = "snippets_" + strategy_name(recommender)
    df[result_column + '_ndcg'] = df.apply(
        lambda x: ndcg_snippets(x["snippets_correct"], x[result_column]), axis=1
    )

In [46]:

# Print the mean nDCG of every (recipe source, strategy) pair, label first.
for name, recommenders in recommenders_dict.items():
    for recommender in recommenders:
        label = name + "_" + strategy_name(recommender)
        print(label)
        print(df[label + '_ndcg'].mean())

# OpenAIEmbeddings
# 0.5468005259704043

docstring_NameSearch
0.0
docstring_FuzzySearchName
0.40588312722234493
docstring_FuzzySearchDescription
0.26632038247967615
docstring_IndexSearch
0.22197028270288505
docstring_SemanticSearch
0.6441994167487021
docstring_OpenAIEmbeddings
0.570574461882161
cookbook_NameSearch
0.0
cookbook_FuzzySearchName
0.4460541379407055
cookbook_FuzzySearchDescription
0.31153535871478044
cookbook_IndexSearch
0.4001964340658472
cookbook_SemanticSearch
0.7060890392971821
cookbook_OpenAIEmbeddings
0.4375
snippets_NameSearch
0.0
snippets_FuzzySearchName
0.5342322812687954
snippets_FuzzySearchDescription
0.4818513940440455
snippets_IndexSearch
0.2928133535679022
snippets_SemanticSearch
0.798655615880656
snippets_OpenAIEmbeddings
0.8767653203507382


In [47]:
recipe_names = ["docstring", "cookbook", "snippets"]
strategy_names = ["NameSearch", "FuzzySearchName", "FuzzySearchDescription", "IndexSearch", "SemanticSearch", "OpenAIEmbeddings"]
# Mean nDCG for every (recipe source, strategy) pair, flattened with the
# recipe source as the outer level. A comprehension is used so that no
# module-level loop variable overwrites the `strategy_name()` helper, which
# would break any scoring cell that is re-run afterwards.
ndcg_results = [
    df[recipe_name + "_" + strategy_label + '_ndcg'].mean()
    for recipe_name in recipe_names
    for strategy_label in strategy_names
]

In [50]:
# Reshape the flat list of means into one row per recipe source. The row width
# is derived from strategy_names so the table stays correct when a strategy
# or recipe source is added or removed.
n_strategies = len(strategy_names)
result_df = pd.DataFrame(
    columns=strategy_names,
    index=recipe_names,
    data=[
        ndcg_results[row * n_strategies:(row + 1) * n_strategies]
        for row in range(len(recipe_names))
    ],
)

In [56]:
# Print the table as LaTeX, transposed so each strategy is a row and each
# recipe source a column, with values formatted to two decimals.
print(result_df.T.to_latex(float_format="%.2f"))

\begin{tabular}{lrrr}
\toprule
 & docstring & cookbook & snippets \\
\midrule
NameSearch & 0.00 & 0.00 & 0.00 \\
FuzzySearchName & 0.41 & 0.45 & 0.53 \\
FuzzySearchDescription & 0.27 & 0.31 & 0.48 \\
IndexSearch & 0.22 & 0.40 & 0.29 \\
SemanticSearch & 0.64 & 0.71 & 0.80 \\
OpenAIEmbeddings & 0.57 & 0.44 & 0.88 \\
\bottomrule
\end{tabular}



In [62]:
# Per recipe source: how many queries have an expected answer at all.
not_nan_count = {
    recipe_name: df[recipe_name + "_correct"].notna().sum()
    for recipe_name in recipe_names
}
print(not_nan_count)

{'docstring': 23, 'cookbook': 8, 'snippets': 13}


In [63]:
# Per recipe source: average number of comma-separated expected answers per
# labelled query (total number of answers divided by the labelled-query count).
total_count = {}
for recipe_name in recipe_names:
    labelled = df[recipe_name + "_correct"].dropna()
    answers_total = labelled.apply(lambda x: len(x.split(','))).sum()
    total_count[recipe_name] = answers_total / not_nan_count[recipe_name]
print(total_count)

{'docstring': 1.0869565217391304, 'cookbook': 1.125, 'snippets': 1.7692307692307692}


In [39]:
# Display the evaluation table, now including the result and *_ndcg columns.
df

Unnamed: 0,name,query,docstring_correct,cookbook_correct,snippets_correct,docstring_NameSearch,docstring_FuzzySearchName,docstring_FuzzySearchDescription,docstring_IndexSearch,docstring_SemanticSearch,...,cookbook_FuzzySearchDescription_ndcg,cookbook_IndexSearch_ndcg,cookbook_SemanticSearch_ndcg,cookbook_OpenAIEmbeddings_ndcg,snippets_NameSearch_ndcg,snippets_FuzzySearchName_ndcg,snippets_FuzzySearchDescription_ndcg,snippets_IndexSearch_ndcg,snippets_SemanticSearch_ndcg,snippets_OpenAIEmbeddings_ndcg
0,read file,read a file to a pandas dataframe,,,1241.0,[],"[to_clipboard, to_csv, to_dict, to_excel, to_f...","[backfill, shift, tz_localize, abs, agg]","[to_latex, describe, info, to_csv, to_excel]","[to_hdf, to_markdown, to_csv, to_clipboard, to...",...,,,,,0.0,0.0,0.0,0.0,0.0,0.0
1,head,show the first or last 5 rows,"head,tail","head,tail",89.0,[],"[first, last, combine_first, first_valid_index...","[drop_duplicates, fillna, first, head, tail]","[asof, first, groupby, head, hist]","[tail, head, to_string, last, truncate]",...,1.0,0.570642,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,columns,show the column names,columns,columns,,[],"[cov, rename, cummin, keys, min]","[add_prefix, add_suffix, append, astype, at_time]","[compare, groupby, info, plot, to_string]","[to_latex, to_string, add_prefix, add_suffix, ...",...,0.430677,1.0,1.0,1.0,,,,,,
3,drop,drop unnecessary columns,drop,drop,24.0,[],"[drop, drop_duplicates, dropna, cummin, ne]","[asof, astype, autocorr, compare, convert_dtypes]","[drop, resample, reset_index, add_prefix, add_...","[droplevel, dropna, drop, reset_index, truncate]",...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,len,get the length of the dataframe,,,,[],"[rename, to_frame, head, asfreq, mean]","[add_prefix, add_suffix, asfreq, asof, astype]","[all, between_time, corr, cov, describe]","[tail, head, info, count, nunique]",...,,,,,,,,,,
5,query,show rows that meet a condition,,,51.0,[],"[rpow, at_time, last, count, round]","[loc, isin, add, astype, filter]","[plot, compare, groupby, info, loc]","[filter, head, isin, all, any]",...,,,,,0.0,0.0,0.0,0.0,0.0,0.0
6,iloc,get a subset of the dataframe,iloc,select rows by integer location,1920.0,[],"[rename, to_frame, head, rsub, asfreq]","[pct_change, align, any, append, apply]","[all, asof, between_time, describe, explode]","[filter, head, xs, truncate, loc]",...,0.0,0.0,0.386853,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,dtypes,show the types of the columns,dtypes,dtypes,,[],"[asof, astype, cummin, value_counts, convert_d...","[append, to_csv, to_list, abs, agg]","[groupby, rank, align, astype, compare]","[to_latex, convert_dtypes, plot, to_string, info]",...,0.0,0.0,0.63093,0.5,,,,,,
8,select dtypes,only show certain data types,,,,[],"[astype, tail, keys, convert_dtypes, align]","[append, dot, info, abs, add]","[groupby, rank, truncate, abs, add]","[convert_dtypes, to_string, infer_objects, to_...",...,,,,,,,,,,
9,insert,insert a column in the specified position,insert,,,[],"[cummin, sort_values, set_axis, to_string, kur...","[mad, any, sort_index, between_time, drop]","[ewm, loc, plot, take, to_latex]","[searchsorted, add_prefix, to_excel, add_suffi...",...,,,,,,,,,,
