In [3]:
import pymongo

In [40]:
import pandas as pd

def group_by(df, bycols, agg_map):
    """
    Group a DataFrame and compute one flat result column per (column, aggregate) pair.

    Each pair is aggregated separately and the per-pair results are inner-joined
    on the group-by column(s), so the same source column may appear in several
    pairs (e.g. both "max" and "mean" of one column).

    @param df:      DataFrame
                        Input data; it is not modified
    @param bycols:  column label, or list/tuple of column labels
                        Column(s) to group by. Anything that is not a list or tuple
                        is treated as a single column label
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean")]
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed. Holds the
                        group-by column(s) followed by one column per pair, named
                        "<aggregate>(<column>)", in the order the pairs were given
    @raise ValueError: if agg_map contains no pairs
    """
    # Normalise the grouping key(s) to a list so they can be concatenated with
    # the value column below. Checking for list/tuple (rather than for str) also
    # covers unicode labels on Python 2 and non-string labels such as ints.
    if isinstance(bycols, (list, tuple)):
        bycols = list(bycols)
    else:
        bycols = [bycols]

    if isinstance(agg_map, dict):
        agg_map = agg_map.items()
    # Materialise once: lets us detect "no aggregates" up front and accepts
    # one-shot iterables (dict views, generators).
    agg_pairs = list(agg_map)
    if not agg_pairs:
        raise ValueError("agg_map must contain at least one (column, aggregate) pair")

    merged = None
    for col, agg in agg_pairs:
        # Callables are labelled by their function name; string aggregates
        # ("max", "mean", ...) have no __name__ and are used verbatim.
        label = "%s(%s)" % (getattr(agg, "__name__", agg), col)
        grp = (df[bycols + [col]]
               .groupby(bycols)
               .agg(agg)
               .reset_index()
               .rename(columns={col: label}))
        if merged is None:
            merged = grp
        else:
            merged = pd.merge(merged, grp, on=bycols, how="inner")
    return merged

In [4]:
# Open a client with pymongo's default connection settings and select the
# "metrics" database; the bare name on the last line displays its repr.
client = pymongo.MongoClient()
db = client["metrics"]
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics')

In [5]:
# Collection names in the metrics database; they are looked up by position
# when the collection handles are created.
collections = [
    "CB_TAGGING_VD_FEAT_SELECTION",
    "SC_TAGGING_VD_FEAT_SELECTION",
]

In [47]:
# Handles for the first and second collection listed in `collections`.
cb, sc = db[collections[0]], db[collections[1]]
cb, sc

(Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics'), u'CB_TAGGING_VD_FEAT_SELECTION'),
 Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics'), u'SC_TAGGING_VD_FEAT_SELECTION'))

# Query Top Features

In [29]:
from pprint import pprint
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)

# Aggregation pipeline: flatten the nested metric/parameter fields, keep only
# documents that have a micro F1 score, and order them best-first.
feats_pipeline = [
    # Stage 1: expose the nested fields under flat names and count the extractors.
    {"$project": {
        "weighted_f1_score": "$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
        "micro_f1_score":    "$MICRO_F1.f1_score",
        "window_size":       "$parameters.window_size",
        "feats":             "$parameters.extractors",
        "count":             {"$size": "$parameters.extractors"},
        "_id":               0,
    }},
    # Stage 2: drop documents without a micro F1 score.
    # Optional extra filters (disabled):
    #   "count":       {"$eq": 1}    -> how many feats
    #   "window_size": {"$eq": 13}   -> window width
    {"$match": {
        "micro_f1_score": {"$exists": True},
    }},
    # Stage 3: highest micro F1 first.
    # Alternative sort keys (disabled): "weighted_f1_score", "asof", "count".
    {"$sort": {
        "micro_f1_score": -1,
    }},
]

# Run the pipeline against the first collection and collect every result document.
rows = list(cb.aggregate(feats_pipeline))

In [30]:
# Load the query results into a DataFrame ordered by micro F1, best first.
df = pd.DataFrame(rows)
df = df.sort_values(by="micro_f1_score", ascending=False)
df

Unnamed: 0,count,feats,micro_f1_score,weighted_f1_score,window_size
0,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.828318,0.822510,11
1,6,"[fn_pos_wd_feats_stemmed[offset:6], fn_pos_ngr...",0.827998,0.821827,13
2,6,"[fn_pos_wd_feats_stemmed[offset:4], fn_pos_ngr...",0.827891,0.822128,9
3,6,"[fn_pos_wd_feats_stemmed[offset:4], fn_pos_ngr...",0.827796,0.821831,9
4,5,"[fn_pos_wd_feats_stemmed[offset:4], fn_pos_ngr...",0.827714,0.821858,9
5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.827713,0.821803,11
6,6,"[fn_pos_wd_feats_stemmed[offset:6], fn_pos_ngr...",0.827645,0.821416,13
7,5,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.827558,0.821638,11
8,6,"[fn_pos_wd_feats_stemmed[offset:4], fn_pos_ngr...",0.827528,0.821848,9
9,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.827457,0.821547,11


In [33]:
# Print the first (best-scoring) row, then display its feature list on its own
# so the list is shown in full rather than truncated.
pprint(df.iloc[0])
df["feats"].iloc[0]

count                                                                6
feats                [fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...
micro_f1_score                                                0.828318
weighted_f1_score                                              0.82251
window_size                                                         11
Name: 0, dtype: object


[u'fn_pos_wd_feats_stemmed[offset:5]',
 u'fn_pos_ngram_feat_stemmed[ngram_size:3 offset:5]',
 u'fn_bow_ngram_feat[ngram_size:1 offset:5]',
 u'fn_pos_ngram_feat_stemmed[ngram_size:2 offset:5]',
 u'extract_brown_cluster',
 u'extract_dependency_relation']

In [54]:
# Runs that used exactly one feature extractor with a window size of 11.
df_top_individual_feats = df.loc[df["count"].eq(1) & df["window_size"].eq(11)]
df_top_individual_feats["feats,count,micro_f1_score,window_size".split(",")]

Unnamed: 0,feats,micro_f1_score
210,[fn_pos_wd_feats_stemmed[offset:5]],0.817206
248,[fn_pos_wd_feats[offset:5]],0.811229
446,[fn_pos_ngram_feat_stemmed[ngram_size:2 offset...,0.779349
500,[fn_pos_ngram_feat[ngram_size:2 offset:5]],0.762352
585,[fn_pos_ngram_feat_stemmed[ngram_size:3 offset...,0.717195
637,[fn_pos_ngram_feat[ngram_size:3 offset:5]],0.69289
692,[fn_bow_ngram_feat[ngram_size:2 offset:5]],0.568969
700,[extract_dependency_relation],0.565484
705,[fn_bow_ngram_feat[ngram_size:1 offset:5]],0.558441
707,[fn_bow_ngram_feat[ngram_size:3 offset:5]],0.527211


In [57]:
# Pair each single-feature run's extractor list with its micro F1 rounded to
# 4 decimal places. A list is built explicitly (instead of a bare zip) so the
# pairs are still displayed on Python 3, where zip() returns a lazy iterator.
[(feats, round(score, 4))
 for feats, score in zip(df_top_individual_feats["feats"].values,
                         df_top_individual_feats["micro_f1_score"].values)]

[([u'fn_pos_wd_feats_stemmed[offset:5]'], 0.8172),
 ([u'fn_pos_wd_feats[offset:5]'], 0.8112),
 ([u'fn_pos_ngram_feat_stemmed[ngram_size:2 offset:5]'], 0.7793),
 ([u'fn_pos_ngram_feat[ngram_size:2 offset:5]'], 0.7624),
 ([u'fn_pos_ngram_feat_stemmed[ngram_size:3 offset:5]'], 0.7172),
 ([u'fn_pos_ngram_feat[ngram_size:3 offset:5]'], 0.6929),
 ([u'fn_bow_ngram_feat[ngram_size:2 offset:5]'], 0.569),
 ([u'extract_dependency_relation'], 0.5655),
 ([u'fn_bow_ngram_feat[ngram_size:1 offset:5]'], 0.5584),
 ([u'fn_bow_ngram_feat[ngram_size:3 offset:5]'], 0.5272),
 ([u'extract_brown_cluster'], 0.3596),
 ([u'fn_pos_POS_feats[offset:5]'], 0.2407),
 ([u'fn_bow_POS_feats[offset:5]'], 0.0601)]

## Performance By Window Size

In [45]:
# Max / mean / median / count of micro F1 for each window size.
group_by(
    df,
    bycols=["window_size"],
    agg_map=[("micro_f1_score", agg) for agg in ("max", "mean", "median", "count")],
).sort_values("window_size")

Unnamed: 0,window_size,max(micro_f1_score),mean(micro_f1_score),median(micro_f1_score),count(micro_f1_score)
0,1,0.709962,0.601661,0.703455,54
1,3,0.777346,0.714389,0.756341,126
2,5,0.812873,0.754918,0.789516,126
3,7,0.825479,0.735753,0.800847,161
4,9,0.827891,0.735933,0.804666,112
5,11,0.828318,0.770595,0.82206,63
6,13,0.827998,0.768894,0.821435,63
7,15,0.825786,0.765408,0.819748,63


In [46]:
# Max / mean / median / count of micro F1 for each number of feature extractors.
group_by(
    df,
    bycols=["count"],
    agg_map=[("micro_f1_score", agg) for agg in ("max", "mean", "median", "count")],
).sort_values("count")

Unnamed: 0,count,max(micro_f1_score),mean(micro_f1_score),median(micro_f1_score),count(micro_f1_score)
0,1,0.817206,0.566743,0.647602,202
1,2,0.823682,0.782025,0.798293,146
2,3,0.826363,0.793096,0.80362,131
3,4,0.827305,0.797654,0.810258,109
4,5,0.827714,0.800303,0.812181,96
5,6,0.828318,0.802202,0.812783,84
