In [1]:
import pandas as pd
import os
import wiggum as wg
import numpy as np

We'll first load in some data, this has both regression and rate type trends, since this file has a weird index, we'll load it in as dataframe first

In [2]:
hit_search_rate = pd.read_csv('../data/state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI.csv',index_col='Unnamed: 0')
hit_search_rate.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,search_conducted_rate,contraband_found_false,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,0.032258,31.0,,,1.0,,,31.0
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,0.666667,2.0,1.0,0.333333,1.0,1.0,0.5,3.0
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,0.066667,15.0,,,1.0,,,15.0
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,0.111111,34.0,2.0,0.055556,2.0,2.0,0.5,36.0
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,0.04918,61.0,,,3.0,,,61.0


We can now use the LabeledDataFrame with that DataFrame to create the object

In [3]:
labeled_df_setup = wg.LabeledDataFrame(hit_search_rate)

Next, we can infer the variable types

In [4]:
labeled_df_setup.infer_var_types()
labeled_df_setup.meta_df.head()

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,,,
year,int64,ordinal,,,
driver_gender,object,binary,,,
driver_race,object,categorical,,,
decriminalization,int64,binary,,,


For this, we'll manually set these, but in the vizualization tool you can also set these with drop down menus

In [5]:
roles = {'state':['trend','groupby'], 'year':'trend', 'driver_gender':['trend','groupby'], 
         'driver_race':['trend','groupby'],
       'decriminalization':['groupby'], 'medical':['groupby'],
         'recreational':['groupby'], 'no_reforms':['groupby'],
       'search_conducted_false':'ignore', 'search_conducted_true':'ignore',
       'search_conducted_rate':'trend', 'contraband_found_false':'ignore',
       'contraband_found_true':'ignore', 'contraband_found_rate':'trend', 'hit_false':'ignore',
       'hit_true':'ignore', 'hit_rate':'trend', 'num_stops':'trend'}
is_count = {'state':False, 'year':False, 'driver_gender':False, 'driver_race':False,
       'decriminalization':False, 'medical':False, 'recreational':False, 'no_reforms':False,
       'search_conducted_false':True, 'search_conducted_true':True,
       'search_conducted_rate':False, 'contraband_found_false':True,
       'contraband_found_true':True, 'contraband_found_rate':False, 'hit_false':True,
       'hit_true':True, 'hit_rate':False, 'num_stops':True}
count_list = ['search_conducted_false', 'search_conducted_true','contraband_found_false',
       'contraband_found_true', 'hit_false',
       'hit_true', 'num_stops']
var_types = {'driver_gender':'categorical','decriminalization':'categorical', 
             'medical':'categorical', 'recreational':'categorical', 'no_reforms':False,}
weighting = {'hit_rate':'search_conducted_true','search_conducted_rate':'num_stops',
             'contraband_found_rate':'num_stops'}

We'll set those next.  Above gives examples of two ways that we can specify the count values to pass them to the set_counts function, but we'll only call it once below. 

In [6]:
labeled_df_setup.set_counts(count_list)
labeled_df_setup.set_roles(roles)
labeled_df_setup.set_weighting_vars(weighting)
labeled_df_setup.set_var_types(var_types)
labeled_df_setup.meta_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.meta_df['role'][k] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.meta_df['var_type'][k] = v


Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,trend,False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,categorical,[groupby],False,
medical,int64,categorical,[groupby],False,
recreational,int64,categorical,[groupby],False,
no_reforms,int64,False,[groupby],False,
search_conducted_false,float64,continuous,ignore,True,
search_conducted_true,float64,continuous,ignore,True,


Now, we've set this up, we can also save these configurations to load them in directly in the future

In [7]:
labeled_df_setup.to_csvs('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')

We can see what this does, using a bash magic

In [8]:
%%bash
cd ../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI
ls

df.csv
meta.csv
result_df.csv


it write the three DataFrames each out to their own .csv file in that directory. If that directory exists it will overwrite without warning, if not, also creates the directory. 

Now, we can can also load the data back

In [9]:
labeled_df = wg.LabeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,categorical,[groupby],False,
medical,int64,categorical,[groupby],False,
recreational,int64,categorical,[groupby],False,
no_reforms,int64,False,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [10]:
labeled_df.add_all_dpgmm(qual_thresh =.2)
labeled_df.df.head()



Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,hit_false,hit_true,hit_rate,num_stops,search_conducted_false_search_conducted_true_dpgmm,search_conducted_false_contraband_found_false_dpgmm,search_conducted_false_num_stops_dpgmm,search_conducted_true_contraband_found_false_dpgmm,search_conducted_true_num_stops_dpgmm,contraband_found_false_num_stops_dpgmm
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,...,1.0,,,31.0,11,0,0,5,5,0
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,...,1.0,1.0,0.5,3.0,11,0,0,5,5,0
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,...,1.0,,,15.0,11,0,0,5,5,0
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,...,2.0,2.0,0.5,36.0,11,0,0,5,5,0
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,...,3.0,,,61.0,11,0,0,5,5,0


In [11]:
labeled_df.meta_df

Unnamed: 0,dtype,var_type,role,isCount,weighting_var
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,categorical,[groupby],False,
medical,int64,categorical,[groupby],False,
recreational,int64,categorical,[groupby],False,
no_reforms,int64,False,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [12]:
labeled_df.add_quantile(['hit_rate','num_stops'])

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,hit_rate,num_stops,search_conducted_false_search_conducted_true_dpgmm,search_conducted_false_contraband_found_false_dpgmm,search_conducted_false_num_stops_dpgmm,search_conducted_true_contraband_found_false_dpgmm,search_conducted_true_num_stops_dpgmm,contraband_found_false_num_stops_dpgmm,hit_ratequantiles,num_stopsquantiles
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,...,,31.0,11,0,0,5,5,0,high,low
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,...,0.500000,3.0,11,0,0,5,5,0,high,low
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,...,,15.0,11,0,0,5,5,0,high,low
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,...,0.500000,36.0,11,0,0,5,5,0,high,low
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,...,,61.0,11,0,0,5,5,0,high,low
5,AZ,2010,F,Black,0,1,0,0,224.0,8.0,...,,232.0,11,0,0,5,5,0,high,low
6,AZ,2010,F,Hispanic,0,1,0,0,557.0,33.0,...,0.030303,590.0,11,0,0,5,5,0,low,low
7,AZ,2010,F,Other,0,1,0,0,167.0,6.0,...,,173.0,11,0,0,5,5,0,high,low
8,AZ,2010,F,White,0,1,0,0,3145.0,92.0,...,0.076087,3237.0,11,0,0,5,5,0,low,mid
9,AZ,2010,M,Asian,0,1,0,0,97.0,1.0,...,,98.0,11,0,0,5,5,0,high,low


In [13]:
labeled_df.meta_df

Unnamed: 0,dtype,var_type,role,isCount,weighting_var
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,categorical,[groupby],False,
medical,int64,categorical,[groupby],False,
recreational,int64,categorical,[groupby],False,
no_reforms,int64,False,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


# Using Trends

Trend objects define their name, how to compute the trend and how to choose which variables, 

extension will allow that the var lists may be passed to reduce which ones are computed

In [14]:
corrobj = wg.All_Pearson()
corrobj.get_trend_vars(labeled_df)
corrobj.regression_vars

['year',
 'search_conducted_rate',
 'contraband_found_rate',
 'hit_rate',
 'num_stops']

In [15]:
rankobj = wg.Mean_Rank_Trend()
linreg_obj = wg.All_Linear_Trend()

# Computing Trends on a LabeledDataFrame

There are two ways, we can use default setting and pass the names of the trend type or a trend object

In [16]:
labeled_df.get_subgroup_trends_1lev(['pearson_corr'])
labeled_df.result_df.head()

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength
0,year,search_conducted_rate,-0.247018,0.247018,state,AZ,pearson_corr,-0.03903,0.03903
1,year,search_conducted_rate,-0.414566,0.414566,state,CO,pearson_corr,-0.03903,0.03903
2,year,search_conducted_rate,0.118238,0.118238,state,CT,pearson_corr,-0.03903,0.03903
3,year,search_conducted_rate,-0.199765,0.199765,state,IL,pearson_corr,-0.03903,0.03903
4,year,search_conducted_rate,-0.603026,0.603026,state,MA,pearson_corr,-0.03903,0.03903


Now we can use a list of objects and apply multiple trends

In [17]:
labeled_df.get_subgroup_trends_1lev([rankobj,linreg_obj])
labeled_df.result_df.sample(10)

  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_trend.get_trends(cur_grouping,'subgroup_trend')
  curgroup_corr = cur_tre

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength
1021,search_conducted_rate,driver_gender,"[F, M]",0.3058,state,CO,rank_trend,"[F, M]",0.2519
1066,search_conducted_rate,driver_gender,"[F, M]",0.0659,search_conducted_false_contraband_found_false_...,8,rank_trend,"[F, M]",0.2519
1704,num_stops,driver_gender,[M],,contraband_found_false_num_stops_dpgmm,9,rank_trend,"[F, M]",0.1164
560,search_conducted_rate,hit_rate,-0.33525,0.33525,search_conducted_false_contraband_found_false_...,10,pearson_corr,-0.0942726,0.094273
310,year,num_stops,0.115809,0.115809,state,MA,pearson_corr,-0.043965,0.043965
1177,search_conducted_rate,driver_race,"[Black, White, Hispanic]",0.2143,search_conducted_false_num_stops_dpgmm,10,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.2772
2175,year,num_stops,-5881.23,0.102666,state,WA,lin_reg,-1560.86,0.043965
2147,num_stops,state,"[SC, IL, AZ, WA, MD]",0.2756,search_conducted_true_num_stops_dpgmm,10,rank_trend,"[RI, VT, WI, CT, MD, MA, AZ, CO, IL, NC, WA, S...",0.3209
513,search_conducted_rate,hit_rate,-0.200206,0.200206,state,IL,pearson_corr,-0.0942726,0.094273
593,search_conducted_rate,hit_rate,0.239385,0.239385,search_conducted_true_num_stops_dpgmm,8,pearson_corr,-0.0942726,0.094273


These two methods give the same, the string based version allows for simple access to default setting but passing a trend object would allow for overriding defaults and creating more custom subests of trends.

We can see what types of trends were computed from `result_df`

In [18]:
pd.unique(labeled_df.result_df['trend_type'])

array(['pearson_corr', 'rank_trend', 'lin_reg'], dtype=object)

The object also stores the trend objects that have been applied, they can be used for mapping to get the distance functions that are appropriate for each trend

In [19]:
labeled_df.trend_list

[<wiggum.trends.All_Pearson at 0x7fa5f9bf8518>,
 <wiggum.trends.Mean_Rank_Trend at 0x7fa5f9bf8358>,
 <wiggum.trends.All_Linear_Trend at 0x7fa5f9bf8550>]

In [20]:
# labeled_df.result_df['distance'] = labeled_df.result_df.apply(dist_helper,axis=1)
labeled_df.add_distance()

labeled_df.result_df.sample(10)

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance
801,contraband_found_rate,hit_rate,-0.196262,0.196262,contraband_found_false_num_stops_dpgmm,1,pearson_corr,0.275397,0.275397,1.0
1444,hit_rate,driver_gender,"[F, M]",0.062,search_conducted_false_search_conducted_true_d...,4,rank_trend,"[F, M]",0.0433,-0.5
798,contraband_found_rate,hit_rate,1,1.0,search_conducted_true_num_stops_dpgmm,9,pearson_corr,0.275397,0.275397,0.0
1758,num_stops,driver_race,"[Other, Asian, Black, Hispanic, White]",0.0381,search_conducted_false_num_stops_dpgmm,0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,-0.5
43,year,search_conducted_rate,0.0843589,0.084359,search_conducted_false_contraband_found_false_...,3,pearson_corr,-0.0390297,0.03903,1.0
2623,num_stops,search_conducted_rate,0.0573308,118.624056,search_conducted_false_contraband_found_false_...,11,lin_reg,0.0184825,82.467302,0.024693
714,contraband_found_rate,hit_rate,0.621241,0.621241,state,AZ,pearson_corr,0.275397,0.275397,0.0
1448,hit_rate,driver_gender,"[F, M]",0.3577,search_conducted_false_search_conducted_true_d...,8,rank_trend,"[F, M]",0.0433,-0.5
1263,contraband_found_rate,driver_gender,"[F, M]",0.1283,search_conducted_false_contraband_found_false_...,8,rank_trend,"[F, M]",0.1797,-0.5
756,contraband_found_rate,hit_rate,0.517079,0.517079,search_conducted_false_contraband_found_false_...,2,pearson_corr,0.275397,0.275397,0.0


Each trend object has a trend_precompute dictionary as a property that stores the intermediate values (tables of the weighted rates for ranks and correlation matrices for pearson correlation, TODO: what do we need for linreg). These can be used in vizualization.

In [21]:
labeled_df.trend_list[0].trend_precompute

{'pearson_corr_agg_trend':                            year  search_conducted_rate  contraband_found_rate  \
 year                   1.000000              -0.039030               0.041129   
 search_conducted_rate -0.039030               1.000000               0.807915   
 contraband_found_rate  0.041129               0.807915               1.000000   
 hit_rate               0.316196              -0.094273               0.275397   
 num_stops             -0.043965              -0.065594              -0.073456   
 
                        hit_rate  num_stops  
 year                   0.316196  -0.043965  
 search_conducted_rate -0.094273  -0.065594  
 contraband_found_rate  0.275397  -0.073456  
 hit_rate               1.000000  -0.042102  
 num_stops             -0.042102   1.000000  ,
 'pearson_corr_subgroup_trend':                                               year  search_conducted_rate  \
 num_stopsquantiles                                                          
 high           

# Filtering

In [22]:
help(labeled_df.get_trend_rows)

Help on method get_trend_rows in module wiggum.ranking_processing:

get_trend_rows(feat1=None, feat2=None, group_feat=None, subgroup=None, trend_type=None) method of wiggum.labeled_dataframe.LabeledDataFrame instance
    return a row of result_df based on the specified values. returned rows
    meet provided criteria for all columns (and operator) and any one of the listed
    values for each column (or operator)
    
    Parameters
    -----------
    feat1 : str, list, or  None
        trend variable name or None to include all
    feat2 : str, list, or  None
        trend variable name or None to include all
    group_feat : str, list, or  None
        groupoby variable name or None to include all
    subgroup : str, list, or  None
        value of groupby_feat or or None to include all



So, we can use that function to filter and look at subsets of the trends based on the features, groupby, or subgroups

In [23]:
labeled_df.get_trend_rows(feat1='year',subgroup=['Black','Hispanic'])

12  total rows meet the criteria


Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance
16,year,search_conducted_rate,-0.0391739,0.039174,driver_race,Black,pearson_corr,-0.0390297,0.03903,0.0
17,year,search_conducted_rate,-0.177694,0.177694,driver_race,Hispanic,pearson_corr,-0.0390297,0.03903,0.0
118,year,contraband_found_rate,0.0408099,0.04081,driver_race,Black,pearson_corr,0.0411292,0.041129,0.0
119,year,contraband_found_rate,0.0425516,0.042552,driver_race,Hispanic,pearson_corr,0.0411292,0.041129,0.0
220,year,hit_rate,0.354374,0.354374,driver_race,Black,pearson_corr,0.316196,0.316196,0.0
221,year,hit_rate,0.329214,0.329214,driver_race,Hispanic,pearson_corr,0.316196,0.316196,0.0
322,year,num_stops,-0.169689,0.169689,driver_race,Black,pearson_corr,-0.043965,0.043965,0.0
323,year,num_stops,0.014732,0.014732,driver_race,Hispanic,pearson_corr,-0.043965,0.043965,1.0
2180,year,num_stops,-2262.72,0.169689,driver_race,Black,lin_reg,-1560.86,0.043965,0.000127
2181,year,num_stops,422.667,0.014732,driver_race,Hispanic,lin_reg,-1560.86,0.043965,0.998086


In [24]:
labeled_df.get_trend_rows(group_feat = 'driver_race',trend_type ='lin_reg' )

15  total rows meet the criteria


Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance
2179,year,num_stops,259.628,0.104919,driver_race,Asian,lin_reg,-1560.86,0.043965,0.99714
2180,year,num_stops,-2262.72,0.169689,driver_race,Black,lin_reg,-1560.86,0.043965,0.000127
2181,year,num_stops,422.667,0.014732,driver_race,Hispanic,lin_reg,-1560.86,0.043965,0.998086
2182,year,num_stops,91.5127,0.07518,driver_race,Other,lin_reg,-1560.86,0.043965,0.992636
2183,year,num_stops,-5591.73,0.097383,driver_race,White,lin_reg,-1560.86,0.043965,0.000294
2281,year,search_conducted_rate,1.24187,0.003783,driver_race,Asian,lin_reg,1.96425,0.004473,0.131795
2282,year,search_conducted_rate,1.24021,0.001744,driver_race,Black,lin_reg,1.96425,0.004473,0.132212
2283,year,search_conducted_rate,5.34096,0.017419,driver_race,Hispanic,lin_reg,1.96425,0.004473,0.181954
2284,year,search_conducted_rate,-7.23536,0.032087,driver_race,Other,lin_reg,1.96425,0.004473,0.612781
2285,year,search_conducted_rate,1.9543,0.003678,driver_race,White,lin_reg,1.96425,0.004473,0.001309


We can also filter based on SP detections with `

In [25]:
labeled_df.get_SP_rows(thresh=.2)

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2
2,year,search_conducted_rate,0.118238,0.118238,state,CT,pearson_corr,-0.0390297,0.039030,1.000000,True
8,year,search_conducted_rate,0.000440801,0.000441,state,SC,pearson_corr,-0.0390297,0.039030,1.000000,True
12,year,search_conducted_rate,0.0485728,0.048573,state,WI,pearson_corr,-0.0390297,0.039030,1.000000,True
13,year,search_conducted_rate,0.0304108,0.030411,driver_gender,F,pearson_corr,-0.0390297,0.039030,1.000000,True
18,year,search_conducted_rate,0.134696,0.134696,driver_race,Other,pearson_corr,-0.0390297,0.039030,1.000000,True
21,year,search_conducted_rate,0.0780123,0.078012,decriminalization,1,pearson_corr,-0.0390297,0.039030,1.000000,True
26,year,search_conducted_rate,0.0808758,0.080876,no_reforms,0,pearson_corr,-0.0390297,0.039030,1.000000,True
28,year,search_conducted_rate,0.0484918,0.048492,search_conducted_false_search_conducted_true_d...,0,pearson_corr,-0.0390297,0.039030,1.000000,True
29,year,search_conducted_rate,0.0945287,0.094529,search_conducted_false_search_conducted_true_d...,1,pearson_corr,-0.0390297,0.039030,1.000000,True
33,year,search_conducted_rate,0.299431,0.299431,search_conducted_false_search_conducted_true_d...,5,pearson_corr,-0.0390297,0.039030,1.000000,True


## Detection

Detection via `get_SP_rows` happens in two steps:
1. label the rows
2. filter by that column to return

Labeling the rows can happen in a number of ways too, the detection accepts a number of forms of input, custom detections can be built in many ways

In [26]:
help(labeled_df.label_SP_rows)

Help on method label_SP_rows in module wiggum.ranking_processing:

label_SP_rows(filter_thresh=None) method of wiggum.labeled_dataframe.LabeledDataFrame instance
    update the result_df with an additional colulmn indicateing rows with SP
    (or SP-like) as defined by sp_type
    
    Parameters
    -----------
    
    self : LabeledDataFrame
        must have values in result_df
    filter_thresh : dict or string
        dictionary of column label, threshold pairs or string name of a
        prespecified dictionary if dict, must include 'name' field (which
        will be used as the column name for storing the detections)



when filter_thresh is a dictionary, the filtering happens by taking the intersection of each row by the treshold prvided.  Some defaults are also built in accessible by string.

In [27]:
wg.trend_quality_sp

{'distance': 0.2,
 'agg_trend_strength': 0.15,
 'subgroup_trend_strength': 0.15,
 'name': 'default_qual_sp'}

Which can be applied with:

In [28]:
labeled_df.get_SP_rows('default_qual_sp')

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2,default_qual_sp
212,year,hit_rate,-0.255096,0.255096,state,SC,pearson_corr,0.316196,0.316196,1.0,True,True
215,year,hit_rate,-0.68403,0.68403,state,WA,pearson_corr,0.316196,0.316196,1.0,True,True
234,year,hit_rate,-0.92637,0.92637,search_conducted_false_search_conducted_true_d...,2,pearson_corr,0.316196,0.316196,1.0,True,True
242,year,hit_rate,-0.618712,0.618712,search_conducted_false_search_conducted_true_d...,10,pearson_corr,0.316196,0.316196,1.0,True,True
245,year,hit_rate,-0.262454,0.262454,search_conducted_false_contraband_found_false_...,1,pearson_corr,0.316196,0.316196,1.0,True,True
251,year,hit_rate,-0.557955,0.557955,search_conducted_false_contraband_found_false_...,7,pearson_corr,0.316196,0.316196,1.0,True,True
253,year,hit_rate,-1,1.0,search_conducted_false_contraband_found_false_...,9,pearson_corr,0.316196,0.316196,1.0,True,True
264,year,hit_rate,-0.618712,0.618712,search_conducted_false_num_stops_dpgmm,8,pearson_corr,0.316196,0.316196,1.0,True,True
265,year,hit_rate,-1,1.0,search_conducted_false_num_stops_dpgmm,9,pearson_corr,0.316196,0.316196,1.0,True,True
269,year,hit_rate,-0.347543,0.347543,search_conducted_true_contraband_found_false_d...,1,pearson_corr,0.316196,0.316196,1.0,True,True


In [29]:
wg.DEFAULT_SP_DEF

{'distance': 0.0, 'name': 'SP'}

Which can be applied with:

In [30]:
labeled_df.get_SP_rows('SP')

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2,default_qual_sp,SP
2,year,search_conducted_rate,0.118238,0.118238,state,CT,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
8,year,search_conducted_rate,0.000440801,0.000441,state,SC,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
12,year,search_conducted_rate,0.0485728,0.048573,state,WI,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
13,year,search_conducted_rate,0.0304108,0.030411,driver_gender,F,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
18,year,search_conducted_rate,0.134696,0.134696,driver_race,Other,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
21,year,search_conducted_rate,0.0780123,0.078012,decriminalization,1,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
26,year,search_conducted_rate,0.0808758,0.080876,no_reforms,0,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
28,year,search_conducted_rate,0.0484918,0.048492,search_conducted_false_search_conducted_true_d...,0,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
29,year,search_conducted_rate,0.0945287,0.094529,search_conducted_false_search_conducted_true_d...,1,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True
33,year,search_conducted_rate,0.299431,0.299431,search_conducted_false_search_conducted_true_d...,5,pearson_corr,-0.0390297,0.039030,1.000000,True,False,True


We can also define our own detection filters, using any available column

In [31]:
lin_only_qual = {'name':'lin_only_qual_sp','distance':.2, 'agg_trend_strength':.05,
                'subgroup_trend_strength':.15,'trend_type':'lin_reg'}
labeled_df.get_SP_rows(lin_only_qual,replace=True) 

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp


# Ranking

In [32]:
labeled_df.rank_occurences_by_view(ascending=False).head(20)

Index(['feat1', 'feat2', 'subgroup_trend', 'subgroup_trend_strength',
       'group_feat', 'subgroup', 'trend_type', 'agg_trend',
       'agg_trend_strength', 'distance', 'SP_thresh0.2', 'default_qual_sp',
       'SP', 'lin_only_qual_sp', 'mean_view_distance'],
      dtype='object')


Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance
1072,hit_rate,num_stops,0.139426,0.139426,state,AZ,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1073,hit_rate,num_stops,0.390466,0.390466,state,CO,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1074,hit_rate,num_stops,0.190372,0.190372,state,CT,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1075,hit_rate,num_stops,0.297911,0.297911,state,IL,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1076,hit_rate,num_stops,0.390729,0.390729,state,MA,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1077,hit_rate,num_stops,0.0152053,0.015205,state,MD,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1079,hit_rate,num_stops,0.218123,0.218123,state,RI,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1080,hit_rate,num_stops,0.308156,0.308156,state,SC,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1082,hit_rate,num_stops,0.441376,0.441376,state,VT,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082
1083,hit_rate,num_stops,0.302604,0.302604,state,WA,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082


In [33]:
labeled_df.add_view_score('SP_thresh0.2',agg_type='sum',colored=False).head(10)

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
0,hit_rate,num_stops,0.139426,0.139426,state,AZ,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
1,hit_rate,num_stops,0.390466,0.390466,state,CO,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
2,hit_rate,num_stops,0.190372,0.190372,state,CT,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
3,hit_rate,num_stops,0.297911,0.297911,state,IL,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
4,hit_rate,num_stops,0.390729,0.390729,state,MA,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
5,hit_rate,num_stops,0.0152053,0.015205,state,MD,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
6,hit_rate,num_stops,0.218123,0.218123,state,RI,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
7,hit_rate,num_stops,0.308156,0.308156,state,SC,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
8,hit_rate,num_stops,0.441376,0.441376,state,VT,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0
9,hit_rate,num_stops,0.302604,0.302604,state,WA,pearson_corr,-0.0421016,0.042102,1.0,True,False,True,False,0.536082,52.0


In [34]:
labeled_df.rank_occurences_by_view('sum_view_SP_thresh0.2','SP_thresh0.2').head()

Index(['feat1', 'feat2', 'subgroup_trend', 'subgroup_trend_strength',
       'group_feat', 'subgroup', 'trend_type', 'agg_trend',
       'agg_trend_strength', 'distance', 'SP_thresh0.2', 'default_qual_sp',
       'SP', 'lin_only_qual_sp', 'mean_view_distance',
       'sum_view_SP_thresh0.2'],
      dtype='object')


Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_strength,group_feat,subgroup,trend_type,agg_trend,agg_trend_strength,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
485,year,search_conducted_rate,0.118238,0.118238,state,CT,pearson_corr,-0.0390297,0.03903,1.0,True,False,True,False,0.365826,100.0
486,year,search_conducted_rate,0.000440801,0.000441,state,SC,pearson_corr,-0.0390297,0.03903,1.0,True,False,True,False,0.365826,100.0
487,year,search_conducted_rate,0.0485728,0.048573,state,WI,pearson_corr,-0.0390297,0.03903,1.0,True,False,True,False,0.365826,100.0
488,year,search_conducted_rate,0.0304108,0.030411,driver_gender,F,pearson_corr,-0.0390297,0.03903,1.0,True,False,True,False,0.365826,100.0
489,year,search_conducted_rate,0.134696,0.134696,driver_race,Other,pearson_corr,-0.0390297,0.03903,1.0,True,False,True,False,0.365826,100.0
