In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, dummy, feature_selection

# For stackoverflow uncomment following lines
# csz = '18'
# ml_in_file = "/data/khodadaa/ml-exp/stack_feat/ml_in_{}.csv".format(csz)
# ml_out_file = "/data/khodadaa/ml-exp/stack_feat/ml_out_{}_".format(csz)

# For wikipedia uncomment following lines
csz = '2'
# ml_in_file = "/data/khodadaa/ml-exp/wiki/ml_in_{}.csv".format(csz)
ml_in_file = "/data/khodadaa/ml-exp/wiki/ml_in_{}_vahid_features.csv".format(csz)
ml_out_file = "/data/khodadaa/ml-exp/wiki/ml_out_{}_".format(csz)

#For inex uncomment following lines
# csz = '24'
# ml_in_file = "/data/khodadaa/ml-exp/inex/inex_ml_in_{}.csv".format(csz)
# ml_out_file = "/data/khodadaa/ml-exp/inex/inex_ml_out_{}_".format(csz)

In [2]:
from scipy.stats import ttest_rel
def t_test(data1, data2, alpha=0.05):
    # compare samples
    stat, p = ttest_rel(data1, data2)
#     print('Statistics=%.3f, p=%.5f' % (stat, p))
    # interpret    
    if p > alpha:
#         print('Same distributions (fail to reject H0)')
        return True
    else:
#         print('Different distributions (reject H0)')
        return False

        
def evaluation_results(y_true, y_pred, weight):
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, sample_weight=weight).ravel() 
#     print("\t  precision = %.3f" % (tp / (tp + fp)))
#     print("\t  recall = %.3f" % (tp / (tp + fn)))
#     print('\t  f1 score = %.3f' %(metrics.f1_score(y_true, y_pred)))
#     print("\t  negative predictive value= %.2f" % (tn / (tn + fn)))
#     print("\t  true negative rate= %.2f" % (tn / (tn + fp)))
#     print("\t  1s percentage = %.2f" % (100 * np.sum(y_pred) / y_pred.shape[0]))
    
    ev = {'acc': (tp+tn)/(tp+tn+fp+fn),
          'prec': tp/(tp+fp),
          'rec': tp/(tp+fn),
          'f1': metrics.f1_score(y_true, y_pred),
          'NPV': tn/(tn+fn),
          'TNR': tn/(tn+fp),          
          '1-ratio': 100*np.sum(y_pred)/y_pred.shape[0]}
    return ev

def rank_features(model, X, y):
    feature_selection.RFE(model, 1)
    # create the RFE model and select 1 attributes    
    rfe = feature_selection.RFE(model, 1)
    rfe = rfe.fit(X, y)        
    return rfe.ranking_
    

def get_mrr(y: pd.Series, rrgrps:dict, weights=None)-> float:
    if weights is None:
        weights = pd.Series(index=y.index, data=1.0)            
    
    (lb1, rrg1), (lb2, rrg2) = list(rrgrps.items())
    y1inx, y2inx = y[y==lb1].index, y[y==lb2].index 
    
    tot_wg = weights.loc[y.index].sum(axis=0)
    summed_rr = (rrg1.loc[y1inx] * weights.loc[y1inx]).sum(axis=0) + \
                (rrg2.loc[y2inx] * weights.loc[y2inx]).sum(axis=0)
    return summed_rr / tot_wg    

# Train/Test

In [3]:
%%time 

TEST_SIZE = 0.33
THREASHOLD = None
FEATURES = None
# FEATURES = ['ql_t_sub', 'ql_t_cmp']

results = pd.DataFrame(columns=['features', '#feat', '|test|', '%badQ','thres', 'acc-tr',
                                'acc', 'prec', 'rec', 'f1', 'TNR', 'NPV', '1-ratio',
                                'mrr', 'mrr-max', 'mrr-bad', 'mrr-good'])

def add_nonlinear_features(df):    
    cols_sub = [c for c in df.columns if c[-4:] == '_sub']
    cols_cmp = [c for c in df.columns if c[-4:] == '_cmp']    
    for i, _ in enumerate(cols_sub):    
        df[cols_sub[i]+'/'+cols_cmp[i]] = df[cols_sub[i]]/(df[cols_cmp[i]]+0.00000001)
    return df


def build_model(tr_X, tr_y, mod, cw=None, sw=None):        
    if mod in ['dum-u', 'dum-f', 'dum-s']:        
        strag = {'dum-u': 'uniform', 'dum-f': 'most_frequent', 'dum-s': 'stratified'}        
        clf = dummy.DummyClassifier(strategy=strag[mod], random_state=1)
    if mod in ['dum-0', 'dum-1']:
        clf = dummy.DummyClassifier(strategy='constant', constant=int(mod[-1]))
    if mod == 'log':
        clf = linear_model.LogisticRegression(class_weight=cw, random_state=1)
        
    clf.fit(tr_X, tr_y, sample_weight=sw)
    return clf


in_df = pd.read_csv(ml_in_file)
y = in_df['Y'].copy()
X = in_df[in_df.columns.difference(['Query', 'Y', 'rr_al', 'rr_sb', 'TestViewCount'])].copy()
bad_ix = in_df[in_df['rr_al'] > in_df['rr_sb']].index
good_ix = in_df[in_df['rr_al'] <= in_df['rr_sb']].index

le = preprocessing.LabelEncoder()
y = pd.Series(data=le.fit_transform(y), index=y.index)

X = add_nonlinear_features(X)
if FEATURES:
    X = X.filter(FEATURES)

feat = list(X.columns)
print('Features:\n%s' % (feat))

train_x, test_x, train_y, test_y = model_selection.train_test_split(X, y, stratify=y, \
                                                                    test_size=TEST_SIZE, random_state=5)
sc = preprocessing.MinMaxScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)


def get_sample_weights(mod):
    w = in_df['TestViewCount'].copy()        
    
    if mod == 'swbc':
        tr0inx, tr1inx = train_y[train_y==0].index, train_y[train_y==1].index
        ts0inx, ts1inx = test_y[test_y==0].index, test_y[test_y==1].index
        tr0s, tr1s = w[tr0inx].sum(axis=0), w[tr1inx].sum(axis=0)
        ts0s, ts1s = w[ts0inx].sum(axis=0), w[ts1inx].sum(axis=0)
        w[tr0inx] /= tr0s
        w[tr1inx] /= tr1s
        w[ts0inx] /= ts0s
        w[ts1inx] /= ts1s
        
    if mod == 'swAr':
        tr0inx, tr1inx = train_y[train_y==0].index, train_y[train_y==1].index
        tr0s, tr1s = w[tr0inx].sum(axis=0), w[tr1inx].sum(axis=0)
        w[tr0inx] /= tr0s
        w[tr1inx] /= tr1s
    
    if mod == 'swnc':
        s0, s1 = w[y==0].sum(axis=0), w[y==1].sum(axis=0)    
        w[y==0] /= s0
        w[y==1] /= s1    
    
    return w.loc[train_y.index], w.loc[test_y.index]

pred_df = pd.DataFrame(index=test_y.index)
pred_df['Query'] = in_df.loc[test_y.index, 'Query']
pred_df['TestViewCount'] = in_df.loc[test_y.index, 'TestViewCount']
pred_df['rr_al'] = in_df.loc[test_y.index, 'rr_al']
pred_df['rr_sb'] = in_df.loc[test_y.index, 'rr_sb']
pred_df['true_y'] = test_y

for mde in ['dum-u', 'dum-f', 'dum-s', 'dum-0', 'dum-1', 'ql', 'log', 'log-bal',
            'dum-u-swvc', 'dum-s-swvc', 'dum-0-swvc', 'dum-1-swvc', 'ql-swvc', 'log-swvc',             
            'dum-u-swbc', 'dum-s-swbc', 'dum-0-swbc', 'dum-1-swbc', 'ql-swbc', 'log-swbc',            
            'dum-u-swAr', 'dum-s-swAr', 'dum-0-swAr', 'dum-1-swAr', 'ql-swAr', 'log-swAr',
            'dum-u-swnc', 'dum-s-swnc', 'dum-0-swnc', 'dum-1-swnc', 'ql-swnc', 'log-swnc',
            'log-bal-swvc', 'log-bal-swbc', 'log-bal-swAr', 'log-bal-swnc']:
    m = mde
    clf = None
    clweight = None
    fea_cnt = 0
    print('-------------------------------------------')
    print(m + " classifier ..")    
    train_weights, test_weights = None, None
    # set weight
    if m[-5:] in ['-swvc', '-swnc', '-swbc', '-swAr']:        
        train_weights, test_weights = get_sample_weights(m[-4:])
        # train without sample_weights, test with sample_weights
        if m in ['log-bal-swvc', 'log-bal-swbc', 'log-bal-swAr', 'log-bal-swnc']:
            train_weights = None
        m = m[:-5]
    # set balanced labels
    if m[-4:] == '-bal':
        clweight = 'balanced'
        m = m[:-4]
    
    if m[:2] == 'ql':
        X_ql = X.loc[test_y.index, ['ql_t_sub', 'ql_t_cmp']]
        pred_y = np.where(X_ql['ql_t_sub'] >= X_ql['ql_t_cmp'], 'sub', 'all')
        pred_y = le.transform(pred_y)
        fea_cnt = 2
    else:
        clf = build_model(train_x, train_y, mod=m, sw=train_weights, cw=clweight)        
        pred_y = clf.predict(test_x)
        fea_cnt = len(feat)
    pred_y = pd.Series(data=pred_y, index=test_y.index)
    
    res = {'features': feat, '#feat': fea_cnt,  '|test|': TEST_SIZE, 'thres': THREASHOLD,
           'acc-tr': None if  clf is None else clf.score(train_x, train_y, train_weights)}
    res.update(evaluation_results(test_y, pred_y, test_weights))
    
    subL, allL = le.transform(['sub', 'all'])
    rrg = {subL: in_df['rr_sb'], allL: in_df['rr_al']}
    mrr = get_mrr(pred_y, rrg, weights=test_weights)
    mrr_mx = get_mrr(test_y, rrg, weights=test_weights)    
    res.update({'mrr': mrr, 'mrr-max': mrr_mx})
    
    # bad queries
    bad_test_ix = test_y.index.intersection(bad_ix)
    res['%badQ'] = (bad_test_ix.shape[0] / test_y.shape[0]) * 100.0
    bad_rrg = {subL: in_df.loc[bad_test_ix, 'rr_sb'], allL: in_df.loc[bad_test_ix, 'rr_al']}    
    bad_mrr = get_mrr(pred_y.loc[bad_test_ix], bad_rrg, weights=test_weights)
    res['mrr-bad'] = bad_mrr
    
    # good queries
    good_test_ix = test_y.index.intersection(good_ix)
    good_rrg = {subL: in_df.loc[good_test_ix, 'rr_sb'], allL: in_df.loc[good_test_ix, 'rr_al']}    
    good_mrr = get_mrr(pred_y.loc[good_test_ix], good_rrg, weights=test_weights)
    res['mrr-good'] = good_mrr
    
    # rank features for logisitic regression
    if mde == 'log-bal-swvc':        
        feat_rankings = rank_features(clf, train_x, train_y)
        feat_rankings = list(zip(feat_rankings, feat))
        feat_rankings.sort()
        res['features'] = str(feat_rankings)
        
    
    results.loc[mde, :] = res
    pred_df[mde] = pred_y

results.to_csv(ml_out_file+'{}_evals.csv'.format(len(feat)))
pred_df.to_csv(ml_out_file+'{}_predicts.csv'.format(len(feat)), index=False)

Features:
['covered_c_bi_cmp', 'covered_c_bi_sub', 'covered_c_cmp', 'covered_c_sub', 'covered_t_bi_cmp', 'covered_t_bi_sub', 'covered_t_cmp', 'covered_t_sub', 'mean_df_c_bi_cmp', 'mean_df_c_bi_sub', 'mean_df_c_cmp', 'mean_df_c_sub', 'mean_df_t_bi_cmp', 'mean_df_t_bi_sub', 'mean_df_t_cmp', 'mean_df_t_sub', 'mean_mean_pop_c_bi_cmp', 'mean_mean_pop_c_bi_sub', 'mean_mean_pop_c_cmp', 'mean_mean_pop_c_sub', 'mean_mean_pop_t_bi_cmp', 'mean_mean_pop_t_bi_sub', 'mean_mean_pop_t_cmp', 'mean_mean_pop_t_sub', 'mean_min_pop_c_bi_cmp', 'mean_min_pop_c_bi_sub', 'mean_min_pop_c_cmp', 'mean_min_pop_c_sub', 'mean_min_pop_t_bi_cmp', 'mean_min_pop_t_bi_sub', 'mean_min_pop_t_cmp', 'mean_min_pop_t_sub', 'min_df_c_bi_cmp', 'min_df_c_bi_sub', 'min_df_c_cmp', 'min_df_c_sub', 'min_df_t_bi_cmp', 'min_df_t_bi_sub', 'min_df_t_cmp', 'min_df_t_sub', 'min_mean_pop_c_bi_cmp', 'min_mean_pop_c_bi_sub', 'min_mean_pop_c_cmp', 'min_mean_pop_c_sub', 'min_mean_pop_t_bi_cmp', 'min_mean_pop_t_bi_sub', 'min_mean_pop_t_cmp', 'mi

  'precision', 'predicted', average, warn_for)


-------------------------------------------
log-bal classifier ..
-------------------------------------------
dum-u-swvc classifier ..
-------------------------------------------
dum-s-swvc classifier ..
-------------------------------------------
dum-0-swvc classifier ..
-------------------------------------------
dum-1-swvc classifier ..
-------------------------------------------
ql-swvc classifier ..
-------------------------------------------
log-swvc classifier ..
-------------------------------------------
dum-u-swbc classifier ..
-------------------------------------------
dum-s-swbc classifier ..
-------------------------------------------
dum-0-swbc classifier ..
-------------------------------------------
dum-1-swbc classifier ..
-------------------------------------------
ql-swbc classifier ..
-------------------------------------------
log-swbc classifier ..
-------------------------------------------
dum-u-swAr classifier ..




-------------------------------------------
dum-s-swAr classifier ..
-------------------------------------------
dum-0-swAr classifier ..
-------------------------------------------
dum-1-swAr classifier ..
-------------------------------------------
ql-swAr classifier ..
-------------------------------------------
log-swAr classifier ..
-------------------------------------------
dum-u-swnc classifier ..
-------------------------------------------
dum-s-swnc classifier ..
-------------------------------------------
dum-0-swnc classifier ..
-------------------------------------------
dum-1-swnc classifier ..
-------------------------------------------
ql-swnc classifier ..


  'precision', 'predicted', average, warn_for)


-------------------------------------------
log-swnc classifier ..
-------------------------------------------
log-bal-swvc classifier ..
-------------------------------------------
log-bal-swbc classifier ..
-------------------------------------------
log-bal-swAr classifier ..
-------------------------------------------
log-bal-swnc classifier ..
CPU times: user 1min 4s, sys: 2.18 s, total: 1min 6s
Wall time: 9.36 s


In [4]:
test_names = list(pred_df.columns.difference(['Query', 'TestViewCount', 'rr_al', 'rr_sb']))
for i, m1 in enumerate(test_names):
    for m2 in test_names[i+1:]:
        if t_test(pred_df[m1], pred_df[m2]):
            print(m1, m2, 'Same distributions')

dum-s true_y Same distributions
dum-s-swAr dum-s-swnc Same distributions
dum-s-swAr dum-u Same distributions
dum-s-swAr dum-u-swAr Same distributions
dum-s-swAr dum-u-swbc Same distributions
dum-s-swAr dum-u-swnc Same distributions
dum-s-swAr dum-u-swvc Same distributions
dum-s-swbc dum-s-swnc Same distributions
dum-s-swbc dum-u Same distributions
dum-s-swbc dum-u-swAr Same distributions
dum-s-swbc dum-u-swbc Same distributions
dum-s-swbc dum-u-swnc Same distributions
dum-s-swbc dum-u-swvc Same distributions
dum-s-swnc dum-u Same distributions
dum-s-swnc dum-u-swAr Same distributions
dum-s-swnc dum-u-swbc Same distributions
dum-s-swnc dum-u-swnc Same distributions
dum-s-swnc dum-u-swvc Same distributions


In [5]:
results.loc[['dum-0-swvc', 'dum-1-swvc', 'dum-u-swvc', 'ql-swvc', 'log-bal-swvc'], ['%badQ','mrr-good', 'mrr-bad', 'mrr']]

Unnamed: 0,%badQ,mrr-good,mrr-bad,mrr
dum-0-swvc,6.85216,0.226356,0.455406,0.239463
dum-1-swvc,6.85216,0.707002,0.0253723,0.667997
dum-u-swvc,6.85216,0.470672,0.249122,0.457994
ql-swvc,6.85216,0.658308,0.18329,0.631126
log-bal-swvc,6.85216,0.636354,0.298085,0.616997


In [6]:
results.loc['log-bal-swvc', 'features']

"[(1, 'ql_t_bi_sub/ql_t_bi_cmp'), (2, 'min_mean_pop_t_bi_sub'), (3, 'qll_t_sub/qll_t_cmp'), (4, 'mean_mean_pop_t_bi_cmp'), (5, 'min_min_pop_t_bi_sub'), (6, 'min_mean_pop_t_sub'), (7, 'min_mean_pop_t_cmp'), (8, 'ql_c_bi_sub/ql_c_bi_cmp'), (9, 'mean_min_pop_t_sub'), (10, 'mean_df_c_cmp'), (11, 'mean_df_c_sub'), (12, 'min_df_t_bi_sub/min_df_t_bi_cmp'), (13, 'covered_c_bi_sub/covered_c_bi_cmp'), (14, 'min_df_c_cmp'), (15, 'covered_t_bi_sub'), (16, 'min_mean_pop_t_bi_cmp'), (17, 'min_min_pop_t_bi_cmp'), (18, 'mean_min_pop_t_bi_sub'), (19, 'ql_t_sub'), (20, 'min_df_t_cmp'), (21, 'mean_mean_pop_c_bi_sub'), (22, 'qll_t_bi_sub'), (23, 'qll_c_cmp'), (24, 'mean_min_pop_c_sub/mean_min_pop_c_cmp'), (25, 'min_df_c_bi_sub/min_df_c_bi_cmp'), (26, 'ql_c_sub/ql_c_cmp'), (27, 'mean_df_t_bi_sub/mean_df_t_bi_cmp'), (28, 'min_min_pop_t_bi_sub/min_min_pop_t_bi_cmp'), (29, 'mean_df_t_sub'), (30, 'mean_df_t_cmp'), (31, 'min_min_pop_t_sub'), (32, 'mean_min_pop_c_bi_sub'), (33, 'qll_c_sub/qll_c_cmp'), (34, 'cove

# Tips

In [7]:
# add a new indexed row to a Dataframe from a dictionary 
df = pd.DataFrame(columns=['A', 'B', 'C'], dtype=int)
df.loc['log', :] = {'A':1, 'B':10}
df.loc['dum', :] = {'A':2, 'C':20}
df

Unnamed: 0,A,B,C
log,1,10.0,
dum,2,,20.0


In [8]:
# combine two dictionaries
d1 = {'A':1, 'C':3}
d2 = {'B':2, 'A':4}
d1.update(d2)
d1

{'A': 4, 'C': 3, 'B': 2}