In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, dummy

# For stackoverflow uncomment following lines
csz = '18'
ml_in_file = "/data/khodadaa/stack_results/stack_feat/ml_in_{}.csv".format(csz)
ml_out_file = "/data/khodadaa/stack_results/stack_feat/ml_out_{}_".format(csz)

# For wikipedia uncomment following lines
# csz = '2'
# ml_in_file = "/data/khodadaa/stack_results/wiki/ml_in_{}.csv".format(csz)
# ml_out_file = "/data/khodadaa/stack_results/wiki/ml_out_{}_".format(csz)

In [2]:
from scipy.stats import ttest_rel
def t_test(data1, data2, alpha=0.05):
    # compare samples
    stat, p = ttest_rel(data1, data2)
    print('Statistics=%.3f, p=%.5f' % (stat, p))
    # interpret    
    if p > alpha:
        print('Same distributions (fail to reject H0)')
    else:
        print('Different distributions (reject H0)')

        
def evaluation_results(y_true, y_pred, weight):    
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, sample_weight=weight).ravel()          
#     print("\t  precision = %.3f" % (tp / (tp + fp)))
#     print("\t  recall = %.3f" % (tp / (tp + fn)))
#     print('\t  f1 score = %.3f' %(metrics.f1_score(y_true, y_pred)))
#     print("\t  negative predictive value= %.2f" % (tn / (tn + fn)))
#     print("\t  true negative rate= %.2f" % (tn / (tn + fp)))
#     print("\t  1s percentage = %.2f" % (100 * np.sum(y_pred) / y_pred.shape[0]))
    
    ev = {'acc': (tp+tn)/(tp+tn+fp+fn),
          'prec': tp/(tp+fp),
          'rec': tp/(tp+fn),
          'f1': metrics.f1_score(y_true, y_pred),
          'NPV': tn/(tn+fn),
          'TNR': tn/(tn+fp),          
          '1-ratio': 100*np.sum(y_pred)/y_pred.shape[0]}
    return ev

def get_mrr(y: pd.Series, rrgrps:dict, weights=None)-> float:
    if weights is None:
        weights = pd.Series(index=y.index, data=1.0)            
    
    (lb1, rrg1), (lb2, rrg2) = list(rrgrps.items())
    y1inx, y2inx = y[y==lb1].index, y[y==lb2].index 
    
    tot_wg = weights.loc[y.index].sum(axis=0)
    summed_rr = (rrg1.loc[y1inx] * weights.loc[y1inx]).sum(axis=0) + \
                (rrg2.loc[y2inx] * weights.loc[y2inx]).sum(axis=0)
    return summed_rr / tot_wg    

# Train/Test

In [3]:
%%time 

TEST_SIZE = 0.33
THREASHOLD = None
FEATURES = None
# FEATURES = ['ql_t_sub', 'ql_t_cmp']

results = pd.DataFrame(columns=['features', '#feat', '|test|', 'thres', 'acc-tr', 'acc', 'prec', 'rec', 'f1', \
                                'TNR', 'NPV', '1-ratio', 'mrr-unq', 'mrr-unq-max', 'mrr', 'mrr-max'])

def add_nonlinear_features(df):    
    cols_sub = [c for c in df.columns if c[-4:] == '_sub']
    cols_cmp = [c for c in df.columns if c[-4:] == '_cmp']    
    for i, _ in enumerate(cols_sub):    
        df[cols_sub[i]+'/'+cols_cmp[i]] = df[cols_sub[i]]/(df[cols_cmp[i]]+0.00000001)
    return df


def build_model(tr_X, tr_y, mod, cw=None, sw=None):        
    if mod in ['dum-u', 'dum-f', 'dum-s']:
        strag = {'dum-u': 'uniform', 'dum-f': 'most_frequent', 'dum-s': 'stratified'}        
        clf = dummy.DummyClassifier(strategy=strag[mod], random_state=1)
    if mod == 'log':
        clf = linear_model.LogisticRegression(class_weight=cw, random_state=1)
        
    clf.fit(tr_X, tr_y, sample_weight=sw)
    return clf


in_df = pd.read_csv(ml_in_file)
y = in_df['Y'].copy()
X = in_df[in_df.columns.difference(['Query', 'Y', 'rr_al', 'rr_sb', 'TestViewCount'])].copy()

le = preprocessing.LabelEncoder()
y = pd.Series(data=le.fit_transform(y), index=y.index)

X = add_nonlinear_features(X)
if FEATURES:
    X = X.filter(FEATURES)

feat = list(X.columns)
print('Features:\n%s' % (feat))

train_x, test_x, train_y, test_y = model_selection.train_test_split(X, y, stratify=y, \
                                                                    test_size=TEST_SIZE, random_state=5)
sc = preprocessing.MinMaxScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.fit_transform(test_x)

def get_sample_weights(mod):
    w = in_df['TestViewCount'].copy()        
    
    if mod == 'swnc':
        s0, s1 = w[y==0].sum(axis=0), w[y==1].sum(axis=0)    
        w[y==0] /= s0
        w[y==1] /= s1    
    
    if mod == 'swbc':
        tr0inx, tr1inx = train_y[train_y==0].index, train_y[train_y==1].index
        ts0inx, ts1inx = test_y[test_y==0].index, test_y[test_y==1].index
        tr0s, tr1s = w[tr0inx].sum(axis=0), w[tr1inx].sum(axis=0)
        ts0s, ts1s = w[ts0inx].sum(axis=0), w[ts1inx].sum(axis=0)
        w[tr0inx] /= tr0s
        w[tr1inx] /= tr1s
        w[ts0inx] /= ts0s
        w[ts1inx] /= ts1s
    
    return w.loc[train_y.index], w.loc[test_y.index]

pred_df = pd.DataFrame(index=test_y.index)
pred_df['TestViewCount'] = in_df.loc[test_y.index, 'TestViewCount']
pred_df['true_y'] = test_y

for mde in ['dum-u', 'dum-f', 'dum-s', 'ql', 'log', 'log-bal',
            'dum-u-swvc', 'dum-f-swvc', 'ql-swvc', 'log-swvc', 
#             'dum-u-swnc', 'dum-f-swnc', 'ql-swnc', 'log-swnc', 
            'dum-u-swbc', 'dum-f-swbc', 'ql-swbc', 'log-swbc']:
    m = mde
    clf = None
    clweight = None
    fea_cnt = 0
    print('-------------------------------------------')
    print(m + " classifier ..")    
    train_weights, test_weights = None, None
    # set weight
    if m[-5:] in ['-swvc', '-swnc', '-swbc']:        
        train_weights, test_weights = get_sample_weights(m[-4:])
        m = m[:-5]
    # set balanced labels
    if m[-4:] == '-bal':
        clweight = 'balanced'
        m = m[:-4]
    
    if m[:2] == 'ql':
        X_ql = X.loc[test_y.index, ['ql_t_sub', 'ql_t_cmp']]
        pred_y = np.where(X_ql['ql_t_sub'] >= X_ql['ql_t_cmp'], 'sub', 'all')
        pred_y = le.transform(pred_y)
        fea_cnt = 2
    else:
        clf = build_model(train_x, train_y, mod=m, sw=train_weights, cw=clweight)        
        pred_y = clf.predict(test_x)
        fea_cnt = len(feat)
    pred_y = pd.Series(data=pred_y, index=test_y.index)
    
    res = {'features': feat, '#feat': fea_cnt,  '|test|': TEST_SIZE, 'thres': THREASHOLD,
           'acc-tr': None if  clf is None else clf.score(train_x, train_y, train_weights)}
    res.update(evaluation_results(test_y, pred_y, test_weights))
    
    subL, allL = le.transform(['sub', 'all'])
    rrg = {subL: in_df['rr_sb'], allL: in_df['rr_al']}
    mrr_unq = get_mrr(pred_y, rrg)
    mrr_unq_mx = get_mrr(test_y, rrg)
    mrr = get_mrr(pred_y, rrg, weights=test_weights)
    mrr_mx = get_mrr(test_y, rrg, weights=test_weights)
    
    res.update({'mrr-unq': mrr_unq, 'mrr-unq-max': mrr_unq_mx, 'mrr': mrr, 'mrr-max': mrr_mx})
    results.loc[mde, :] = res
    pred_df[mde] = pred_y

results.to_csv(ml_out_file+'{}_evals.csv'.format(len(feat)))
pred_df.to_csv(ml_out_file+'{}_predicts.csv'.format(len(feat)))

Features:
['covered_t_bi_cmp', 'covered_t_bi_sub', 'covered_t_cmp', 'covered_t_sub', 'maxSCQ_t_cmp', 'maxSCQ_t_sub', 'mean_df_t_bi_cmp', 'mean_df_t_bi_sub', 'mean_df_t_cmp', 'mean_df_t_sub', 'mean_mean_pop_t_bi_cmp', 'mean_mean_pop_t_bi_sub', 'mean_mean_pop_t_cmp', 'mean_mean_pop_t_sub', 'mean_min_pop_t_bi_cmp', 'mean_min_pop_t_bi_sub', 'mean_min_pop_t_cmp', 'mean_min_pop_t_sub', 'min_df_t_bi_cmp', 'min_df_t_bi_sub', 'min_df_t_cmp', 'min_df_t_sub', 'min_mean_pop_t_bi_cmp', 'min_mean_pop_t_bi_sub', 'min_mean_pop_t_cmp', 'min_mean_pop_t_sub', 'min_min_pop_t_bi_cmp', 'min_min_pop_t_bi_sub', 'min_min_pop_t_cmp', 'min_min_pop_t_sub', 'ql_t_bi_cmp', 'ql_t_bi_sub', 'ql_t_cmp', 'ql_t_sub', 'qll_t_bi_cmp', 'qll_t_bi_sub', 'qll_t_cmp', 'qll_t_sub', 'scs_t_cmp', 'scs_t_sub', 'covered_t_bi_sub/covered_t_bi_cmp', 'covered_t_sub/covered_t_cmp', 'maxSCQ_t_sub/maxSCQ_t_cmp', 'mean_df_t_bi_sub/mean_df_t_bi_cmp', 'mean_df_t_sub/mean_df_t_cmp', 'mean_mean_pop_t_bi_sub/mean_mean_pop_t_bi_cmp', 'mean_mean_

  'precision', 'predicted', average, warn_for)


-------------------------------------------
dum-s classifier ..
-------------------------------------------
ql classifier ..
-------------------------------------------
log classifier ..
-------------------------------------------
log-bal classifier ..
-------------------------------------------
dum-u-swvc classifier ..
-------------------------------------------
dum-f-swvc classifier ..




-------------------------------------------
ql-swvc classifier ..
-------------------------------------------
log-swvc classifier ..
-------------------------------------------
dum-u-swbc classifier ..
-------------------------------------------
dum-f-swbc classifier ..




-------------------------------------------
ql-swbc classifier ..
-------------------------------------------
log-swbc classifier ..
CPU times: user 16min 21s, sys: 45.8 s, total: 17min 7s
Wall time: 3min 38s


In [4]:
results

Unnamed: 0,features,#feat,|test|,thres,acc-tr,acc,prec,rec,f1,TNR,NPV,1-ratio,mrr-unq,mrr-unq-max,mrr,mrr-max
dum-u,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.500661,0.499664,0.179691,0.499068,0.264241,0.499795,0.819637,50.0,0.292357,0.504836,0.292357,0.504836
dum-f,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.819971,0.819973,,0.0,0.0,1.0,0.819973,0.0,0.466661,0.504836,0.466661,0.504836
dum-s,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.705321,0.705754,0.181492,0.180762,0.181126,0.821018,0.820293,17.9303,0.404684,0.504836,0.404684,0.504836
ql,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",2,0.33,,,0.542026,0.244539,0.738953,0.367471,0.49879,0.896937,54.401,0.306173,0.504836,0.306173,0.504836
log,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.821894,0.821405,0.639159,0.0182642,0.0355136,0.997736,0.822347,0.514434,0.466235,0.504836,0.466235,0.504836
log-bal,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.625675,0.789912,0.379553,0.263097,0.310773,0.905575,0.848422,12.4791,0.441038,0.504836,0.441038,0.504836
dum-u-swvc,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.498285,0.506372,0.837277,0.507939,0.264241,0.498409,0.166212,50.0,0.292357,0.504836,0.477785,0.605419
dum-f-swvc,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.8364,0.835557,0.835557,1.0,0.305123,0.0,,100.0,0.118841,0.504836,0.531307,0.605419
ql-swvc,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",2,0.33,,,0.729093,0.87626,0.786897,0.367471,0.43538,0.286777,54.401,0.306173,0.504836,0.517979,0.605419
log-swvc,"[covered_t_bi_cmp, covered_t_bi_sub, covered_t...",60,0.33,,0.846385,0.818759,0.874994,0.913613,0.355364,0.336795,0.434161,67.2885,0.25864,0.504836,0.539508,0.605419


# Tips

In [5]:
# add a new indexed row to a Dataframe from a dictionary 
df = pd.DataFrame(columns=['A', 'B', 'C'], dtype=int)
df.loc['log', :] = {'A':1, 'B':10}
df.loc['dum', :] = {'A':2, 'C':20}
df

Unnamed: 0,A,B,C
log,1,10.0,
dum,2,,20.0


In [6]:
# combine two dictionaries
d1 = {'A':1, 'C':3}
d2 = {'B':2, 'A':4}
d1.update(d2)
d1

{'A': 4, 'C': 3, 'B': 2}