In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import glob
from os.path import expanduser

from sklearn.cross_validation import train_test_split
import itertools
from statsmodels.tsa import stattools

from sgmtradingcore.exchange.ticks import MatchbookTicks, BetfairTicks, BetdaqTicks

from dev_analysis.market.get_data import *
from dev_analysis.market.analyse_data import *

pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

ImportError: No module named dev_analysis.market.get_data

In [2]:
#df_list, fname_list = get_clean_df_list('/home/ioanna/.oddscache/', 30, has_date_folders = False)




In [3]:
main_price_ticks = ['bp1','mipA', 'mipB', 'mdp', 'lp1']
aux_prices_ticks = ['bp2', 'bp3', 'lp2', 'lp3']
all_prices_ticks = main_price_ticks + aux_prices_ticks

In [18]:
#get data for 200 stickers and clean up (only in-play, only matches with >10000 max matched volume, remove suspensions)

def get_df_list(sticker_dir, sticker_no, inplay_only = True):
    
    stickers = glob.glob(os.path.join(sticker_dir, 'odds_T-EENP*'))
    fname_list = stickers[0:sticker_no]
    
    df_list = []
    
    for fname in fname_list:
        df = pd.read_csv(fname)
        
        if inplay_only == True:
            df = df[df['delay']>0]
         
        #remove data with -1 prices
        #gt = ((df['bp1'] != -1) & (df['bp2'] != -1) & (df['bp3'] != -1) & \
        #        (df['lp1'] != -1) & (df['lp2'] != -1) & (df['lp3'] != -1))
        gt = (df['bp1'] != -1) & (df['lp1'] != -1)
        df = df[gt]
        
        #only use data after certain amount has been matched
        df = df[df['total'] > 1000]
        
        if (not df.empty) & (np.max(df['total']) > 10000):
            df_list.append(df)
            #dt = np.concatenate([np.zeros(1), np.diff(df['timestamp']/1000)])
            #idx = np.where(dt ==  np.max(dt))[0][0]
            #print df.iloc[idx-1:idx+2,:]
        
    return df_list, fname_list

df_list, fname_list = get_df_list('../../media/ak/E3E1-EE52/.oddscache/', 200)



In [5]:
#function tocreate midprice, microprice and book pressure data

def create_price_moves(df_list, fname_list, all_prices_ticks, wghts = [1, 0.3, 0.2]):

    df_list_new = []

    for df, fname in zip(df_list, fname_list):

        #create df with timedelta index
        df = df.reset_index()
        tmdelta = pd.to_datetime(df['timestamp'].values, unit='ms')
        df['timedelta'] = tmdelta
        df.set_index(keys='timedelta', inplace = True)

        #convert prices to ticks
        if 'BF' in fname:
            _ticks = BetfairTicks()
        elif 'MB' in fname:
            _ticks = MatchbookTicks()
        elif 'BD' in fname:
            _ticks = BetdaqTicks()
        ps = ['bp1', 'bp2', 'bp3', 'lp1', 'lp2', 'lp3']
        for p in ps:
            df[p] = df[p].apply(lambda x: _ticks.convert_to(x))

        #set -1 volumes to 0
        vs = ['bv1', 'bv2', 'bv3', 'lv1', 'lv2', 'lv3']
        for v in vs:
            df[v][df[v]<0] = 0


        #create spread
        df['sp_ticks'] = df['lp1'] - df['bp1']
        df = df[df['sp_ticks'] < 50]


        #create midprice column
        df['mdp'] = (df['bp1'] + df['lp1'])/2


        #create book pressure (need to add weights)
        bv = wghts[0]*df['bv1'] + wghts[1]*df['bv2'] + wghts[2]*df['bv3']
        lv = wghts[0]*df['lv1'] + wghts[1]*df['lv2'] + wghts[2]*df['lv3']
        df['bp'] = (bv - lv)/(bv + lv)
        
        #for median
        df['bp_med'] = (bv - lv)/(bv + lv)
        
        #bp change
        df['bp_move'] = df['bp'].diff().fillna(0)

        #create microprice column
        df['mipB'] = (df['bp1']*df['lv1'] + df['lp1']*df['bv1'])/(df['lv1'] + df['bv1'])
        df['mipA'] = (df['bp1']*df['bv1'] + df['lp1']*df['lv1'])/(df['lv1'] + df['bv1'])

        for price_ticks in all_prices_ticks:
            #price_ticks = 'LP3_ticks'
            price_ticks_move = price_ticks + '_move'

            # add microprice move in ticks
            df[price_ticks_move] = df[price_ticks].diff().fillna(0)
            #df['price_ticks_move_name'] = price_ticks_move

        df_list_new.append(df)
        
    return df_list_new



In [9]:
#function to resample data to given time interval

def resample_df(df_list_new, time_sample = '5S'):
  
    op_price = np.sum
    price_cols = [price_tick + '_move' for price_tick in all_prices_ticks]#[price_ticks_move]
    how_resample = {col: op_price for col in price_cols}
    
    op_price = np.mean
    price_cols = [price_tick for price_tick in all_prices_ticks]#[price_ticks_move]
    how_resample.update({col: op_price for col in price_cols})

    op_signal = np.median
    signal_cols = ['bp_med']
    how_resample.update({col: op_signal for col in signal_cols})
    
    op_signal = np.mean
    tick_cols = ['sp_ticks', 'bp_move', 'bp']
    how_resample.update({col: op_signal for col in tick_cols})

    #how_resample.update({'price_ticks_move_name' : 'first'})

    # resample each df
    df_list_resample = []
    for df in df_list_new:
        df_list_resample.append(df.resample(time_sample).agg(how_resample))

    data_resample = pd.concat(df_list_resample, axis=0)
    
    #remove nans
    data_resample = data_resample[pd.notnull(data_resample['bp'])]
    
    return data_resample


In [11]:
df_list

[]

### We basically have 3 parameters to play around with:

a) **weights** used to calculate book pressure. We assume that level 1 imbalance is likely to be a better indicator than level 2, etc, and we explore a combination of weights that satisfies w1>w2>w3.

b) **time step** to resample dataframe to. We explore values between 1s and 20s.

c) **book pressure shift**. After we resample our dataframe, we end up with time intervals of x seconds and the corresponding book pressure and change in midprice(/microprice) values for each interval compared to the last one. We want to see how well bp predicts price shift in subsequent time intervals, so we shift the bp column by n rows down before doing regression. we try n=0,1,2,3.

In [7]:
#sample plotting for a choice of BP weights, time interval, and BP shift

time_sample = '2S'

df_list_new = create_price_moves(df_list, fname_list, all_prices_ticks, wghts = [1, 0.3, 0.2])
data_resample = resample_df(df_list_new, time_sample = time_sample)


#plot stuff

x = 'bp'
bp_bins = np.arange(-1, 1, 0.15)
colors = ['b', 'r', 'g', 'm']

iis = range(0,4)

f, ax = plt.subplots(figsize=(8,5))
for i in iis:
    sns.regplot(data_resample[x].shift(i), data_resample['mdp_move'], order=3, x_bins = bp_bins, 
          scatter=True, truncate=True, ci=66, n_boot=100, color=colors[i], label='bp ' + str(i) + ' timesteps before')
ax.legend()
sns.plt.title('Price movements within time interval of %s'%time_sample)

f, ax = plt.subplots(figsize=(8,5))
for i in range(0,4):
    sns.regplot(data_resample[x].shift(i), data_resample['mipA_move'], order=3, x_bins = bp_bins, 
          scatter=True, truncate=True, ci=66, n_boot=100, color=colors[i], label='bp ' + str(i) + ' timesteps before')
ax.legend()

    
f, ax = plt.subplots(figsize=(8,5))
for i in range(0,4):
    sns.regplot(data_resample[x].shift(i), data_resample['mipB_move'], order=3, x_bins = bp_bins, 
          scatter=True, truncate=True, ci=66, n_boot=100, color=colors[i], label='bp ' + str(i) + ' timesteps before')
ax.legend()





ValueError: All objects passed were None

In [None]:
#granger causality

stattools.grangercausalitytests(data_resample[['bp', 'mdp']], maxlag = 2);
stattools.grangercausalitytests(data_resample[['bp', 'mipA']], maxlag = 2);
stattools.grangercausalitytests(data_resample[['bp', 'mipB']], maxlag = 2);



In [None]:
#fit both linear and logistic regressions for various BP weights and deltas for midprice
from sklearn.linear_model import LogisticRegression, LinearRegression

def get_regression_matrix(x_name, y_name, aas, ts, shifts, df_list, fname_list, all_prices_ticks):

    scores = pd.DataFrame()
    r2s = pd.DataFrame()
    lgfits = pd.DataFrame()
    lrfits = pd.DataFrame()

    tns = [str(t)+'S' for t in ts]

    for a in aas:

        wghts = [1, round(a,3), round(a**1.5,3)]

        df_list_new = create_price_moves(df_list, fname_list, all_prices_ticks, wghts = wghts)

        for tn in tns:
            
            data_resample = resample_df(df_list_new, time_sample = tn)

            for shift in shifts:
                
                print 'weights = ' + str(wghts) + ', time interval = ' + str(tn) + ', shift = ' + str(shift)
                
                #assign data for fitting
                BP = data_resample[x_name].shift(shift)
                dm = data_resample[y_name]
                
                #deal with nans
                gt = pd.notnull(BP)
                BP = BP[gt]
                dm = dm[gt]
                
                #only model sensible price shifts (<10 ticks)
                tgt = abs(dm) < 10
                BP = BP.loc[tgt]
                dm = dm[tgt]

                #only model considerable price shifts (>2 ticks)
                tgt = abs(dm) > 2
                BP = BP.loc[tgt]
                dm = dm[tgt]
                
                #split into train/test (seed to get same result every time)
                BP_train, BP_test, dm_train, dm_test = \
                    train_test_split(BP, dm, test_size = 0.2, random_state = 1)

                #logistic regression
                lg = LogisticRegression(class_weight = 'balanced')
                lgfit = lg.fit(BP_train.values.reshape(-1, 1), dm_train>0)
                scores.loc[str(wghts), 'sh' + str(shift) + ', ' + tn] = lgfit.score(BP_test.values.reshape(-1, 1), dm_test>0)
                lgfits.loc[str(wghts), 'sh' + str(shift) + ', ' + tn] = lgfit
                
                #linear regression
                lr = LinearRegression()
                lrfit = lr.fit(BP_train.values.reshape(-1, 1), dm_train)
                r2s.loc[str(wghts), 'sh' + str(shift) + ', ' + tn] = lrfit.score(BP_test.values.reshape(-1, 1), dm_test)
                lrfits.loc[str(wghts), 'sh' + str(shift) + ', ' + tn] = lrfit
                
    return scores, r2s, lgfits, lrfits


### We now calculate scores and R2 for logistic and linear regression respectively for a matrix of values for our 3 parameters, for midprice, micropriceA and micropriceB

In [None]:
aas = np.arange(0, 0.6, 0.1)
ts = [1,2,5,10,15]
shifts = range(0,3)

#mean bp
scores_bp_mipA, r2s_bp_mipA, lgfits_bp_mipA, lrfits_bp_mipA = \
    get_regression_matrix('bp', 'mipA_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)
    
scores_bp_mipB, r2s_bp_mipB, lgfits_bp_mipB, lrfits_bp_mipB = \
    get_regression_matrix('bp', 'mipB_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)
    
scores_bp_mdp, r2s_bp_mdp, lgfits_bp_mdp, lrfits_bp_mdp = \
    get_regression_matrix('bp', 'mdp_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)

In [None]:
#delta bp
scores_bpmove_mipA, r2s_bpmove_mipA, lgfits_bpmove_mipA, lrfits_bpmove_mipA = \
    get_regression_matrix('bp_move', 'mipA_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)
    
scores_bpmove_mipB, r2s_bpmove_mipB, lgfits_bpmove_mipB, lrfits_bpmove_mipB = \
    get_regression_matrix('bp_move', 'mipB_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)
    
scores_bpmove_mdp, r2s_bpmove_mdp, lgfits_bpmove_mdp, lrfits_bpmove_mdp = \
    get_regression_matrix('bp_move', 'mdp_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)

In [None]:
#median bp
scores_bpmed_mipA, r2s_bpmed_mipA, lgfits_bpmed_mipA, lrfits_bpmed_mipA = \
    get_regression_matrix('bp_med', 'mipA_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)
    
scores_bpmed_mipB, r2s_bpmed_mipB, lgfits_bpmed_mipB, lrfits_bpmed_mipB = \
    get_regression_matrix('bp_med', 'mipB_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)
    
scores_bpmed_mdp, r2s_bpmed_mdp, lgfits_bpmed_mdp, lrfits_bpmed_mdp = \
    get_regression_matrix('bp_med', 'mdp_move', aas, ts, shifts, df_list, fname_list, all_prices_ticks)

In [None]:
#plot heatmap of logistic regression scores

def plot_heatmaps(scores, r2s):
    
    f, ax = plt.subplots(nrows = 3, ncols = 2, figsize=(15,17))
    #f.tight_layout()
    plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.44, hspace=0.6)

    ########################### scores ###################################
    gt = [col for col in scores.columns if ('sh1' in col) or ('sh2' in col)]
    sns.heatmap(scores.loc[:,gt], ax=ax[0][0])#, cmap = 'plasma', vmin=vmn, vmax=vmx)
    vmn = ax[0][0].collections[0].colorbar.vmin
    vmx = ax[0][0].collections[0].colorbar.vmax
    
    gt = [col for col in scores.columns if 'sh1' in col]
    sns.heatmap(scores.loc[:,gt], ax=ax[1][0], vmin=vmn, vmax=vmx)
    
    gt = [col for col in scores.columns if 'sh2' in col]
    sns.heatmap(scores.loc[:,gt], ax=ax[2][0], vmin=vmn, vmax=vmx)

    
    ############################ r2s ###################################
    gt = [col for col in r2s.columns if ('sh1' in col) or ('sh2' in col)]
    sns.heatmap(r2s.loc[:,gt], ax=ax[0][1])
    vmn = ax[0][1].collections[0].colorbar.vmin
    vmx = ax[0][1].collections[0].colorbar.vmax
    
    gt = [col for col in r2s.columns if 'sh1' in col]
    sns.heatmap(r2s.loc[:,gt], ax=ax[1][1], vmin=vmn, vmax=vmx)
    
    gt = [col for col in scores.columns if 'sh2' in col]
    sns.heatmap(r2s.loc[:,gt], ax=ax[2][1], vmin=vmn, vmax=vmx)





#### 1) Midprice

We see that constructing book pressure using only first level volumes (i.e. weights of [1,0,0]) produces the best results. 

In general, the shift1 predictions (i.e. using book pressure at current time interval to predict midprice movement one interval ahead) are more accurate than the shift2 (predicting two intervals ahead). This is especially clear when the size of the time intervals increases from 1s to 2s. 

When we use the change in book pressure to predict price movements, the smallest the time interval the best the accuracy. Note that the delta in book pressure is a much better predictor than the absolute bp (logistic regression scores ~0.66 compared to ~0.54).



In [None]:
#midprice
plot_heatmaps(scores_bp_mdp, r2s_bp_mdp)


In [None]:
#using delta book pressure as the predictor
plot_heatmaps(scores_bpmove_mdp, r2s_bpmove_mdp)


As a conclusion, for midprice prediction we will use a 1sec time interval, and combine the bp_move from the last 3 intervals as well as the absolute bp value from the last 2 intervals as a starting point for our fit.

As a secondary fit, we will use a 2sec interval with the last 2 bp_move values and the last bp value.

#### 2) Microprice A

$mipA = (bp1\cdot bv1 + lp1\cdot lv1)/(lv1 + bv1)$


In [None]:
#micropriceA
plot_heatmaps(scores_bp_mipA, r2s_bp_mipA)

In [None]:
plot_heatmaps(scores_bpmove_mipA, r2s_bpmove_mipA)

#### 3) Microprice B

$mipB = (bp1\cdot lv1 + lp1\cdot bv1)/(lv1 + bv1)$


In [None]:
#micropriceB
plot_heatmaps(scores_bp_mipB, r2s_bp_mipB)

In [None]:
plot_heatmaps(scores_bpmove_mipB, r2s_bpmove_mipB)

### Selected fits

#### 1) Midprice fits

a) time interval = 1sec, weightings = [1,0,0], bp_move from 3 previous timesteps and bp from 2 previous

In [None]:
df_list_new = create_price_moves(df_list, fname_list, all_prices_ticks, wghts = [1,0,0])            
data_resample = resample_df(df_list_new, time_sample = '1S')


In [None]:
f,a = plt.subplots(figsize=(10,7))
gt = abs(data_resample['bp_move']) > 0.1
plt.hist(data_resample.loc[gt,'bp_move'], bins=10);
#a.set_xlim([-10,10])

In [None]:
#assign x data
all_x = [data_resample['bp_move'].shift(1), data_resample['bp_move'].shift(2), data_resample['bp_move'].shift(3),\
          data_resample['bp'].shift(1), data_resample['bp'].shift(2)]
#all_x = [data_resample['bp_move'].shift(1), data_resample['bp'].shift(1)]
#all_x = [data_resample['bp_move'].shift(1)]
BP = pd.concat(all_x, axis = 1)

#assign y
dm = data_resample.loc[:,'mdp_move']

#deal with nans
gt = pd.notnull(BP)
BP = BP[gt.all(axis = 1)]
dm = dm[gt.all(axis = 1)]

#only model sensible price shifts (<10 ticks)
tgt = abs(dm) < 100
BP = BP.loc[tgt,:]
dm = dm[tgt]

#only model considerable price shifts (>2 ticks) 
#TODO: what does this mean when we use the classifier in practice though? 
#we will end up always predicting a price change, which is not brilliant
#if we include 0 move though we cannot balance the classes
#we could use something else to assess whether a change is likely to happen and combine?
tgt = abs(dm) > 0.01
BP = BP.loc[tgt,:]
dm = dm[tgt]

#remove bp_move too small
#gt = abs(BP.iloc[:,0]) > 0.1
#BP = BP.loc[gt,:]
#dm = dm[gt]

clas = [-100,-3, -.1, .1, 3, 100]
clas = [-100, -0.001, 0.001, 100]

clas_lab = range(-len(clas)/2+1, len(clas)/2)
dm_cat = pd.cut(dm, clas, labels = clas_lab)

#deal with nans
gt = pd.notnull(dm_cat)
BP = BP.loc[gt,:]
dm_cat = dm_cat[gt]
dm = dm[gt]

#split into train/test (seed to get same result every time)
BP_train, BP_test, dm_cat_train, dm_cat_test, dm_train, dm_test = \
    train_test_split(BP, dm_cat, dm, test_size = 0.2, random_state = 1)



In [None]:
plt.hist(dm_train)

In [None]:
#from sklearn.feature_selection import f_regression, f_classif

#chi2, pval = f_regression(BP_train, dm_train)

#print chi2
#print pval
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    #    print("Normalized confusion matrix")
    #else:
    #    print('Confusion matrix, without normalization')

    #print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def show_confusion_matrix(y_test, y_pred, class_names):    
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    #plt.figure()
    #plot_confusion_matrix(cnf_matrix, classes = class_names,
    #                      title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    #plt.figure()
    #plot_confusion_matrix(cnf_matrix, classes = class_names, normalize=True,
    #                      title='Normalized confusion matrix')

    plt.show()

    


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.svm import LinearSVC
#xr = np.arange(-2,2,0.01)
#yr = np.arange(-1,1,0.01)

#logistic regression
lg = LogisticRegression(class_weight = 'balanced')
#lg = LogisticRegression(class_weight = dict(zip(np.unique(dm_cat_train), [1,0.06,1])))

print 'logistic, balanced, pred: bp_move'
x_train = BP_train.iloc[:,0].values.reshape(-1, 1)
x_test  = BP_test.iloc[:,0].values.reshape(-1, 1)
lgfit = lg.fit(x_train, dm_cat_train)
print classification_report(dm_cat_test, lgfit.predict(x_test)) 
show_confusion_matrix(dm_cat_test, lgfit.predict(x_test), np.unique(dm_cat_train)) 
#plt.plot(xr, (-lgfit.intercept_/lgfit.coef_[0])*np.ones([len(xr),1]))

print 'logistic, balanced, pred: bp_move>0'
x_train = BP_train.iloc[:,0].values.reshape(-1, 1)>0
x_test  = BP_test.iloc[:,0].values.reshape(-1, 1)>0
lgfit = lg.fit(x_train, dm_cat_train)
print classification_report(dm_cat_test, lgfit.predict(x_test))   
show_confusion_matrix(dm_cat_test, lgfit.predict(x_test), np.unique(dm_cat_train)) 


print 'logistic, balanced, pred: bp'
x_train = BP_train.iloc[:,3].values.reshape(-1, 1)
x_test = BP_test.iloc[:,3].values.reshape(-1, 1)
lgfit = lg.fit(x_train, dm_cat_train)
print classification_report(dm_cat_test, lgfit.predict(x_test))
show_confusion_matrix(dm_cat_test, lgfit.predict(x_test), np.unique(dm_cat_train)) 


#plt.plot((-lgfit.intercept_/lgfit.coef_[0] )*np.ones([len(yr),1]), yr, 'r')

print 'logistic, balanced, pred: bp_move, bp'
x_train = BP_train.iloc[:,[0,3]]
x_test  = BP_test.iloc[:,[0,3]]
lgfit = lg.fit(x_train, dm_cat_train)
print classification_report(dm_cat_test, lgfit.predict(x_test)) 
show_confusion_matrix(dm_cat_test, lgfit.predict(x_test), np.unique(dm_cat_train)) 


print 'logistic, balanced, pred: bp_move>0, bp'
x_train = pd.concat([BP_train.iloc[:,0]>0, BP_train.iloc[:,3]], axis=1)
x_test  = pd.concat([BP_test.iloc[:,0]>0, BP_test.iloc[:,3]], axis=1)
lgfit = lg.fit(x_train, dm_cat_train)
print classification_report(dm_cat_test, lgfit.predict(x_test))  
show_confusion_matrix(dm_cat_test, lgfit.predict(x_test), np.unique(dm_cat_train)) 


#plt.plot(xr, (-lgfit.intercept_[0] - lgfit.coef_[0][0]*xr)/lgfit.coef_[0][1], 'g')

print 'SVC, balanced, pred: bp_move, bp'
lsvc = LinearSVC(class_weight = 'balanced')
x_train = BP_train.iloc[:,[0,3]]
x_test  = BP_test.iloc[:,[0,3]]
lsvcfit = lsvc.fit(x_train, dm_cat_train)
print classification_report(dm_cat_test, lsvcfit.predict(x_test))  
show_confusion_matrix(dm_cat_test, lsvcfit.predict(x_test), np.unique(dm_cat_train)) 



Basically, everything gives us similar answers (when similarly balanced), and all methods struggle to distinguish within different negative and positive classes.

In [None]:
f,ax = plt.subplots(figsize=(12,10))
cols = ['b', 'r', 'g', 'y', 'm']
labels = ['bp_move_sh1', 'bp_move_sh2', 'bp_move_sh3', 'bp_sh1', 'bp_sh2']
for i in range(0,3):

    sns.regplot(BP.iloc[:,i], dm, x_bins = 20, color = cols[i], ax = ax, label = labels[i], order=3)
ax.legend()

f,ax = plt.subplots(figsize=(12,10))
cols = ['b', 'r', 'g', 'y', 'm']
labels = ['bp_move_sh1', 'bp_move_sh2', 'bp_move_sh3', 'bp_sh1', 'bp_sh2']
for i in range(3,5):

    sns.regplot(BP.iloc[:,i], dm, x_bins = 20, color = cols[i], ax = ax, label = labels[i], order=3)
ax.legend()

In [None]:
xy = pd.concat([BP.iloc[:,[0,3]], pd.cut(dm,2)], axis = 1)

sns.lmplot(x = 'bp_move', y = 'bp', hue = 'mdp_move', data = xy.sample(frac=0.1), scatter = True, size=7, aspect = 2)


sns.lmplot(x = 'bp_move', y = 'bp', hue = 'mdp_move', x_bins = 40, data = xy.sample(frac=1), size=7, aspect = 2)

sns.lmplot(x = 'bp', y = 'bp_move', hue = 'mdp_move', data = xy.sample(frac=0.1), scatter = True, size=7, aspect = 2)


sns.lmplot(x = 'bp', y = 'bp_move', hue = 'mdp_move', x_bins = 40, data = xy.sample(frac=1), size=7, aspect = 2)


In [None]:
xy = pd.concat([BP.iloc[:,[0,3]], pd.cut(dm,6)], axis = 1)

sns.lmplot(x = 'bp_move', y = 'bp', hue = 'mdp_move', data = xy.sample(frac=0.1), scatter = True, size=7, aspect = 2)


sns.lmplot(x = 'bp_move', y = 'bp', hue = 'mdp_move', x_bins = 40, data = xy.sample(frac=1), size=7, aspect = 2)

sns.lmplot(x = 'bp', y = 'bp_move', hue = 'mdp_move', data = xy.sample(frac=0.1), scatter = True, size=7, aspect = 2)


sns.lmplot(x = 'bp', y = 'bp_move', hue = 'mdp_move', x_bins = 40, data = xy.sample(frac=1), size=7, aspect = 2)


In [None]:
gt = (abs(BP.iloc[:,0]) > -0.001) 


x1 = BP.iloc[:,0]
x = pd.concat([x1, BP.iloc[:,3]], axis=1)


y = pd.cut(dm.loc[gt],6)
xy = pd.concat([x.loc[gt,:], y], axis = 1)

sns.jointplot(x='bp_move', y='bp', data=xy)