In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn import mixture
from sklearn import metrics
import sys
from scipy import stats
%matplotlib inline

In [2]:
def prepareTrainSetRec():
    '''Load the labelled training orders and rebuild their HMM features.

    The features are recomputed on every call with the same DataPrep steps
    (computeEWAVBackward followed by HMMPrep) that the test data goes through.

    Returns the HMM-ready frame with an integer 'state' column:
        0 = buy,  IsSpoof False      1 = sell, IsSpoof False
        2 = buy,  IsSpoof True       3 = sell, IsSpoof True
    Rows matching none of the four combinations keep state 0.
    '''
    labelled = pd.read_csv('labeled_sina.csv')
    prep = DataPrep()
    features = prep.computeEWAVBackward(labelled)
    hmmdata = prep.HMMPrep(features.copy())

    # (side, IsSpoof, state code) -- applied in this order on top of the default 0
    state_codes = [('B',False,0),('S',False,1),('B',True,2),('S',True,3)]
    hmmdata['state'] = 0
    for side,spoof,code in state_codes:
        matches = (hmmdata['side']==side)&(hmmdata['IsSpoof']==spoof)
        hmmdata.loc[matches,'state'] = code
    return hmmdata

In [3]:
def toMS(x):
    '''Return the time of day of x as a count of microseconds since midnight.

    x only needs .hour, .minute, .second and .microsecond attributes.
    Despite the "MS" in the name the unit is microseconds, not milliseconds.
    '''
    whole_seconds = (x.hour*60+x.minute)*60+x.second
    return whole_seconds*1000000+x.microsecond
def timeDelta2MS(x):
    '''Collapse the hours/minutes/seconds/milliseconds/microseconds fields of x
    into a single count of microseconds.

    x only needs those five attributes; any other field (e.g. days) is ignored.
    '''
    total_seconds = (x.hours*60+x.minutes)*60+x.seconds
    total_millis = total_seconds*1000+x.milliseconds
    return total_millis*1000+x.microseconds
    
class DataPrep:
    ''' Turn raw order records into the per-event feature table fed to the HMM.

    The parameters:
    1. isLean: whether we are dealing with data with more information. isLean=False, we have a richer data like the Training set. isLean=True, we have a lean data --only a brief summary for each order
    2. decay_factor: a multiple of T_M (median trading gap); the decay time constant is decay_factor*T_M
    3. linger_factor: this is the multiple of the median trading gap. With this parameter, we will ignore all orders placed more than linger_factor*T_M ago
    '''
    def __init__(self,isLean=False,linger_factor = 40,decay_factor=5):
        self.isLean = isLean
        self.linger_factor = linger_factor
        self.decay_factor = decay_factor
        self.medianT = 0 #the median of the trade interval, set by computeEWAVBackward

    def processDatafile(self,filename):
        '''Read a CSV file and run processData on its content.'''
        return self.processData(pd.read_csv(filename))

    def processData(self,data,verbose=1):
        '''Run the full pipeline: clean -> prepare -> EWAV features -> HMMPrep.

        data: raw order frame (lean or rich layout, chosen by self.isLean).
        verbose: print a progress line per stage when > 0.
        Returns (allorder, hmmdata): the event table with every feature column,
        and the reduced frame produced by HMMPrep.
        Raises ValueError (from computeEWAVBackward) when fewer than two events remain.
        '''
        if verbose>0:
            print('----Data cleaning----')
        if self.isLean:
            allorder = self.cleanDataLean(data)
        else:
            allorder = self.cleanData(data)

        events = self.prepare(allorder)

        if verbose>0:
            print('---- Feature calculation----')
        allorder = self.computeEWAVBackward(events)

        if verbose>0:
            print('----- Prepare for HMM------')
        hmmdata = self.HMMPrep(allorder.copy())

        return allorder,hmmdata

    def computeEWAVBackward(self,data):
        '''Add backward-looking exponentially weighted volume features to data.

        data needs the columns 'time diff', 'cancelled buy', 'cancelled sell',
        'exec buy' and 'exec sell', with rows already in time order. For each of
        the four quantity columns the recursion is
            ewav[i] = qty[i] + ewav[i-1]*coef[i]
        where coef[i] = exp(-gap[i]/T) when gap[i] <= linger and 0 otherwise,
        T = decay_factor*median gap and linger = linger_factor*median gap.
        Values are floored at machine epsilon so the ratio features stay finite.

        The frame is modified in place (new columns, NaN gaps filled) and returned;
        self.medianT is set to the median gap.
        Raises ValueError when data has fewer than two rows.
        '''
        if len(data)<2:
            raise ValueError('data too short')

        data['time diff'] = data['time diff'].fillna(24*3600*1000000)
        self.medianT = np.median(data['time diff'])
        T = self.medianT*self.decay_factor
        linger = self.medianT*self.linger_factor
        epsilon = sys.float_info.epsilon

        # Work on plain arrays: the recursion is positional, so it no longer
        # depends on the frame carrying a 0..n-1 index, and it avoids one
        # DataFrame lookup per cell.
        gaps = data['time diff'].values.astype(float)
        nrow = len(gaps)
        coef = np.zeros(nrow)
        for ii in range(1,nrow):
            if gaps[ii]<=linger:
                # T == 0 means the median gap is zero; events that are
                # simultaneous with their predecessor then carry over undecayed
                # instead of producing 0/0 = NaN.
                coef[ii] = math.exp(-gaps[ii]/T) if T>0 else 1.0

        sources = (('ewav_back canc buy','cancelled buy'),
                   ('ewav_back canc sell','cancelled sell'),
                   ('ewav_back exec buy','exec buy'),
                   ('ewav_back exec sell','exec sell'))
        for target,source in sources:
            qty = data[source].values.astype(float)
            acc = np.empty(nrow)
            # NOTE(review): row 0 is seeded with epsilon, so the first event's own
            # quantity never enters the average -- kept as in the original.
            acc[0] = epsilon
            for ii in range(1,nrow):
                acc[ii] = qty[ii]+acc[ii-1]*coef[ii]
            # floor after the recursion; NaN also falls back to epsilon
            data[target] = np.where(acc>epsilon,acc,epsilon)

        data['ewav_back buy/sell'] = data['ewav_back canc buy']/data['ewav_back canc sell']
        data['log ewav_back buy/sell'] = data['ewav_back buy/sell'].map(math.log)
        data['ewav_back sell/buy'] = data['ewav_back canc sell']/data['ewav_back canc buy']

        data['ewav_back buy exec+canc'] = data['ewav_back exec buy'] + data['ewav_back canc buy']
        data['ewav_back buy exec/total']=  data['ewav_back exec buy']/data['ewav_back buy exec+canc']

        data['ewav_back sell exec+canc'] = data['ewav_back exec sell'] + data['ewav_back canc sell']
        data['ewav_back sell exec/total'] = data['ewav_back exec sell']/data['ewav_back sell exec+canc']

        return data

    def cleanDataLean(self,data):
        '''Clean a lean (summary-per-row) order frame.

        Expects the columns q_exec, q_new, avg_prc, orderid, execution_time,
        cancel_entry_time and order_entry_time. Returns a new frame indexed by
        orderid and sorted by order_entry_time, with the time columns parsed and
        'prc*qty', 'execution_time_last_ms', 'order_entry_time_ms' and 'q_cancel'
        added. The input frame is left untouched.
        '''
        allorder = data.copy()
        allorder['q_exec'] = allorder['q_exec'].fillna(0)
        for tcol in ('execution_time','cancel_entry_time','order_entry_time'):
            allorder[tcol] = allorder[tcol].map(pd.to_datetime)

        allorder['prc*qty'] = allorder['avg_prc']
        allorder['execution_time_last_ms'] = allorder['execution_time'].map(toMS)
        allorder['order_entry_time_ms'] = allorder['order_entry_time'].map(toMS)

        allorder['q_cancel'] = allorder['q_new'] - allorder['q_exec']
        allorder = allorder.set_index('orderid')
        return allorder.sort_values('order_entry_time')

    def cleanData(self,data):
        '''Clean a rich order frame that has one row per order event.

        Rows are split by order_type ('NEW ORDER', 'EXECUTION', 'CANCEL'). The new
        orders form the base table (indexed by orderid); executed quantity, the
        average execution price and the first/last execution times are aggregated
        per order and joined on, followed by the cancel times. Returns the result
        sorted by order_entry_time. The input frame is left untouched.
        '''
        data = data.copy()
        data['q_exec'] = data['q_exec'].fillna(0)
        for tcol in ('execution_time','cancel_entry_time','order_entry_time'):
            data[tcol] = data[tcol].map(pd.to_datetime)
        data['prc*qty'] = data['q_exec']*data['prc_exec']

        neworder = data.loc[data['order_type']=='NEW ORDER',:]
        exeorder = data.loc[data['order_type']=='EXECUTION',:]
        canorder = data.loc[data['order_type']=='CANCEL',:]

        allorder = neworder[['id','orderid','symbol','q_new','price','order_entry_time','date','time','side']].set_index('orderid')
        gp = exeorder.groupby('orderid')
        tmp = gp[['q_exec','prc*qty']].sum()
        tmp['avg exe_prc'] = tmp['prc*qty']/tmp['q_exec']
        del tmp['prc*qty']
        allorder = allorder.join(tmp)
        # list-style agg + rename: the dict-renaming form is gone from current pandas
        exe_times = gp['execution_time'].agg(['min','max']).rename(
            columns={'min':'first_execution_time','max':'last_execution_time'})
        allorder = allorder.join(exe_times)
        allorder['execution_time_first_ms'] = allorder['first_execution_time'].map(toMS)
        allorder['execution_time_last_ms'] = allorder['last_execution_time'].map(toMS)
        allorder['order_entry_time_ms'] = allorder['order_entry_time'].map(toMS)
        allorder = allorder.join(canorder.set_index('orderid')[['cancel_entry_time','canc_time']])
        allorder['q_exec'] = allorder['q_exec'].fillna(0)
        allorder['q_cancel'] = allorder['q_new'] - allorder['q_exec']
        return allorder.sort_values('order_entry_time')

    def prepare(self,allorder):
        ''' Merge filled and fully cancelled orders into one time-ordered event table.

            Filled orders (q_exec > 0) are stamped with their last execution time,
            fully cancelled orders (q_cancel == q_new) with their entry time;
            partially filled orders therefore only appear as fills. Each event
            carries 'exec buy', 'exec sell', 'cancelled buy' and 'cancelled sell'
            quantities. Per date the function adds the running 'inventory'
            (cumulative exec buy - exec sell) and 'time diff' (microseconds since
            the previous event; the first event of a date gets one full day).
            Columns missing from allorder (e.g. 'price' or 'last_execution_time'
            on the lean path) come out as NaN.
        '''
        fill_cols = ['date','price','side','last_execution_time','execution_time_last_ms','q_exec']
        # reindex (not .loc with a label list) so absent columns become NaN
        # instead of raising KeyError; it also yields an independent frame.
        fillorder = allorder.loc[allorder['q_exec']>0].reindex(columns=fill_cols)
        fillorder['exec buy'] = fillorder['q_exec']
        fillorder['exec sell'] = fillorder['q_exec']
        fillorder.loc[fillorder['side']=='B','exec sell'] = 0
        fillorder.loc[fillorder['side']!='B','exec buy'] = 0
        fillorder = fillorder.rename(columns={'execution_time_last_ms':'microsecond','last_execution_time':'time'})

        canc_cols = ['date','price','side','order_entry_time','order_entry_time_ms','q_cancel']
        canorder = allorder.loc[allorder['q_cancel']==allorder['q_new']].reindex(columns=canc_cols)
        canorder['cancelled buy'] = canorder['q_cancel']
        canorder['cancelled sell'] = canorder['q_cancel']
        canorder.loc[canorder['side']=='B','cancelled sell'] = 0.0
        canorder.loc[canorder['side']!='B','cancelled buy'] = 0.0
        canorder = canorder.rename(columns={'order_entry_time_ms':'microsecond','order_entry_time':'time'})

        fillorder['cancelled buy'] = 0
        fillorder['cancelled sell'] = 0
        canorder['exec buy'] = 0
        canorder['exec sell'] = 0
        del canorder['q_cancel']
        del fillorder['q_exec']

        data = pd.concat([fillorder,canorder])
        data = data.sort_values(['date','microsecond'])
        data = data.reset_index()

        # per-date running inventory and inter-event gap
        net = (data['exec buy']-data['exec sell'])*1.
        data['inventory'] = net.groupby(data['date']).cumsum()
        data['time diff'] = data.groupby('date')['microsecond'].diff()*1.
        data['time diff'] = data['time diff'].fillna(24*3600*1000000)
        return data

    def _sideTimeDiffs(self,trades):
        '''Fill the TimeDiff_* columns of trades (executions of a single side).

        TimeDiff_back / TimeDiff_frwd are the absolute gaps in 'microsecond' to the
        previous / next execution on the same date; gaps that do not exist are
        replaced by the largest observed gap of that column. TimeDiff_min is the
        smaller of the two. An empty frame is returned unchanged.
        '''
        if len(trades)==0:
            return trades
        stamps = trades.groupby('date')['microsecond']
        trades['TimeDiff_back'] = stamps.diff(1).abs()
        trades['TimeDiff_frwd'] = stamps.diff(-1).abs()
        for gapcol in ('TimeDiff_frwd','TimeDiff_back'):
            trades[gapcol] = trades[gapcol].fillna(trades[gapcol].max())
        trades['TimeDiff_min'] = np.minimum(trades['TimeDiff_back'],trades['TimeDiff_frwd'])
        return trades

    def HMMPrep(self,df):
        '''Reduce the feature table to the rows and columns the HMM works on.

        Keeps the event columns plus the cancel-volume features (and 'IsSpoof'
        when present), replaces 'ewav_back buy/sell' by its natural log, keeps
        only executed events, adds the TimeDiff_back / TimeDiff_frwd /
        TimeDiff_min columns per side, parses 'date' and returns the rows with
        side 'B' or 'S' sorted by index. The input frame is not modified.
        '''
        col = ['orderid','cancelled buy','exec sell','cancelled sell','exec buy','microsecond','price','side','time','date','inventory','time diff',
         'ewav_back canc buy','ewav_back canc sell','ewav_back exec buy','ewav_back exec sell','ewav_back buy/sell','ewav_back sell/buy']
        if 'IsSpoof' in df.columns:
            col = col+['IsSpoof']
        # explicit copy: everything below writes into this frame
        df = df[col].copy()
        del df['ewav_back exec buy']
        del df['ewav_back exec sell']

        # clean the data for ewav_back canc buy/sell and sell/buy
        # buy/sell will be just inverse of sell/buy, so we use one column buy/sell
        ratio = 'ewav_back buy/sell'
        df.loc[(df['ewav_back canc buy']<1e-5)&(df['ewav_back canc sell']<1e-5),ratio]=1
        usable = (df[ratio]>0)&(df[ratio]<np.inf)
        maxbs = df.loc[usable,ratio].max()
        df.loc[df[ratio]==np.inf,ratio] = maxbs
        df.loc[df[ratio]==0,ratio] = 1/maxbs
        df[ratio] = df[ratio].map(np.log)

        ## Get the time difference, seems not contributing for now
        df['TimeDiff_back'] = np.nan
        df['TimeDiff_frwd'] = np.nan
        df['TimeDiff_min'] = np.nan

        df = df.loc[(df['exec sell']>0)|(df['exec buy']>0),:].copy()
        if len(df)==0:
            return df

        buy = self._sideTimeDiffs(df.loc[df['side']=='B',:].copy())
        sell = self._sideTimeDiffs(df.loc[df['side']=='S',:].copy())

        newdf = pd.concat([buy,sell])
        newdf['date'] = newdf['date'].map(pd.to_datetime)
        return newdf.sort_index()

In [4]:
from sklearn.ensemble import RandomForestClassifier
class RFModel:
    '''Two-class random forest on a single scalar feature.

    fit() trains on two samples and remembers which caller-side label belongs to
    which internal class (0 for the first sample, 1 for the second); score() and
    prob() then report the (log-)probability of either label.
    '''
    def __init__(self,n_estimators,max_depth,random_state=None):
        '''n_estimators / max_depth are passed to RandomForestClassifier.
        random_state (optional) seeds the forest so that training is repeatable;
        the default None keeps the unseeded behaviour.
        '''
        self.rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
        self.label_map = {}   # internal class (0/1) -> caller label
        self.rev_map = {}     # caller label -> internal class (0/1)

    def fit(self,x1,label1,x2,label2,show=False):
        ''' Train the forest to separate the values in x1 from those in x2.

        x1, x2: one scalar feature per sample (1-d arrays or single-column frames).
        label1, label2: hashable labels that score()/prob() accept afterwards.
        show: when True, plot the fitted class probabilities via showResult.
        '''
        n1,n2 = len(x1),len(x2)
        label = np.array([0]*n1+[1]*n2)
        self.label_map = {0:label1,1:label2}
        self.rev_map ={label1:0,label2:1}
        data = np.concatenate((x1,x2)).reshape((n1+n2,1))
        self.rf.fit(data,label)
        if show:
            self.showResult(x1,label1,x2,label2)

    def showResult(self,x1,label1,x2,label2):
        '''Plot normalised histograms of both samples and the predicted class
        probabilities over the range [-50, 50).
        '''
        # NOTE(review): `normed` is the keyword this notebook was written against;
        # newer matplotlib releases spell it `density`.
        plt.hist(np.array(x1),bins=100,alpha=0.5,normed=True)
        plt.hist(np.array(x2),bins=100,alpha=0.5,normed=True)
        tt = np.arange(-50,50,0.05)
        tt = tt.reshape((len(tt),1))
        proba = self.rf.predict_proba(tt)
        plt.plot(tt,proba[:,0],color='b')
        plt.plot(tt,proba[:,1],color='r')
        plt.show()

    def score(self,x,label):
        ''' give score in log prob for the class denoted by label
        (-inf where the forest assigns probability 0)
        '''
        proba = self.rf.predict_proba(np.array(x).reshape((len(x),1)))
        # log(0) = -inf is a legitimate score here; silence the numpy warning only
        with np.errstate(divide='ignore'):
            return np.log(proba[:,self.rev_map[label]])

    def prob(self,x,label):
        ''' give the plain (not log) probability for the class denoted by label
        '''
        proba = self.rf.predict_proba(np.array(x).reshape((len(x),1)))
        return proba[:,self.rev_map[label]]
    
    
class RFWrapper():
    '''Bind a fitted model to one label so it can be queried with x alone.'''
    def __init__(self,rf,label):
        self.label = label
        self.rf = rf

    def score(self,x):
        '''Delegate to rf.score for the bound label.'''
        model,target = self.rf,self.label
        return model.score(x,target)

    def prob(self,x):
        '''Delegate to rf.prob for the bound label.'''
        model,target = self.rf,self.label
        return model.prob(x,target)

In [5]:
class HMM:
    '''Hidden Markov model over trade states with random-forest emission scores.

    The side masking in stateProb is written for the four states used in this
    notebook: 0/1 = buy/sell (not spoof), 2/3 = buy/sell (spoof).
    '''
    def __init__(self,nState,TDFeaSet,featureSet,useAllFea,useDPGMM=True):
        '''
        recommended value for featureSet=['ewav_back buy/sell']
        TDFeaSet, useAllFea and useDPGMM are only stored; nothing in this class
        reads them.
        '''
        self.TDFeaSet = TDFeaSet
        self.featureSet = featureSet
        self.useDPGMM = useDPGMM
        self.useAllFea = useAllFea
        self.nState = nState
        self.tp = None          # transition probabilities, Series indexed by (state, next state)
        self.pi = None          # initial state probabilities, array of length nState
        self.TDmodel = []
        self.RatioModel = []    # one emission scorer per state

    def train(self,df,show=False):
        '''Estimate pi, the transition probabilities and the emission models.

        df needs the columns 'state', 'date' and those in self.featureSet, with
        rows in sequence order. df itself is not modified. `show` is accepted
        for compatibility and unused.
        '''
        # States absent from the training data get probability 0 so that pi
        # always has nState entries.
        share = df.groupby('state').size()*1.0/len(df)
        self.pi = np.array(share.reindex(list(range(self.nState)),fill_value=0))

        # NOTE(review): shift(-1) pairs the last row of one date with the first
        # row of the next; the per-date counts are summed again right below, so
        # such cross-date pairs are counted as transitions.
        pairs = df.assign(**{'next state':df['state'].shift(-1)})
        gp = pairs.groupby(['state','next state','date']).size()
        aa = gp.groupby(level=[0,1]).sum()
        bb = gp.groupby(level=0).sum()*1.
        self.tp = aa.div(bb,level=0)

        print('---- Transition prob:')
        print(self.tp)

        ### RF model for ratio #############
        ratio0 = df.loc[df['state']==0,self.featureSet]
        ratio2 = df.loc[df['state']==2,self.featureSet]
        rf_buy = RFModel(n_estimators=10,max_depth=2)
        rf_buy.fit(ratio0,0,ratio2,2)

        ratio1 = df.loc[df['state']==1,self.featureSet]
        ratio3 = df.loc[df['state']==3,self.featureSet]
        rf_sell = RFModel(n_estimators=10,max_depth=2)
        rf_sell.fit(ratio1,1,ratio3,3)

        self.RatioModel=[RFWrapper(rf_buy,0),RFWrapper(rf_sell,1),RFWrapper(rf_buy,2),RFWrapper(rf_sell,3)]

    def _denseTransition(self):
        '''Return self.tp as an nState x nState array [from, to]; transitions that
        were never observed get probability 0.'''
        dense = np.zeros((self.nState,self.nState))
        for (src,dst),p in zip(self.tp.index,np.asarray(self.tp,dtype=float)):
            i,j = int(src),int(dst)
            if 0<=i<self.nState and 0<=j<self.nState:
                dense[i,j] = p
        return dense

    @staticmethod
    def _logSumExp(v):
        '''log(sum(exp(v))) computed stably; returns the max itself when it is
        not finite (e.g. every entry is -inf).'''
        top = np.max(v)
        if not np.isfinite(top):
            return top
        return top+np.log(np.sum(np.exp(v-top)))

    def stateProb(self,obs):
        '''Give the estimate of the probability of each state at each time instance

        obs needs a 'side' column ('B'/'S') and the columns in self.featureSet.
        Returns an (nState, len(obs)) array whose columns sum to 1 (forward-backward
        posteriors); an empty obs yields an (nState, 0) array.
        '''
        nState = self.nState
        nObs = len(obs)
        if nObs==0:
            return np.zeros((nState,0))

        feats = np.array(obs[self.featureSet])
        # emission log-scores, one row per state
        emis = np.array([np.asarray(rt.score(feats),dtype=float).ravel() for rt in self.RatioModel])

        # log(0) = -inf is intended for impossible transitions / absent states
        with np.errstate(divide='ignore'):
            logtp = np.log(self._denseTransition())
            logpi = np.log(np.asarray(self.pi,dtype=float))

        isbuy = (obs['side']=='B').values.astype(int)
        issell = (obs['side']=='S').values.astype(int)
        validState = np.array([isbuy,issell,isbuy,issell]) # 0 means not valid
        dumb = -1e5 #used to fill for np.log(zero)

        alpha = np.zeros((nState,nObs))
        beta = np.zeros((nState,nObs))

        # forward pass; the side mask applies to the first observation as well
        alpha[:,0] = np.where(validState[:,0]==0,dumb,emis[:,0]+logpi)
        for ii in range(1,nObs):
            for kk in range(nState):
                if validState[kk,ii]==0:
                    alpha[kk,ii] = dumb
                else:
                    alpha[kk,ii] = self._logSumExp(alpha[:,ii-1]+logtp[:,kk])+emis[kk,ii]

        # backward pass; invalid states of the last observation are masked too
        beta[:,nObs-1] = np.where(validState[:,nObs-1]==0,dumb,0.0)
        for ii in range(nObs-2,-1,-1):
            for kk in range(nState):
                if validState[kk,ii]==0:
                    beta[kk,ii] = dumb
                else:
                    beta[kk,ii] = self._logSumExp(logtp[kk]+beta[:,ii+1]+emis[:,ii+1])

        gamma = alpha+beta # not exactly the gamma
        gamma = np.exp(gamma-np.max(gamma,0))   # shift per column before exponentiating
        return gamma/np.sum(gamma,0)

    def predict(self,df):
        ''' Return a copy of df with three extra columns: 'pred' (most probable
        state), 'pred_prob' (its posterior probability) and 'predSpoofing'
        (True when pred > 1, i.e. state 2 or 3).
        '''
        data = df.copy()
        prob = self.stateProb(data)
        data['pred'] = np.argmax(prob,0)
        data['pred_prob'] = np.max(prob,0)
        data['predSpoofing'] = data['pred'].map(lambda x:x>1)
        return data

In [21]:
def process_dera(df):
    '''Expand one-row-per-order summaries into NEW ORDER / EXECUTION / CANCEL rows.

    Columns are renamed to the names DataPrep expects (order_entry_time, q_new,
    q_exec, execution_time, cancel_entry_time). Every order yields a 'NEW ORDER'
    row, an 'EXECUTION' row when fill_rate > 0 and a 'CANCEL' row when
    fill_rate != 1; only the CANCEL row carries q_canc = q_new*(1-fill_rate),
    the others have q_canc = 0. 'avg_prc' is set to 0.0 on every row.

    Returns a new frame sorted by orderid (ascending) and order_type (descending,
    i.e. NEW ORDER, EXECUTION, CANCEL). The input frame is left untouched.
    '''
    renames = {'order_time':'order_entry_time','order_shares':'q_new','exec_shares':'q_exec',
               'exec_last_time':'execution_time','cxl_time':'cancel_entry_time'}
    # rename without inplace: work on a fresh frame instead of the caller's
    # (possibly sliced) one
    base = df.rename(columns=renames)
    base['q_canc'] = 0.0

    no = base.copy()
    no['order_type'] = 'NEW ORDER'

    # explicit copies: the filtered frames are written to right below
    eo = base[base['fill_rate']>0].copy()
    eo['order_type'] = 'EXECUTION'

    co = base[base['fill_rate']!=1].copy()
    co['order_type'] = 'CANCEL'
    co['q_canc'] = co['q_new'] * (1 - co['fill_rate'])

    ret = pd.concat([no,eo,co])
    ret['avg_prc'] = 0.0
    return ret.sort_values(['orderid','order_type'],ascending=[True,False])

In [7]:
# Build the labelled training features, then fit the 4-state HMM on them.
traindf = prepareTrainSetRec()
mm = HMM(
    nState=4,
    TDFeaSet=['TimeDiff_frwd','TimeDiff_back'],
    featureSet=['ewav_back buy/sell'],
    useAllFea=False,
    useDPGMM=True,
)
mm.train(traindf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

---- Transition prob:
state  next state
0      0.0           0.636364
       1.0           0.227273
       2.0           0.027273
       3.0           0.109091
1      0.0           0.208633
       1.0           0.625899
       2.0           0.107914
       3.0           0.057554
2      0.0           0.094340
       1.0           0.245283
       2.0           0.622642
       3.0           0.037736
3      0.0           0.050360
       1.0           0.100719
       2.0           0.007194
       3.0           0.841727
dtype: float64


In [22]:
# Order summaries to be scored. The cells below read the 'symbol' column and pass
# the rows to process_dera, which expects order_time, order_shares, exec_shares,
# exec_last_time, cxl_time, fill_rate and orderid.
df = pd.read_csv('sum.csv')

In [23]:
# Score every symbol with the trained HMM and count the trades flagged as spoofing.
# Symbols with fewer than 3 HMM-ready trades are skipped.
dp = DataPrep(True)
frames = []          # per-symbol predictions, concatenated once after the loop
total_spoof = 0
total_trades = 0

for sym in df['symbol'].unique():
    # explicit copy so the helpers never write into a slice of df
    data = process_dera(df[df['symbol']==sym].copy())
    xx,hmmdata = dp.processData(data,verbose=0)
    if len(hmmdata)>=3:
        tmp = mm.predict(hmmdata)
        cur_spoof = int(tmp['predSpoofing'].sum())
        total_spoof += cur_spoof
        frames.append(tmp)
        total_trades += len(hmmdata)
        print('ID={}, number of spoofing trade = {}, total trades = {}'.format(sym,cur_spoof,len(hmmdata)))

# one concat instead of growing a DataFrame inside the loop
res = pd.concat(frames) if frames else pd.DataFrame()

print('total finding ={}'.format(total_spoof))
print('total trades = {}'.format(total_trades))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

ID=DELL, number of spoofing trade = 0, total trades = 2320
ID=MCP, number of spoofing trade = 0, total trades = 307
ID=VISN, number of spoofing trade = 0, total trades = 105
ID=EBAY, number of spoofing trade = 0, total trades = 123
ID=ABT, number of spoofing trade = 0, total trades = 364
ID=JDSU, number of spoofing trade = 0, total trades = 155
ID=GBDC, number of spoofing trade = 0, total trades = 109
ID=CA, number of spoofing trade = 0, total trades = 14
ID=CLSN, number of spoofing trade = 0, total trades = 985
ID=NRGM, number of spoofing trade = 0, total trades = 45
ID=WMC, number of spoofing trade = 0, total trades = 6
ID=QCOR, number of spoofing trade = 0, total trades = 89
ID=LINE, number of spoofing trade = 0, total trades = 19
ID=GSVC, number of spoofing trade = 0, total trades = 85
ID=RSX, number of spoofing trade = 0, total trades = 84
ID=CAJ, number of spoofing trade = 0, total trades = 9
ID=AFL, number of spoofing trade = 0, total trades = 9
ID=IFT, number of spoofing trade 

KeyboardInterrupt: 

In [17]:
# Expand the whole table into NEW ORDER / EXECUTION / CANCEL rows for inspection.
ret = process_dera(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,date,orderid,id,order_entry_time,side,symbol,account,q_new,orderstatus,tif,q_exec,execution_time,explicit_cxl_time,fill_rate,auto_cxl_time,cancel_entry_time,hidden,q_canc,order_type,avg_prc
0,2013-01-02 00:00:00.000,4605204951,KFERGUSON,1900-01-01 07:00:58.126,S,DELL,5AO05128,1000,UR OUT,DAY,0.0,,1900-01-01 07:32:22.130,0.000000,,1900-01-01 07:32:22.130,False,0.0,NEW ORDER,0.0
0,2013-01-02 00:00:00.000,4605204951,KFERGUSON,1900-01-01 07:00:58.126,S,DELL,5AO05128,1000,UR OUT,DAY,0.0,,1900-01-01 07:32:22.130,0.000000,,1900-01-01 07:32:22.130,False,1000.0,CANCEL,0.0
1,2013-01-02 00:00:00.000,4605204952,KFERGUSON,1900-01-01 07:01:04.696,S,DELL,5AO05128,1000,UR OUT,DAY,0.0,,1900-01-01 07:28:24.261,0.000000,,1900-01-01 07:28:24.261,False,0.0,NEW ORDER,0.0
1,2013-01-02 00:00:00.000,4605204952,KFERGUSON,1900-01-01 07:01:04.696,S,DELL,5AO05128,1000,UR OUT,DAY,0.0,,1900-01-01 07:28:24.261,0.000000,,1900-01-01 07:28:24.261,False,1000.0,CANCEL,0.0
2,2013-01-02 00:00:00.000,4605204955,arizer,1900-01-01 07:01:42.863,S,MCP,5AO05134,850,UR OUT,DAY,100.0,1900-01-01 07:01:42.870,1900-01-01 07:01:45.190,0.117647,,1900-01-01 07:01:45.190,False,0.0,NEW ORDER,0.0
2,2013-01-02 00:00:00.000,4605204955,arizer,1900-01-01 07:01:42.863,S,MCP,5AO05134,850,UR OUT,DAY,100.0,1900-01-01 07:01:42.870,1900-01-01 07:01:45.190,0.117647,,1900-01-01 07:01:45.190,False,0.0,EXECUTION,0.0
2,2013-01-02 00:00:00.000,4605204955,arizer,1900-01-01 07:01:42.863,S,MCP,5AO05134,850,UR OUT,DAY,100.0,1900-01-01 07:01:42.870,1900-01-01 07:01:45.190,0.117647,,1900-01-01 07:01:45.190,False,750.0,CANCEL,0.0
3,2013-01-02 00:00:00.000,4605204958,arizer,1900-01-01 07:01:46.975,S,MCP,5AO05134,650,UR OUT,DAY,500.0,1900-01-01 07:02:06.000,1900-01-01 07:02:22.848,0.769231,,1900-01-01 07:02:22.848,False,0.0,NEW ORDER,0.0
3,2013-01-02 00:00:00.000,4605204958,arizer,1900-01-01 07:01:46.975,S,MCP,5AO05134,650,UR OUT,DAY,500.0,1900-01-01 07:02:06.000,1900-01-01 07:02:22.848,0.769231,,1900-01-01 07:02:22.848,False,0.0,EXECUTION,0.0
3,2013-01-02 00:00:00.000,4605204958,arizer,1900-01-01 07:01:46.975,S,MCP,5AO05134,650,UR OUT,DAY,500.0,1900-01-01 07:02:06.000,1900-01-01 07:02:22.848,0.769231,,1900-01-01 07:02:22.848,False,150.0,CANCEL,0.0
