In [3]:
### Importing packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Sklearn:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

### Gradient Boosted CARTs:
import xgboost as xgb

print("--- Setup complete ---")

--- Setup complete ---


In [4]:
### Importing the training dataset
filename = 'xyz.csv'
# Pass the variable itself: the quoted form "filename" would try to open a file
# literally named `filename` and ignore the path configured above.
df = pd.read_csv(filename)

### Exploratory Data Analysis of the dataset

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [6]:
# Report the frame dimensions; one column is subtracted from the column count
# before it is reported as the number of features.
print("Train Dataset has {} features".format(len(df.columns) - 1))
print("Train Dataset has {} rows".format(len(df)))

Train Dataset has 16 features
Train Dataset has 5237980 rows


In [7]:
### Number of distinct stock_id and time_id values:
# dropna=False counts a missing value as its own distinct entry, matching len(unique()).
print("{} different stocks".format(df['stock_id'].nunique(dropna=False)))
print("{} different time_ids".format(df['time_id'].nunique(dropna=False)))

200 different stocks
26455 different time_ids


In [8]:
# Display the time_id column (pandas truncates the middle of long Series).
df['time_id']

0              0
1              0
2              0
3              0
4              0
           ...  
5237975    26454
5237976    26454
5237977    26454
5237978    26454
5237979    26454
Name: time_id, Length: 5237980, dtype: int64

In [9]:
# Show the dtype of every column.
df.dtypes

stock_id                     int64
date_id                      int64
seconds_in_bucket            int64
imbalance_size             float64
imbalance_buy_sell_flag      int64
reference_price            float64
matched_size               float64
far_price                  float64
near_price                 float64
bid_price                  float64
bid_size                   float64
ask_price                  float64
ask_size                   float64
wap                        float64
target                     float64
time_id                      int64
row_id                      object
dtype: object

In [10]:
### Missing-value count per column, largest first:
df.isnull().sum().sort_values(ascending=False)

far_price                  2894342
near_price                 2857180
ask_price                      220
imbalance_size                 220
reference_price                220
matched_size                   220
wap                            220
bid_price                      220
target                          88
time_id                          0
ask_size                         0
stock_id                         0
bid_size                         0
date_id                          0
imbalance_buy_sell_flag          0
seconds_in_bucket                0
row_id                           0
dtype: int64

In [11]:
### Pre-processing functions (1) - Dropping columns with high number of NaN:
def drop_cols(dataframe):
    '''
    Separate the row identifier from the frame and remove the unused columns.

    Returns a tuple (row_id Series, new frame without 'far_price', 'near_price'
    and 'row_id'). The input frame itself is not modified. A KeyError is raised
    by pandas if any of the three columns is absent.
    '''
    row_ids = dataframe['row_id']
    trimmed = dataframe.drop(columns=['far_price', 'near_price', 'row_id'])
    return row_ids, trimmed


### Drop the categorical row_id variable, and far_price and near_price, which have too many NaN values:
id_col, df_processed = drop_cols(df)

# Remaining missing-value counts per column, largest first.
df_processed.isna().sum().sort_values(ascending = False)

imbalance_size             220
reference_price            220
matched_size               220
bid_price                  220
ask_price                  220
wap                        220
target                      88
stock_id                     0
date_id                      0
seconds_in_bucket            0
imbalance_buy_sell_flag      0
bid_size                     0
ask_size                     0
time_id                      0
dtype: int64

In [12]:
### Pre-processing functions (2) - Imputing mean values:


def imputer(df_processed):
    '''
    Fill missing values in the numeric columns with the mean of each column.

    One SimpleImputer (strategy='mean') is fitted per column on the whole
    frame, so the fill value is the global column mean, NOT a per-stock mean.
    Each column is fitted exactly once: re-fitting inside a loop over stock_id
    gives the same result at many times the cost.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Must contain the columns listed in `columns_to_impute` below.
        The frame is modified in place and also returned.

    Returns
    -------
    tuple
        (df_processed, followed by the fitted imputers for imbalance_size,
        reference_price, matched_size, bid_price, ask_price, wap,
        seconds_in_bucket, bid_size, ask_size), in that order. `target` is
        imputed too, but its imputer is not returned.

    Notes
    -----
    - Imputed columns come back as float, including seconds_in_bucket.
    - NOTE(review): mean-filling `target` fabricates labels; dropping rows
      with a missing target may be preferable. Behaviour is kept unchanged.
    - NOTE(review): true per-stock imputation would need one statistic per
      stock_id and a different return contract than apply_imputer expects.
    '''
    # Fit order is preserved from the original implementation.
    columns_to_impute = [
        'imbalance_size',
        'reference_price',
        'matched_size',
        'bid_price',
        'ask_price',
        'wap',
        'target',
        'seconds_in_bucket',
        'bid_size',
        'ask_size',
    ]

    fitted = {}
    for column in columns_to_impute:
        column_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        # SimpleImputer works on 2-D input; flatten the (n, 1) result before assigning.
        filled = column_imputer.fit_transform(np.array(df_processed[column]).reshape(-1, 1))
        df_processed[column] = np.asarray(filled).ravel()
        fitted[column] = column_imputer

    return (
        df_processed,
        fitted['imbalance_size'],
        fitted['reference_price'],
        fitted['matched_size'],
        fitted['bid_price'],
        fitted['ask_price'],
        fitted['wap'],
        fitted['seconds_in_bucket'],
        fitted['bid_size'],
        fitted['ask_size'],
    )

# Impute the training frame and keep the fitted imputers as notebook-level
# names so they can be reused on other frames later.
(
    df_processed,
    imp_mean_imbalance_size,
    imp_mean_reference_price,
    imp_mean_matched_size,
    imp_mean_bid_price,
    imp_mean_ask_price,
    imp_mean_wap,
    imp_mean_seconds_in_bucket,
    imp_mean_bid_size,
    imp_mean_ask_size,
) = imputer(df_processed)

In [13]:
# Re-count missing values per column after imputation.
df_processed.isna().sum()

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
dtype: int64

In [14]:
### Function that applies the imputer to a dataframe (to be used in the testdataset):
def apply_imputer(df_processed):
    '''
    Fill missing values using the already-fitted notebook-level imputers.

    Each listed column is passed through the `transform` of its matching
    `imp_mean_*` object and written back. The frame is modified in place and
    also returned. `target` is not touched.
    '''
    fitted_imputers = (
        ('imbalance_size', imp_mean_imbalance_size),
        ('reference_price', imp_mean_reference_price),
        ('matched_size', imp_mean_matched_size),
        ('bid_price', imp_mean_bid_price),
        ('ask_price', imp_mean_ask_price),
        ('wap', imp_mean_wap),
        ('seconds_in_bucket', imp_mean_seconds_in_bucket),
        ('bid_size', imp_mean_bid_size),
        ('ask_size', imp_mean_ask_size),
    )

    for column, fitted in fitted_imputers:
        # The imputers were fitted on single-column 2-D input, so reshape to (n, 1).
        as_2d = np.array(df_processed[column]).reshape(-1, 1)
        df_processed[column] = fitted.transform(as_2d)

    return df_processed

In [17]:
### Starting with a simple baseline submission using Gradient Boosting:

#First just performing a simple train_test_split on the training dataset:

# Separate the feature matrix from the regression target.
X = df_processed.drop(columns=['target'])
y = df_processed['target']



In [18]:
#Final test dataset doesn't have time_id:
# X is reassigned from itself, so re-running this cell used to raise KeyError
# (time_id already gone). errors='ignore' makes the cell safe to re-run.
X = X.drop(columns=['time_id'], errors='ignore')

In [19]:
# Random hold-out split with a fixed seed for reproducibility.
# NOTE(review): test_size = 0.7 holds out 70% of the rows and trains on 30% -
# confirm this is intentional and not meant to be 0.3.
# NOTE(review): rows are shuffled without regard to date_id; a time-ordered split
# may be more appropriate if rows are chronological - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state = 21)

In [20]:
# Baseline XGBoost regressor with n_estimators = 20 and MAE as the evaluation metric.
clf = xgb.XGBRegressor(n_estimators = 20, eval_metric = 'mae')
# Bare name so the notebook displays the estimator's configuration.
clf

In [21]:
# Fit the regressor on the training portion of the split.
clf.fit(X_train, y_train)

In [22]:
# Display the held-out feature frame.
X_test

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,bid_price,bid_size,ask_price,ask_size,wap
1515976,190,141,220.0,5640850.00,-1,1.001673,16034836.78,1.001673,37646.00,1.002764,14697.93,1.002458
692123,197,65,120.0,274248.60,1,0.998245,13034408.16,0.998063,32813.11,0.998426,17649.65,0.998299
3764744,164,347,30.0,4688971.00,1,1.000065,8224618.33,0.999899,14614.38,1.000065,14194.00,0.999983
926395,118,86,540.0,1393701.94,-1,1.013498,47476827.68,1.013058,672585.65,1.013832,118720.00,1.013716
579588,134,54,390.0,0.00,0,0.998859,7398750.48,0.998118,69489.40,0.998859,63969.58,0.998504
...,...,...,...,...,...,...,...,...,...,...,...,...
133916,123,12,380.0,8386855.19,-1,0.998114,52276110.75,0.998114,3457.44,0.998253,30833.12,0.998128
2838236,23,262,370.0,950411.82,-1,0.998006,24077063.05,0.998006,2399.10,0.998281,6217.56,0.998083
3671896,116,338,340.0,570489.84,-1,0.998733,15320644.93,0.998733,2214.12,0.999003,32667.12,0.998750
1349110,127,126,0.0,1642988.17,-1,0.999458,5624614.44,0.998367,41193.00,1.000670,16928.08,1.000000


In [23]:
# Predict the target for the held-out features.
y_pred = clf.predict(X_test)
# Bare name so the notebook displays the prediction array.
y_pred

array([-1.7865279 , -0.60597926,  0.28450575, ...,  1.4143747 ,
       -2.8004448 , -0.9612977 ], dtype=float32)

In [24]:
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value is
# unchanged, but the conventional order keeps this correct if the metric is
# later swapped for an asymmetric one.
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error: {}".format(mae))

Mean Absolute Error: 6.32076909267466
